diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/cache.json b/cache.json
new file mode 100644
index 00000000..1e32fab7
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2024-09-03T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2408.09075v2","updated":"2024-09-03T16:47:09Z","published":"2024-08-17T02:26:29Z","title":"Improving Rare Word Translation With Dictionaries and Attention Masking","summary":" In machine translation, rare words continue to be a problem for the dominant\nencoder-decoder architecture, especially in low-resource and out-of-domain\ntranslation settings. Human translators solve this problem with monolingual or\nbilingual dictionaries. In this paper, we propose appending definitions from a\nbilingual dictionary to source sentences and using attention masking to link\ntogether rare words with their definitions. We find that including definitions\nfor rare words improves performance by up to 1.0 BLEU and 1.6 MacroF1.\n","authors":["Kenneth J. Sible","David Chiang"],"pdf_url":"https://arxiv.org/pdf/2408.09075v2.pdf","comment":"11 pages, 3 figures, 3 tables. Accepted at AMTA 2024"},{"id":"http://arxiv.org/abs/2406.06385v3","updated":"2024-09-03T16:36:06Z","published":"2024-06-10T15:44:22Z","title":"Low-Rank Quantization-Aware Training for LLMs","summary":" Large language models (LLMs) are omnipresent, however their practical\ndeployment is challenging due to their ever increasing computational and memory\ndemands. Quantization is one of the most effective ways to make them more\ncompute and memory efficient. Quantization-aware training (QAT) methods,\ngenerally produce the best quantized performance, however it comes at the cost\nof potentially long training time and excessive memory usage, making it\nimpractical when applying for LLMs. Inspired by parameter-efficient fine-tuning\n(PEFT) and low-rank adaptation (LoRA) literature, we propose LR-QAT -- a\nlightweight and memory-efficient QAT algorithm for LLMs. LR-QAT employs several\ncomponents to save memory without sacrificing predictive performance: (a)\nlow-rank auxiliary weights that are aware of the quantization grid; (b) a\ndowncasting operator using fixed-point or double-packed integers and (c)\ncheckpointing. Unlike most related work, our method (i) is inference-efficient,\nleading to no additional overhead compared to traditional PTQ; (ii) can be seen\nas a general extended pretraining framework, meaning that the resulting model\ncan still be utilized for any downstream task afterwards; (iii) can be applied\nacross a wide range of quantization settings, such as different choices\nquantization granularity, activation quantization, and seamlessly combined with\nmany PTQ techniques. We apply LR-QAT to LLaMA-1/2/3 and Mistral model families\nand validate its effectiveness on several downstream tasks. Our method\noutperforms common post-training quantization (PTQ) approaches and reaches the\nsame model performance as full-model QAT at the fraction of its memory usage.\nSpecifically, we can train a 7B LLM on a single consumer grade GPU with 24GB of\nmemory.
Our source code is available at\nhttps://github.com/qualcomm-ai-research/LR-QAT\n","authors":["Yelysei Bondarenko","Riccardo Del Chiaro","Markus Nagel"],"pdf_url":"https://arxiv.org/pdf/2406.06385v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14340v3","updated":"2024-09-03T14:53:34Z","published":"2024-08-26T15:13:14Z","title":"Foundation Models for Music: A Survey","summary":" In recent years, foundation models (FMs) such as large language models (LLMs)\nand latent diffusion models (LDMs) have profoundly impacted diverse sectors,\nincluding music. This comprehensive review examines state-of-the-art (SOTA)\npre-trained models and foundation models in music, spanning from representation\nlearning, generative learning and multimodal learning. We first contextualise\nthe significance of music in various industries and trace the evolution of AI\nin music. By delineating the modalities targeted by foundation models, we\ndiscover many of the music representations are underexplored in FM development.\nThen, emphasis is placed on the lack of versatility of previous methods on\ndiverse music applications, along with the potential of FMs in music\nunderstanding, generation and medical application. By comprehensively exploring\nthe details of the model pre-training paradigm, architectural choices,\ntokenisation, finetuning methodologies and controllability, we emphasise the\nimportant topics that should have been well explored, like instruction tuning\nand in-context learning, scaling law and emergent ability, as well as\nlong-sequence modelling etc. A dedicated section presents insights into music\nagents, accompanied by a thorough analysis of datasets and evaluations\nessential for pre-training and downstream tasks. Finally, by underscoring the\nvital importance of ethical considerations, we advocate that following research\non FM for music should focus more on such issues as interpretability,\ntransparency, human responsibility, and copyright issues. The paper offers\ninsights into future challenges and trends on FMs for music, aiming to shape\nthe trajectory of human-AI collaboration in the music realm.\n","authors":["Yinghao Ma","Anders Øland","Anton Ragni","Bleiz MacSen Del Sette","Charalampos Saitis","Chris Donahue","Chenghua Lin","Christos Plachouras","Emmanouil Benetos","Elona Shatri","Fabio Morreale","Ge Zhang","György Fazekas","Gus Xia","Huan Zhang","Ilaria Manco","Jiawen Huang","Julien Guinot","Liwei Lin","Luca Marinelli","Max W. Y. Lam","Megha Sharma","Qiuqiang Kong","Roger B. Dannenberg","Ruibin Yuan","Shangda Wu","Shih-Lun Wu","Shuqi Dai","Shun Lei","Shiyin Kang","Simon Dixon","Wenhu Chen","Wenhao Huang","Xingjian Du","Xingwei Qu","Xu Tan","Yizhi Li","Zeyue Tian","Zhiyong Wu","Zhizheng Wu","Ziyang Ma","Ziyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14340v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17024v2","updated":"2024-09-03T13:55:01Z","published":"2024-08-30T05:42:31Z","title":"InkubaLM: A small language model for low-resource African languages","summary":" High-resource language models often fall short in the African context, where\nthere is a critical need for models that are efficient, accessible, and locally\nrelevant, even amidst significant computing and data constraints. 
This paper\nintroduces InkubaLM, a small language model with 0.4 billion parameters, which\nachieves performance comparable to models with significantly larger parameter\ncounts and more extensive training data on tasks such as machine translation,\nquestion-answering, AfriMMLU, and the AfriXnli task. Notably, InkubaLM\noutperforms many larger models in sentiment analysis and demonstrates\nremarkable consistency across multiple languages. This work represents a\npivotal advancement in challenging the conventional paradigm that effective\nlanguage models must rely on substantial resources. Our model and datasets are\npublicly available at https://huggingface.co/lelapa to encourage research and\ndevelopment on low-resource languages.\n","authors":["Atnafu Lambebo Tonja","Bonaventure F. P. Dossou","Jessica Ojo","Jenalea Rajab","Fadel Thior","Eric Peter Wairagala","Anuoluwapo Aremu","Pelonomi Moiloa","Jade Abbott","Vukosi Marivate","Benjamin Rosman"],"pdf_url":"https://arxiv.org/pdf/2408.17024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02031v8","updated":"2024-09-03T10:19:52Z","published":"2023-10-03T13:17:35Z","title":"OceanGPT: A Large Language Model for Ocean Science Tasks","summary":" Ocean science, which delves into the oceans that are reservoirs of life and\nbiodiversity, is of great significance given that oceans cover over 70% of our\nplanet's surface. Recently, advances in Large Language Models (LLMs) have\ntransformed the paradigm in science. Despite the success in other domains,\ncurrent LLMs often fall short in catering to the needs of domain experts like\noceanographers, and the potential of LLMs for ocean science is under-explored.\nThe intrinsic reasons are the immense and intricate nature of ocean data as\nwell as the necessity for higher granularity and richness in knowledge. To\nalleviate these issues, we introduce OceanGPT, the first-ever large language\nmodel in the ocean domain, which is expert in various ocean science tasks. We\nalso propose OceanGPT, a novel framework to automatically obtain a large volume\nof ocean domain instruction data, which generates instructions based on\nmulti-agent collaboration. Additionally, we construct the first oceanography\nbenchmark, OceanBench, to evaluate the capabilities of LLMs in the ocean\ndomain. Though comprehensive experiments, OceanGPT not only shows a higher\nlevel of knowledge expertise for oceans science tasks but also gains\npreliminary embodied intelligence capabilities in ocean technology.\n","authors":["Zhen Bi","Ningyu Zhang","Yida Xue","Yixin Ou","Daxiong Ji","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02031v8.pdf","comment":"ACL2024. Project Website: http://oceangpt.zjukg.cn/"},{"id":"http://arxiv.org/abs/2312.01082v2","updated":"2024-09-03T07:59:58Z","published":"2023-12-02T09:20:10Z","title":"A Survey on Stability of Learning with Limited Labelled Data and its\n Sensitivity to the Effects of Randomness","summary":" Learning with limited labelled data, such as prompting, in-context learning,\nfine-tuning, meta-learning or few-shot learning, aims to effectively train a\nmodel using only a small amount of labelled samples. However, these approaches\nhave been observed to be excessively sensitive to the effects of uncontrolled\nrandomness caused by non-determinism in the training process. The randomness\nnegatively affects the stability of the models, leading to large variances in\nresults across training runs. 
When such sensitivity is disregarded, it can\nunintentionally, but unfortunately also intentionally, create an imaginary\nperception of research progress. Recently, this area started to attract\nresearch attention and the number of relevant studies is continuously growing.\nIn this survey, we provide a comprehensive overview of 415 papers addressing\nthe effects of randomness on the stability of learning with limited labelled\ndata. We distinguish between four main tasks addressed in the papers\n(investigate/evaluate; determine; mitigate; benchmark/compare/report randomness\neffects), providing findings for each one. Furthermore, we identify and discuss\nseven challenges and open problems together with possible directions to\nfacilitate further research. The ultimate goal of this survey is to emphasise\nthe importance of this growing research area, which so far has not received an\nappropriate level of attention, and reveal impactful directions for future\nresearch.\n","authors":["Branislav Pecher","Ivan Srba","Maria Bielikova"],"pdf_url":"https://arxiv.org/pdf/2312.01082v2.pdf","comment":"Accepted to ACM Comput. Surv. 2024"},{"id":"http://arxiv.org/abs/2406.01252v3","updated":"2024-09-03T07:07:59Z","published":"2024-06-03T12:10:26Z","title":"Towards Scalable Automated Alignment of LLMs: A Survey","summary":" Alignment is the most critical step in building large language models (LLMs)\nthat meet human needs. With the rapid development of LLMs gradually surpassing\nhuman capabilities, traditional alignment methods based on human-annotation are\nincreasingly unable to meet the scalability demands. Therefore, there is an\nurgent need to explore new sources of automated alignment signals and technical\napproaches. In this paper, we systematically review the recently emerging\nmethods of automated alignment, attempting to explore how to achieve effective,\nscalable, automated alignment once the capabilities of LLMs exceed those of\nhumans. Specifically, we categorize existing automated alignment methods into 4\nmajor categories based on the sources of alignment signals and discuss the\ncurrent status and potential development of each category. Additionally, we\nexplore the underlying mechanisms that enable automated alignment and discuss\nthe essential factors that make automated alignment technologies feasible and\neffective from the fundamental role of alignment.\n","authors":["Boxi Cao","Keming Lu","Xinyu Lu","Jiawei Chen","Mengjie Ren","Hao Xiang","Peilin Liu","Yaojie Lu","Ben He","Xianpei Han","Le Sun","Hongyu Lin","Bowen Yu"],"pdf_url":"https://arxiv.org/pdf/2406.01252v3.pdf","comment":"Paper List: https://github.com/cascip/awesome-auto-alignment"},{"id":"http://arxiv.org/abs/2311.13110v3","updated":"2024-09-03T06:31:48Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. 
From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v3.pdf","comment":"Accepted at Journal of Machine Learning Research. This paper\n integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete\n story. In this paper, we improve the writing and organization, and also add\n conceptual, empirical, and theoretical improvements over the previous work.\n V2: small typo fixes and formatting improvements. V3: improvements from\n journal revisions"},{"id":"http://arxiv.org/abs/2406.10203v2","updated":"2024-09-03T06:05:32Z","published":"2024-06-14T17:38:21Z","title":"A Fundamental Trade-off in Aligned Language Models and its Relation to\n Sampling Adaptors","summary":" The relationship between the quality of a string, as judged by a human\nreader, and its probability, $p(\\boldsymbol{y})$ under a language model\nundergirds the development of better language models. For example, many popular\nalgorithms for sampling from a language model have been conceived with the goal\nof manipulating $p(\\boldsymbol{y})$ to place higher probability on strings that\nhumans deem of high quality. In this article, we examine the\nprobability--quality relationship in language models explicitly aligned to\nhuman preferences, e.g., through reinforcement learning through human feedback.\nWe show that, when sampling corpora from an aligned language model, there\nexists a trade-off between the strings' average reward and average\nlog-likelihood under the prior language model, i.e., the same model before\nalignment with human preferences. 
We provide a formal treatment of this\nphenomenon and demonstrate how a choice of sampling adaptor allows for a\nselection of how much likelihood we exchange for the reward.\n","authors":["Naaman Tan","Josef Valvoda","Tianyu Liu","Anej Svete","Yanxia Qin","Kan Min-Yen","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2406.10203v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11169v4","updated":"2024-09-03T05:51:40Z","published":"2024-03-17T10:59:09Z","title":"Correcting misinformation on social media with a large language model","summary":" Real-world misinformation, often multimodal, can be partially or fully\nfactual but misleading using diverse tactics like conflating correlation with\ncausation. Such misinformation is severely understudied, challenging to\naddress, and harms various social domains, particularly on social media, where\nit can spread rapidly. High-quality and timely correction of misinformation\nthat identifies and explains its (in)accuracies effectively reduces false\nbeliefs. Despite the wide acceptance of manual correction, it is difficult to\nbe timely and scalable. While LLMs have versatile capabilities that could\naccelerate misinformation correction, they struggle due to a lack of recent\ninformation, a tendency to produce false content, and limitations in addressing\nmultimodal information. We propose MUSE, an LLM augmented with access to and\ncredibility evaluation of up-to-date information. By retrieving evidence as\nrefutations or supporting context, MUSE identifies and explains content\n(in)accuracies with references. It conducts multimodal retrieval and interprets\nvisual content to verify and correct multimodal content. Given the absence of a\ncomprehensive evaluation approach, we propose 13 dimensions of misinformation\ncorrection quality. Then, fact-checking experts evaluate responses to social\nmedia content that are not presupposed to be misinformation but broadly include\n(partially) incorrect and correct posts that may (not) be misleading. Results\ndemonstrate MUSE's ability to write high-quality responses to potential\nmisinformation--across modalities, tactics, domains, political leanings, and\nfor information that has not previously been fact-checked online--within\nminutes of its appearance on social media. Overall, MUSE outperforms GPT-4 by\n37% and even high-quality responses from laypeople by 29%. Our work provides a\ngeneral methodological and evaluative framework to correct misinformation at\nscale.\n","authors":["Xinyi Zhou","Ashish Sharma","Amy X. Zhang","Tim Althoff"],"pdf_url":"https://arxiv.org/pdf/2403.11169v4.pdf","comment":"50 pages"},{"id":"http://arxiv.org/abs/2405.01481v2","updated":"2024-09-03T05:47:42Z","published":"2024-05-02T17:13:40Z","title":"NeMo-Aligner: Scalable Toolkit for Efficient Model Alignment","summary":" Aligning Large Language Models (LLMs) with human values and preferences is\nessential for making them helpful and safe. However, building efficient tools\nto perform alignment can be challenging, especially for the largest and most\ncompetent LLMs which often contain tens or hundreds of billions of parameters.\nWe create NeMo-Aligner, a toolkit for model alignment that can efficiently\nscale to a thousand GPUs for training the largest open-source LLMs such as\nNemotron 4 340B and Llama 3.1 405B. 
NeMo-Aligner comes with highly optimized\nand scalable implementations for major paradigms of model alignment such as:\nReinforcement Learning from Human Feedback (RLHF), Direct Preference\nOptimization (DPO), SteerLM, and Self-Play Fine-Tuning (SPIN). Additionally,\nour toolkit supports running most of the alignment techniques in a Parameter\nEfficient Fine-Tuning (PEFT) setting. NeMo-Aligner is designed for\nextensibility, allowing support for other alignment techniques with minimal\neffort. It is open-sourced with Apache 2.0 License and we invite community\ncontributions at https://github.com/NVIDIA/NeMo-Aligner\n","authors":["Gerald Shen","Zhilin Wang","Olivier Delalleau","Jiaqi Zeng","Yi Dong","Daniel Egert","Shengyang Sun","Jimmy Zhang","Sahil Jain","Ali Taghibakhshi","Markel Sanz Ausin","Ashwath Aithal","Oleksii Kuchaiev"],"pdf_url":"https://arxiv.org/pdf/2405.01481v2.pdf","comment":"16 pages, 4 figures, Accepted to COLM 2024"},{"id":"http://arxiv.org/abs/2408.15518v2","updated":"2024-09-03T04:38:16Z","published":"2024-08-28T04:06:14Z","title":"Squid: Long Context as a New Modality for Energy-Efficient On-Device\n Language Models","summary":" This paper presents Dolphin, a novel decoder-decoder architecture for\nenergy-efficient processing of long contexts in language models. Our approach\naddresses the significant energy consumption and latency challenges inherent in\non-device models. Dolphin employs a compact 0.5B parameter decoder to distill\nextensive contextual information into a memory embedding, substantially\nreducing the input length for the primary 7B parameter decoder model. Inspired\nby vision-language models, we repurpose the image embedding projector to encode\nlong textual contexts, effectively treating extended context as a distinct\nmodality. This innovative method enables processing of substantially longer\ncontexts without the typical computational overhead associated with extended\ninput sequences. Empirical evaluations demonstrate a 10-fold improvement in\nenergy efficiency and a 5-fold reduction in latency compared to conventional\nfull-length context processing methods without losing quality of the response.\nOur work contributes to the development of more sustainable and scalable\nlanguage models for on-device applications, addressing the critical need for\nenergy-efficient and responsive AI technologies in resource-constrained\nenvironments while maintaining the accuracy to understand long contexts. This\nresearch has implications for the broader field of natural language processing,\nparticularly in the domain of efficient model design for resource-limited\nsettings. By enabling more sophisticated AI capabilities on edge devices,\nDolphin paves the way for advanced language processing in a wide range of\napplications where computational resources are at a premium. The Dolphin model\nis publicly available at https://huggingface.co/NexaAIDev/Dolphin.\n","authors":["Wei Chen","Zhiyuan Li","Shuo Xin","Yihao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.15518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06576v4","updated":"2024-09-03T02:11:01Z","published":"2024-06-04T04:17:40Z","title":"OccamLLM: Fast and Exact Language Model Arithmetic in a Single Step","summary":" Despite significant advancements in text generation and reasoning, Large\nLanguage Models (LLMs) still face challenges in accurately performing complex\narithmetic operations. 
Language model systems often enable LLMs to generate\ncode for arithmetic operations to achieve accurate calculations. However, this\napproach compromises speed and security, and fine-tuning risks the language\nmodel losing prior capabilities. We propose a framework that enables exact\narithmetic in a single autoregressive step, providing faster, more secure, and\nmore interpretable LLM systems with arithmetic capabilities. We use the hidden\nstates of a LLM to control a symbolic architecture that performs arithmetic.\nOur implementation using Llama 3 with OccamNet as a symbolic model (OccamLlama)\nachieves 100\\% accuracy on single arithmetic operations\n($+,-,\\times,\\div,\\sin{},\\cos{},\\log{},\\exp{},\\sqrt{}$), outperforming GPT 4o\nwith and without a code interpreter. Furthermore, OccamLlama outperforms GPT 4o\nwith and without a code interpreter on average across a range of mathematical\nproblem solving benchmarks, demonstrating that OccamLLMs can excel in\narithmetic tasks, even surpassing much larger models. We will make our code\npublic shortly.\n","authors":["Owen Dugan","Donato Manuel Jimenez Beneto","Charlotte Loh","Zhuo Chen","Rumen Dangovski","Marin Soljačić"],"pdf_url":"https://arxiv.org/pdf/2406.06576v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05219v3","updated":"2024-09-03T02:10:16Z","published":"2024-07-07T00:43:05Z","title":"Flood of Techniques and Drought of Theories: Emotion Mining in Disasters","summary":" Emotion mining has become a crucial tool for understanding human emotions\nduring disasters, leveraging the extensive data generated on social media\nplatforms. This paper aims to summarize existing research on emotion mining\nwithin disaster contexts, highlighting both significant discoveries and\npersistent issues. On the one hand, emotion mining techniques have achieved\nacceptable accuracy enabling applications such as rapid damage assessment and\nmental health surveillance. On the other hand, with many studies adopting\ndata-driven approaches, several methodological issues remain. These include\narbitrary emotion classification, ignoring biases inherent in data collection\nfrom social media, such as the overrepresentation of individuals from higher\nsocioeconomic status on Twitter, and the lack of application of theoretical\nframeworks like cross-cultural comparisons. These problems can be summarized as\na notable lack of theory-driven research and ignoring insights from social and\nbehavioral sciences. This paper underscores the need for interdisciplinary\ncollaboration between computer scientists and social scientists to develop more\nrobust and theoretically grounded approaches in emotion mining. By addressing\nthese gaps, we aim to enhance the effectiveness and reliability of emotion\nmining methodologies, ultimately contributing to improved disaster\npreparedness, response, and recovery.\n Keywords: emotion mining, sentiment analysis, natural disasters, psychology,\ntechnological disasters\n","authors":["Soheil Shapouri","Saber Soleymani","Saed Rezayi"],"pdf_url":"https://arxiv.org/pdf/2407.05219v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11807v3","updated":"2024-09-03T01:14:30Z","published":"2024-03-18T14:04:47Z","title":"How Far Are We on the Decision-Making of LLMs? Evaluating LLMs' Gaming\n Ability in Multi-Agent Environments","summary":" Decision-making, a complicated task requiring various types of abilities,\npresents an excellent framework for assessing Large Language Models (LLMs). 
Our\nresearch investigates decision-making capabilities of LLMs through the lens of\nGame Theory. We focus specifically on games that support the simultaneous\nparticipation of more than two agents. We introduce GAMA($\\gamma$)-Bench, which\nevaluates LLMs' Gaming Ability in Multi-Agent environments. $\\gamma$-Bench\nincludes eight classical multi-agent games and a scoring scheme specially\ndesigned to quantitatively assess LLMs' performance. Leveraging $\\gamma$-Bench,\nwe investigate LLMs' robustness, generalizability, and strategies for\nenhancement. Results reveal that while GPT-3.5 shows satisfying robustness, its\ngeneralizability is relatively limited. However, its performance can be\nimproved through approaches such as Chain-of-Thought. Additionally, we evaluate\ntwelve versions from six models, including GPT-3.5, GPT-4, Gemini, LLaMA-3.1,\nMixtral, and Qwen-2. We find that Gemini-1.5-Pro outperforms other models with\na score of $63.8$ out of $100$, followed by LLaMA-3.1-70B and GPT-4 with scores\nof $60.9$ and $60.5$, respectively. The code and experimental results are made\npublicly available via https://github.com/CUHK-ARISE/GAMABench.\n","authors":["Jen-tse Huang","Eric John Li","Man Ho Lam","Tian Liang","Wenxuan Wang","Youliang Yuan","Wenxiang Jiao","Xing Wang","Zhaopeng Tu","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2403.11807v3.pdf","comment":"11 pages of main text. 20 pages of appendices. 12 figures, 9 tables.\n Added models: Gemini-1.5-Pro, LLaMA-3.1-{7, 70, 405}B, Mixtral-8x{7, 22}B,\n Qwen-2-72B"},{"id":"http://arxiv.org/abs/2406.16746v3","updated":"2024-09-03T23:03:41Z","published":"2024-06-24T15:55:49Z","title":"The Responsible Foundation Model Development Cheatsheet: A Review of\n Tools & Resources","summary":" Foundation model development attracts a rapidly expanding body of\ncontributors, scientists, and applications. To help shape responsible\ndevelopment practices, we introduce the Foundation Model Development\nCheatsheet: a growing collection of 250+ tools and resources spanning text,\nvision, and speech modalities. We draw on a large body of prior work to survey\nresources (e.g. software, documentation, frameworks, guides, and practical\ntools) that support informed data selection, processing, and understanding,\nprecise and limitation-aware artifact documentation, efficient model training,\nadvance awareness of the environmental impact from training, careful model\nevaluation of capabilities, risks, and claims, as well as responsible model\nrelease, licensing and deployment practices. We hope this curated collection of\nresources helps guide more responsible development. The process of curating\nthis list, enabled us to review the AI development ecosystem, revealing what\ntools are critically missing, misused, or over-used in existing practices. 
We\nfind that (i) tools for data sourcing, model evaluation, and monitoring are\ncritically under-serving ethical and real-world needs, (ii) evaluations for\nmodel safety, capabilities, and environmental impact all lack reproducibility\nand transparency, (iii) text and particularly English-centric analyses continue\nto dominate over multilingual and multi-modal analyses, and (iv) evaluation of\nsystems, rather than just models, is needed so that capabilities and impact are\nassessed in context.\n","authors":["Shayne Longpre","Stella Biderman","Alon Albalak","Hailey Schoelkopf","Daniel McDuff","Sayash Kapoor","Kevin Klyman","Kyle Lo","Gabriel Ilharco","Nay San","Maribeth Rauh","Aviya Skowron","Bertie Vidgen","Laura Weidinger","Arvind Narayanan","Victor Sanh","David Adelani","Percy Liang","Rishi Bommasani","Peter Henderson","Sasha Luccioni","Yacine Jernite","Luca Soldaini"],"pdf_url":"https://arxiv.org/pdf/2406.16746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02326v1","updated":"2024-09-03T22:36:42Z","published":"2024-09-03T22:36:42Z","title":"Arctic-SnowCoder: Demystifying High-Quality Data in Code Pretraining","summary":" Recent studies have been increasingly demonstrating that high-quality data is\ncrucial for effective pretraining of language models. However, the precise\ndefinition of \"high-quality\" remains underexplored. Focusing on the code\ndomain, we introduce Arctic-SnowCoder-1.3B, a data-efficient base code model\npretrained on 555B tokens through three phases of progressively refined data:\n(1) general pretraining with 500B standard-quality code tokens, preprocessed\nthrough basic filtering, deduplication, and decontamination, (2) continued\npretraining with 50B high-quality tokens, selected from phase one by a\nBERT-style quality annotator trained to distinguish good code from random data,\nusing positive examples drawn from high-quality code files, along with\ninstruction data from Magicoder and StarCoder2-Instruct, and (3) enhanced\npretraining with 5B synthetic data created by Llama-3.1-70B using phase two\ndata as seeds, adapting the Magicoder approach for pretraining. Despite being\ntrained on a limited dataset, Arctic-SnowCoder achieves state-of-the-art\nperformance on BigCodeBench, a coding benchmark focusing on practical and\nchallenging programming tasks, compared to similarly sized models trained on no\nmore than 1T tokens, outperforming Phi-1.5-1.3B by 36%. Across all evaluated\nbenchmarks, Arctic-SnowCoder-1.3B beats StarCoderBase-3B pretrained on 1T\ntokens. Additionally, it matches the performance of leading small base code\nmodels trained on trillions of tokens. For example, Arctic-SnowCoder-1.3B\nsurpasses StarCoder2-3B, pretrained on over 3.3T tokens, on HumanEval+, a\nbenchmark that evaluates function-level code generation, and remains\ncompetitive on BigCodeBench. Our evaluation presents a comprehensive analysis\njustifying various design choices for Arctic-SnowCoder. Most importantly, we\nfind that the key to high-quality data is its alignment with the distribution\nof downstream applications.\n","authors":["Yuxiang Wei","Hojae Han","Rajhans Samdani"],"pdf_url":"https://arxiv.org/pdf/2409.02326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14452v3","updated":"2024-09-03T22:06:17Z","published":"2023-03-25T12:12:30Z","title":"COFFEE: A Contrastive Oracle-Free Framework for Event Extraction","summary":" Event extraction is a complex information extraction task that involves\nextracting events from unstructured text. 
Prior classification-based methods\nrequire comprehensive entity annotations for joint training, while newer\ngeneration-based methods rely on heuristic templates containing oracle\ninformation such as event type, which is often unavailable in real-world\nscenarios. In this study, we consider a more realistic setting of this task,\nnamely the Oracle-Free Event Extraction (OFEE) task, where only the input\ncontext is given without any oracle information, including event type, event\nontology and trigger word. To solve this task, we propose a new framework,\ncalled COFFEE, which extracts the events solely based on the document context\nwithout referring to any oracle information. In particular, a contrastive\nselection model is introduced in COFFEE to rectify the generated triggers and\nhandle multi-event instances. The proposed COFFEE outperforms state-of-the-art\napproaches under the oracle-free setting of the event extraction task, as\nevaluated on a public event extraction benchmark ACE05.\n","authors":["Meiru Zhang","Yixuan Su","Zaiqiao Meng","Zihao Fu","Nigel Collier"],"pdf_url":"https://arxiv.org/pdf/2303.14452v3.pdf","comment":"Accepted to MATCHING Workshop at ACL 2023"},{"id":"http://arxiv.org/abs/2403.06023v2","updated":"2024-09-03T20:09:57Z","published":"2024-03-09T22:18:26Z","title":"Persian Slang Text Conversion to Formal and Deep Learning of Persian\n Short Texts on Social Media for Sentiment Classification","summary":" The lack of a suitable tool for the analysis of conversational texts in the\nPersian language has made various analyses of these texts, including Sentiment\nAnalysis, difficult. In this research, we tried to make the understanding of\nthese texts easier for the machine by providing PSC, Persian Slang Converter, a\ntool for converting conversational texts into formal ones, and by using the\nmost up-to-date and best deep learning methods along with the PSC, the\nsentiment learning of short Persian language texts for the machine in a better\nway. be made More than 10 million unlabeled texts from various social networks\nand movie subtitles (as Conversational texts) and about 10 million news texts\n(as formal texts) have been used for training unsupervised models and formal\nimplementation of the tool. 60,000 texts from the comments of Instagram social\nnetwork users with positive, negative, and neutral labels are considered\nsupervised data for training the emotion classification model of short texts.\nUsing the formal tool, 57% of the words of the corpus of conversation were\nconverted. Finally, by using the formalizer, FastText model, and deep LSTM\nnetwork, an accuracy of 81.91 was obtained on the test data.\n","authors":["Mohsen Khazeni","Mohammad Heydari","Amir Albadvi"],"pdf_url":"https://arxiv.org/pdf/2403.06023v2.pdf","comment":"16 pages, 4 figures, 14 tables"},{"id":"http://arxiv.org/abs/2405.19534v2","updated":"2024-09-03T19:37:27Z","published":"2024-05-29T21:29:44Z","title":"Preference Learning Algorithms Do Not Learn Preference Rankings","summary":" Preference learning algorithms (e.g., RLHF and DPO) are frequently used to\nsteer LLMs to produce generations that are more preferred by humans, but our\nunderstanding of their inner workings is still limited. In this work, we study\nthe conventional wisdom that preference learning trains models to assign higher\nlikelihoods to more preferred outputs than less preferred outputs, measured via\n$\\textit{ranking accuracy}$. 
Surprisingly, we find that most state-of-the-art\npreference-tuned models achieve a ranking accuracy of less than 60% on common\npreference datasets. We furthermore derive the $\\textit{idealized ranking\naccuracy}$ that a preference-tuned LLM would achieve if it optimized the DPO or\nRLHF objective perfectly. We demonstrate that existing models exhibit a\nsignificant $\\textit{alignment gap}$ -- $\\textit{i.e.}$, a gap between the\nobserved and idealized ranking accuracies. We attribute this discrepancy to the\nDPO objective, which is empirically and theoretically ill-suited to fix even\nmild ranking errors in the reference model, and derive a simple and efficient\nformula for quantifying the difficulty of learning a given preference\ndatapoint. Finally, we demonstrate that ranking accuracy strongly correlates\nwith the empirically popular win rate metric when the model is close to the\nreference model used in the objective, shedding further light on the\ndifferences between on-policy (e.g., RLHF) and off-policy (e.g., DPO)\npreference learning algorithms.\n","authors":["Angelica Chen","Sadhika Malladi","Lily H. Zhang","Xinyi Chen","Qiuyi Zhang","Rajesh Ranganath","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2405.19534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02259v1","updated":"2024-09-03T19:34:25Z","published":"2024-09-03T19:34:25Z","title":"Optimal L-Systems for Stochastic L-system Inference Problems","summary":" This paper presents two novel theorems that address two open problems in\nstochastic Lindenmayer-system (L-system) inference, specifically focusing on\nthe construction of an optimal stochastic L-system capable of generating a\ngiven sequence of strings. The first theorem delineates a method for crafting a\nstochastic L-system that maximizes the likelihood of producing a given sequence\nof words through a singular derivation. Furthermore, the second theorem\ndetermines the stochastic L-systems with the highest probability of producing a\ngiven sequence of words with multiple possible derivations. From these, we\nintroduce an algorithm to infer an optimal stochastic L-system from a given\nsequence. This algorithm incorporates sophisticated optimization techniques,\nsuch as interior point methods, ensuring production of a stochastically optimal\nstochastic L-system suitable for generating the given sequence. This allows for\nthe use of using stochastic L-systems as model for machine learning using only\npositive data for training.\n","authors":["Ali Lotfi","Ian McQuillan"],"pdf_url":"https://arxiv.org/pdf/2409.02259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02257v1","updated":"2024-09-03T19:31:03Z","published":"2024-09-03T19:31:03Z","title":"MMLU-Pro+: Evaluating Higher-Order Reasoning and Shortcut Learning in\n LLMs","summary":" Existing benchmarks for large language models (LLMs) increasingly struggle to\ndifferentiate between top-performing models, underscoring the need for more\nchallenging evaluation frameworks. We introduce MMLU-Pro+, an enhanced\nbenchmark building upon MMLU-Pro to assess shortcut learning and higher-order\nreasoning in LLMs. By incorporating questions with multiple correct answers\nacross diverse domains, MMLU-Pro+ tests LLMs' ability to engage in complex\nreasoning and resist simplistic problem-solving strategies. 
Our results show\nthat MMLU-Pro+ maintains MMLU-Pro's difficulty while providing a more rigorous\ntest of model discrimination, particularly in multi-correct answer scenarios.\nWe introduce novel metrics like shortcut selection ratio and correct pair\nidentification ratio, offering deeper insights into model behavior and\nanchoring bias. Evaluations of five state-of-the-art LLMs reveal significant\nperformance gaps, highlighting variations in reasoning abilities and bias\nsusceptibility. We release the dataset and evaluation codes at\n\\url{https://github.com/asgsaeid/mmlu-pro-plus}.\n","authors":["Saeid Asgari Taghanaki","Aliasgahr Khani","Amir Khasahmadi"],"pdf_url":"https://arxiv.org/pdf/2409.02257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16994v2","updated":"2024-09-03T19:28:39Z","published":"2024-07-24T04:27:55Z","title":"A Voter-Based Stochastic Rejection-Method Framework for Asymptotically\n Safe Language Model Outputs","summary":" This paper proposes a new method for preventing unsafe or otherwise low\nquality large language model (LLM) outputs, by leveraging the stochasticity of\nLLMs. We propose a system whereby LLM checkers vote on the acceptability of a\ngenerated output, regenerating it if a threshold of disapproval is reached,\nuntil sufficient checkers approve. We further propose estimators for cost and\nfailure rate, and based on those estimators and experimental data tailored to\nthe application, we propose an algorithm that achieves a desired failure rate\nat the least possible cost. We demonstrate that, under these models, failure\nrate decreases exponentially as a function of cost when voter count and\nthreshold are chosen according to the algorithm, and that the models reasonably\nestimate the actual performance of such a system in action, even with limited\ndata.\n","authors":["Jake R. Watts","Joel Sokol"],"pdf_url":"https://arxiv.org/pdf/2407.16994v2.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2310.00996v4","updated":"2024-09-03T19:27:19Z","published":"2023-10-02T08:58:29Z","title":"ARN: Analogical Reasoning on Narratives","summary":" As a core cognitive skill that enables the transferability of information\nacross domains, analogical reasoning has been extensively studied for both\nhumans and computational models. However, while cognitive theories of analogy\noften focus on narratives and study the distinction between surface,\nrelational, and system similarities, existing work in natural language\nprocessing has a narrower focus as far as relational analogies between word\npairs. This gap brings a natural question: can state-of-the-art large language\nmodels (LLMs) detect system analogies between narratives? To gain insight into\nthis question and extend word-based relational analogies to relational system\nanalogies, we devise a comprehensive computational framework that\noperationalizes dominant theories of analogy, using narrative elements to\ncreate surface and system mappings. Leveraging the interplay between these\nmappings, we create a binary task and benchmark for Analogical Reasoning on\nNarratives (ARN), covering four categories of far (cross-domain)/near\n(within-domain) analogies and disanalogies. We show that while all LLMs can\nlargely recognize near analogies, even the largest ones struggle with far\nanalogies in a zero-shot setting, with GPT4.0 scoring below random. Guiding the\nmodels through solved examples and chain-of-thought reasoning enhances their\nanalogical reasoning ability. 
Yet, since even in the few-shot setting, the best\nmodel only performs halfway between random and humans, ARN opens exciting\ndirections for computational analogical reasoners.\n","authors":["Zhivar Sourati","Filip Ilievski","Pia Sommerauer","Yifan Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.00996v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02244v1","updated":"2024-09-03T19:19:13Z","published":"2024-09-03T19:19:13Z","title":"Therapy as an NLP Task: Psychologists' Comparison of LLMs and Human\n Peers in CBT","summary":" Wider access to therapeutic care is one of the biggest challenges in mental\nhealth treatment. Due to institutional barriers, some people seeking mental\nhealth support have turned to large language models (LLMs) for personalized\ntherapy, even though these models are largely unsanctioned and untested. We\ninvestigate the potential and limitations of using LLMs as providers of\nevidence-based therapy by using mixed methods clinical metrics. Using HELPERT,\na prompt run on a large language model using the same process and training as a\ncomparative group of peer counselors, we replicated publicly accessible mental\nhealth conversations rooted in Cognitive Behavioral Therapy (CBT) to compare\nsession dynamics and counselor's CBT-based behaviors between original peer\nsupport sessions and their reconstructed HELPERT sessions. Two licensed,\nCBT-trained clinical psychologists evaluated the sessions using the Cognitive\nTherapy Rating Scale and provided qualitative feedback. Our findings show that\nthe peer sessions are characterized by empathy, small talk, therapeutic\nalliance, and shared experiences but often exhibit therapist drift. Conversely,\nHELPERT reconstructed sessions exhibit minimal therapist drift and higher\nadherence to CBT methods but display a lack of collaboration, empathy, and\ncultural understanding. Through CTRS ratings and psychologists' feedback, we\nhighlight the importance of human-AI collaboration for scalable mental health.\nOur work outlines the ethical implication of imparting human-like subjective\nqualities to LLMs in therapeutic settings, particularly the risk of deceptive\nempathy, which may lead to unrealistic patient expectations and potential harm.\n","authors":["Zainab Iftikhar","Sean Ransom","Amy Xiao","Jeff Huang"],"pdf_url":"https://arxiv.org/pdf/2409.02244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02239v1","updated":"2024-09-03T19:11:15Z","published":"2024-09-03T19:11:15Z","title":"Temporal Order Preserved Optimal Transport-based Cross-modal Knowledge\n Transfer Learning for ASR","summary":" Transferring linguistic knowledge from a pretrained language model (PLM) to\nan acoustic model has been shown to greatly improve the performance of\nautomatic speech recognition (ASR). However, due to the heterogeneous feature\ndistributions in cross-modalities, designing an effective model for feature\nalignment and knowledge transfer between linguistic and acoustic sequences\nremains a challenging task. Optimal transport (OT), which efficiently measures\nprobability distribution discrepancies, holds great potential for aligning and\ntransferring knowledge between acoustic and linguistic modalities. Nonetheless,\nthe original OT treats acoustic and linguistic feature sequences as two\nunordered sets in alignment and neglects temporal order information during OT\ncoupling estimation. Consequently, a time-consuming pretraining stage is\nrequired to learn a good alignment between the acoustic and linguistic\nrepresentations. 
In this paper, we propose a Temporal Order Preserved OT\n(TOT)-based Cross-modal Alignment and Knowledge Transfer (CAKT) (TOT-CAKT) for\nASR. In the TOT-CAKT, local neighboring frames of acoustic sequences are\nsmoothly mapped to neighboring regions of linguistic sequences, preserving\ntheir temporal order relationship in feature alignment and matching. With the\nTOT-CAKT model framework, we conduct Mandarin ASR experiments with a pretrained\nChinese PLM for linguistic knowledge transfer. Our results demonstrate that the\nproposed TOT-CAKT significantly improves ASR performance compared to several\nstate-of-the-art models employing linguistic knowledge transfer, and addresses\nthe weaknesses of the original OT-based method in sequential feature alignment\nfor ASR.\n","authors":["Xugang Lu","Peng Shen","Yu Tsao","Hisashi Kawai"],"pdf_url":"https://arxiv.org/pdf/2409.02239v1.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2406.01981v2","updated":"2024-09-03T19:11:11Z","published":"2024-06-04T05:47:17Z","title":"Zyda: A 1.3T Dataset for Open Language Modeling","summary":" The size of large language models (LLMs) has scaled dramatically in recent\nyears and their computational and data requirements have surged\ncorrespondingly. State-of-the-art language models, even at relatively smaller\nsizes, typically require training on at least a trillion tokens. This rapid\nadvancement has eclipsed the growth of open-source datasets available for\nlarge-scale LLM pretraining. In this paper, we introduce Zyda (Zyphra Dataset),\na dataset under a permissive license comprising 1.3 trillion tokens, assembled\nby integrating several major respected open-source datasets into a single,\nhigh-quality corpus. We apply rigorous filtering and deduplication processes,\nboth within and across datasets, to maintain and enhance the quality derived\nfrom the original datasets. Our evaluations show that Zyda not only competes\nfavorably with other open datasets like Dolma, FineWeb, and RefinedWeb, but\nalso substantially improves the performance of comparable models from the\nPythia suite. Our rigorous data processing methods significantly enhance Zyda's\neffectiveness, outperforming even the best of its constituent datasets when\nused independently.\n","authors":["Yury Tokpanov","Beren Millidge","Paolo Glorioso","Jonathan Pilault","Adam Ibrahim","James Whittington","Quentin Anthony"],"pdf_url":"https://arxiv.org/pdf/2406.01981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16176v3","updated":"2024-09-03T19:08:18Z","published":"2023-10-24T20:48:11Z","title":"Correction with Backtracking Reduces Hallucination in Summarization","summary":" Abstractive summarization aims at generating natural language summaries of a\nsource document that are succinct while preserving the important elements.\nDespite recent advances, neural text summarization models are known to be\nsusceptible to hallucinating (or more correctly confabulating), that is to\nproduce summaries with details that are not grounded in the source document. In\nthis paper, we introduce a simple yet efficient technique, CoBa, to reduce\nhallucination in abstractive summarization. The approach is based on two steps:\nhallucination detection and mitigation. We show that the former can be achieved\nthrough measuring simple statistics about conditional word probabilities and\ndistance to context words. Further, we demonstrate that straight-forward\nbacktracking is surprisingly effective at mitigation. 
We thoroughly evaluate\nthe proposed method with prior art on three benchmark datasets for text\nsummarization. The results show that CoBa is effective and efficient in\nreducing hallucination, and offers great adaptability and flexibility. Code can\nbe found at https://github.com/zhenzhel/CoBa.\n","authors":["Zhenzhen Liu","Chao Wan","Varsha Kishore","Jin Peng Zhou","Minmin Chen","Kilian Q. Weinberger"],"pdf_url":"https://arxiv.org/pdf/2310.16176v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02228v1","updated":"2024-09-03T18:55:54Z","published":"2024-09-03T18:55:54Z","title":"Unforgettable Generalization in Language Models","summary":" When language models (LMs) are trained to forget (or \"unlearn'') a skill, how\nprecisely does their behavior change? We study the behavior of transformer LMs\nin which tasks have been forgotten via fine-tuning on randomized labels. Such\nLMs learn to generate near-random predictions for individual examples in the\n\"training'' set used for forgetting. Across tasks, however, LMs exhibit extreme\nvariability in whether LM predictions change on examples outside the training\nset. In some tasks (like entailment classification), forgetting generalizes\nrobustly, and causes models to produce uninformative predictions on new task\ninstances; in other tasks (like physical commonsense reasoning and scientific\nquestion answering) forgetting affects only the training examples, and models\ncontinue to perform the \"forgotten'' task accurately even for examples very\nsimilar to those that appeared in the training set. Dataset difficulty is not\npredictive of whether a behavior can be forgotten; instead, generalization in\nforgetting is (weakly) predicted by the confidence of LMs' initial task\npredictions and the variability of LM representations of training data, with\nlow confidence and low variability both associated with greater generalization.\nPerhaps most surprisingly, random-label forgetting appears to be somewhat\ninsensitive to the contents of the training set: for example, models trained on\nscience questions with random labels continue to answer other science questions\naccurately, but begin to produce random labels on entailment classification\ntasks. Finally, we show that even generalizable forgetting is shallow: linear\nprobes trained on LMs' representations can still perform tasks reliably after\nforgetting. Our results highlight the difficulty and unpredictability of\nperforming targeted skill removal from models via fine-tuning.\n","authors":["Eric Zhang","Leshem Chosen","Jacob Andreas"],"pdf_url":"https://arxiv.org/pdf/2409.02228v1.pdf","comment":"18 pages, 9 figures, published in First Conference on Language\n Modeling 2024"},{"id":"http://arxiv.org/abs/2409.02865v1","updated":"2024-09-03T17:59:50Z","published":"2024-09-03T17:59:50Z","title":"Visually Grounded Speech Models for Low-resource Languages and Cognitive\n Modelling","summary":" This dissertation examines visually grounded speech (VGS) models that learn\nfrom unlabelled speech paired with images. It focuses on applications for\nlow-resource languages and understanding human language acquisition. We\nintroduce a task called visually prompted keyword localisation to detect and\nlocalise keywords in speech using images. We demonstrate the effectiveness of\nVGS models in few-shot learning scenarios for low-resource languages like\nYoruba. Additionally, we examine the mutual exclusivity bias in VGS models. 
Our\nmonolingual VGS model exhibits this bias, but we found that multilingualism\ndoes not affect the bias in this VGS model similarly to what is observed in\nchildren.\n","authors":["Leanne Nortje"],"pdf_url":"https://arxiv.org/pdf/2409.02865v1.pdf","comment":"PhD Dissertation"},{"id":"http://arxiv.org/abs/2409.02098v1","updated":"2024-09-03T17:54:40Z","published":"2024-09-03T17:54:40Z","title":"CRAFT Your Dataset: Task-Specific Synthetic Dataset Generation Through\n Corpus Retrieval and Augmentation","summary":" Building high-quality datasets for specialized tasks is a time-consuming and\nresource-intensive process that often requires specialized domain knowledge. We\npropose Corpus Retrieval and Augmentation for Fine-Tuning (CRAFT), a method for\ngenerating synthetic datasets, given a small number of user-written few-shots\nthat demonstrate the task to be performed. Given the few-shot examples, we use\nlarge-scale public web-crawled corpora and similarity-based document retrieval\nto find other relevant human-written documents. Lastly, instruction-tuned large\nlanguage models (LLMs) augment the retrieved documents into custom-formatted\ntask samples, which then can be used for fine-tuning. We demonstrate that CRAFT\ncan efficiently generate large-scale task-specific training datasets for four\ndiverse tasks: biology question-answering (QA), medicine QA and commonsense QA\nas well as summarization. Our experiments show that CRAFT-based models\noutperform or achieve comparable performance to general LLMs for QA tasks,\nwhile CRAFT-based summarization models outperform models trained on\nhuman-curated data by 46 preference points.\n","authors":["Ingo Ziegler","Abdullatif Köksal","Desmond Elliott","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2409.02098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15444v2","updated":"2024-09-03T17:48:55Z","published":"2024-05-30T18:07:13Z","title":"Investigating the Robustness of LLMs on Math Word Problems","summary":" Large Language Models (LLMs) excel at various tasks, including solving math\nword problems (MWPs), but struggle with real-world problems containing\nirrelevant information. To address this, we propose a prompting framework that\ngenerates adversarial variants of MWPs by adding irrelevant variables. We\nintroduce a dataset, ProbleMATHIC, containing both adversarial and\nnon-adversarial MWPs. Our experiments reveal that LLMs are susceptible to\ndistraction by numerical noise, resulting in an average relative performance\ndrop of ~26% on adversarial MWPs. To mitigate this, we fine-tune LLMs (Llama-2,\nMistral) on the adversarial samples from our dataset. Fine-tuning on\nadversarial training instances improves performance on adversarial MWPs by ~8%,\nindicating increased robustness to noise and better ability to identify\nrelevant data for reasoning. Finally, to assess the generalizability of our\nprompting framework, we introduce GSM-8K-Adv, an adversarial variant of the\nGSM-8K benchmark. 
LLMs continue to struggle when faced with adversarial\ninformation, reducing performance by up to ~6%.\n","authors":["Ujjwala Anantheswaran","Himanshu Gupta","Kevin Scaria","Shreyas Verma","Chitta Baral","Swaroop Mishra"],"pdf_url":"https://arxiv.org/pdf/2406.15444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02078v1","updated":"2024-09-03T17:26:17Z","published":"2024-09-03T17:26:17Z","title":"Political DEBATE: Efficient Zero-shot and Few-shot Classifiers for\n Political Text","summary":" Social scientists quickly adopted large language models due to their ability\nto annotate documents without supervised training, an ability known as\nzero-shot learning. However, due to their compute demands, cost, and often\nproprietary nature, these models are often at odds with replication and open\nscience standards. This paper introduces the Political DEBATE (DeBERTa\nAlgorithm for Textual Entailment) language models for zero-shot and few-shot\nclassification of political documents. These models are not only as good, or\nbetter than, state-of-the art large language models at zero and few-shot\nclassification, but are orders of magnitude more efficient and completely open\nsource. By training the models on a simple random sample of 10-25 documents,\nthey can outperform supervised classifiers trained on hundreds or thousands of\ndocuments and state-of-the-art generative models with complex, engineered\nprompts. Additionally, we release the PolNLI dataset used to train these models\n-- a corpus of over 200,000 political documents with highly accurate labels\nacross over 800 classification tasks.\n","authors":["Michael Burnham","Kayla Kahn","Ryan Yank Wang","Rachel X. Peng"],"pdf_url":"https://arxiv.org/pdf/2409.02078v1.pdf","comment":"26 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.02076v1","updated":"2024-09-03T17:25:54Z","published":"2024-09-03T17:25:54Z","title":"Spinning the Golden Thread: Benchmarking Long-Form Generation in\n Language Models","summary":" The abilities of long-context language models (LMs) are often evaluated using\nthe \"Needle-in-a-Haystack\" (NIAH) test, which comprises tasks designed to\nassess a model's ability to identify specific information (\"needle\") within\nlarge text sequences (\"haystack\"). While these benchmarks measure how well\nmodels understand long-context input sequences, they do not effectively gauge\nthe quality of long-form text generation--a critical aspect for applications\nsuch as design proposals and creative writing. To address this gap, we have\nintroduced a new long-form text evaluation benchmark, Spinning the Golden\nThread (SGT), which tests models' ability to identify specific events within\ngenerated long text sequences. In this benchmark, we prompt long-context LMs to\ncreate long-form text that must include particular events or constraints and\nevaluate their ability to incorporate these elements. We evaluated ten\nlong-context LMs across four distinct scenarios, three types of prompt\ninstructions, and two different generation-length settings (16K and 32K).\nAlthough these models perform well on NIAH benchmarks, none demonstrated\nsatisfactory performance on the Spinning the Golden Thread, raising concerns\nabout their ability to generate coherent long-form text that follows\ninstructions. 
Additionally, as the length of the generated text increases, all\nmodels exhibit a significant drop in performance.\n","authors":["Yuhao Wu","Ming Shan Hee","Zhiqing Hu","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2409.02076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02060v1","updated":"2024-09-03T17:08:20Z","published":"2024-09-03T17:08:20Z","title":"OLMoE: Open Mixture-of-Experts Language Models","summary":" We introduce OLMoE, a fully open, state-of-the-art language model leveraging\nsparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but\nuses only 1B per input token. We pretrain it on 5 trillion tokens and further\nadapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available\nmodels with similar active parameters, even surpassing larger ones like\nLlama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE\ntraining, analyze routing in our model showing high specialization, and\nopen-source all aspects of our work: model weights, training data, code, and\nlogs.\n","authors":["Niklas Muennighoff","Luca Soldaini","Dirk Groeneveld","Kyle Lo","Jacob Morrison","Sewon Min","Weijia Shi","Pete Walsh","Oyvind Tafjord","Nathan Lambert","Yuling Gu","Shane Arora","Akshita Bhagia","Dustin Schwenk","David Wadden","Alexander Wettig","Binyuan Hui","Tim Dettmers","Douwe Kiela","Ali Farhadi","Noah A. Smith","Pang Wei Koh","Amanpreet Singh","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2409.02060v1.pdf","comment":"61 pages (24 main), 36 figures, 14 tables"},{"id":"http://arxiv.org/abs/2409.02050v1","updated":"2024-09-03T16:53:38Z","published":"2024-09-03T16:53:38Z","title":"Enhancing Code-Switching Speech Recognition with LID-Based Collaborative\n Mixture of Experts Model","summary":" Due to the inherent difficulty in modeling phonetic similarities across\ndifferent languages, code-switching speech recognition presents a formidable\nchallenge. This study proposes a Collaborative-MoE, a Mixture of Experts (MoE)\nmodel that leverages a collaborative mechanism among expert groups. Initially,\na preceding routing network explicitly learns Language Identification (LID)\ntasks and selects experts based on acquired LID weights. This process ensures\nrobust routing information to the MoE layer, mitigating interference from\ndiverse language domains on expert network parameter updates. The LID weights\nare also employed to facilitate inter-group collaboration, enabling the\nintegration of language-specific representations. Furthermore, within each\nlanguage expert group, a gating network operates unsupervised to foster\ncollaboration on attributes beyond language. Extensive experiments demonstrate\nthe efficacy of our approach, achieving significant performance enhancements\ncompared to alternative methods. Importantly, our method preserves the\nefficient inference capabilities characteristic of MoE models without\nnecessitating additional pre-training.\n","authors":["Hukai Huang","Jiayan Lin","Kaidi Wang","Yishuang Li","Wenhao Guan","Qingyang Hong","Lin Li"],"pdf_url":"https://arxiv.org/pdf/2409.02050v1.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2409.02038v1","updated":"2024-09-03T16:37:45Z","published":"2024-09-03T16:37:45Z","title":"BEAVER: An Enterprise Benchmark for Text-to-SQL","summary":" Existing text-to-SQL benchmarks have largely been constructed using publicly\navailable tables from the web with human-generated tests containing question\nand SQL statement pairs. 
They typically show very good results and lead people\nto think that LLMs are effective at text-to-SQL tasks. In this paper, we apply\noff-the-shelf LLMs to a benchmark containing enterprise data warehouse data. In\nthis environment, LLMs perform poorly, even when standard prompt engineering\nand RAG techniques are utilized. As we will show, the reasons for poor\nperformance are largely due to three characteristics: (1) public LLMs cannot\ntrain on enterprise data warehouses because they are largely in the \"dark web\",\n(2) schemas of enterprise tables are more complex than the schemas in public\ndata, which makes the SQL-generation task innately harder, and (3)\nbusiness-oriented questions are often more complex, requiring joins over\nmultiple tables and aggregations. As a result, we propose a new dataset BEAVER,\nsourced from real enterprise data warehouses together with natural language\nqueries and their correct SQL statements which we collected from actual user\nhistory. We evaluated this dataset using recent LLMs and demonstrated their\npoor performance on this task. We hope this dataset will facilitate future\nresearchers building more sophisticated text-to-SQL systems which can do better\non this important class of data.\n","authors":["Peter Baile Chen","Fabian Wenz","Yi Zhang","Moe Kayali","Nesime Tatbul","Michael Cafarella","Çağatay Demiralp","Michael Stonebraker"],"pdf_url":"https://arxiv.org/pdf/2409.02038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.05783v2","updated":"2024-09-03T16:23:55Z","published":"2024-04-08T17:53:21Z","title":"A Survey on Responsible Generative AI: What to Generate and What Not","summary":" In recent years, generative AI (GenAI), like large language models and\ntext-to-image models, has received significant attention across various\ndomains. However, ensuring the responsible generation of content by these\nmodels is crucial for their real-world applicability. This raises an\ninteresting question: What should responsible GenAI generate, and what should\nit not? To answer the question, this paper investigates the practical\nresponsible requirements of both textual and visual generative models,\noutlining five key considerations: generating truthful content, avoiding toxic\ncontent, refusing harmful instruction, leaking no training data-related\ncontent, and ensuring generated content is identifiable. Specifically, we review\nrecent advancements and challenges in addressing these requirements. Besides,\nwe discuss and emphasize the importance of responsible GenAI across healthcare,\neducation, finance, and artificial general intelligence domains. Through a\nunified perspective on both textual and visual generative models, this paper\naims to provide insights into practical safety-related issues and further\nbenefit the community in building responsible GenAI.\n","authors":["Jindong Gu"],"pdf_url":"https://arxiv.org/pdf/2404.05783v2.pdf","comment":"77 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.02026v1","updated":"2024-09-03T16:20:22Z","published":"2024-09-03T16:20:22Z","title":"Foundations of Large Language Model Compression -- Part 1: Weight\n Quantization","summary":" In recent years, compression of large language models (LLMs) has emerged as\nan important problem to allow language model deployment on resource-constrained\ndevices, reduce computational costs, and mitigate the environmental footprint\nof large-scale AI infrastructure. 
In this paper, we present the foundations of\nLLM quantization from a convex optimization perspective and propose a\nquantization method that builds on these foundations and outperforms previous\nmethods. Our quantization framework, CVXQ, scales to models containing hundreds\nof billions of weight parameters and provides users with the flexibility to\ncompress models to any specified model size, post-training. A reference\nimplementation of CVXQ can be obtained from https://github.com/seannz/cvxq.\n","authors":["Sean I. Young"],"pdf_url":"https://arxiv.org/pdf/2409.02026v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2309.00649v2","updated":"2024-09-03T15:23:26Z","published":"2023-08-31T12:53:52Z","title":"GPT has become financially literate: Insights from financial literacy\n tests of GPT and a preliminary test of how people use it as a source of\n advice","summary":" We assess the ability of GPT -- a large language model -- to serve as a\nfinancial robo-advisor for the masses, by using a financial literacy test.\nDavinci and ChatGPT based on GPT-3.5 score 66% and 65% on the financial\nliteracy test, respectively, compared to a baseline of 33%. However, ChatGPT\nbased on GPT-4 achieves a near-perfect 99% score, pointing to financial\nliteracy becoming an emergent ability of state-of-the-art models. We use the\nJudge-Advisor System and a savings dilemma to illustrate how researchers might\nassess advice-utilization from large language models. We also present a number\nof directions for future research.\n","authors":["Paweł Niszczota","Sami Abbas"],"pdf_url":"https://arxiv.org/pdf/2309.00649v2.pdf","comment":"43 pages, 2 figures and 2 tables in main text; in V2 added\n information that this is the Author Accepted Manuscript version"},{"id":"http://arxiv.org/abs/2402.01115v4","updated":"2024-09-03T15:14:11Z","published":"2024-02-02T03:15:13Z","title":"Interpretation of Intracardiac Electrograms Through Textual\n Representations","summary":" Understanding the irregular electrical activity of atrial fibrillation (AFib)\nhas been a key challenge in electrocardiography. For serious cases of AFib,\ncatheter ablations are performed to collect intracardiac electrograms (EGMs).\nEGMs offer intricately detailed and localized electrical activity of the heart\nand are an ideal modality for interpretable cardiac studies. Recent\nadvancements in artificial intelligence (AI) have allowed some works to utilize\ndeep learning frameworks to interpret EGMs during AFib. Additionally, language\nmodels (LMs) have shown exceptional performance in being able to generalize to\nunseen domains, especially in healthcare. In this study, we are the first to\nleverage pretrained LMs for finetuning of EGM interpolation and AFib\nclassification via masked language modeling. We formulate the EGM as a textual\nsequence and present competitive performances on AFib classification compared\nagainst other representations. Lastly, we provide a comprehensive\ninterpretability study to provide a multi-perspective intuition of the model's\nbehavior, which could greatly benefit the clinical use.\n","authors":["William Jongwon Han","Diana Gomez","Avi Alok","Chaojing Duan","Michael A. 
Rosenberg","Douglas Weber","Emerson Liu","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.01115v4.pdf","comment":"17 pages, 7 figures; Accepted to CHIL 2024"},{"id":"http://arxiv.org/abs/2409.01944v1","updated":"2024-09-03T14:40:31Z","published":"2024-09-03T14:40:31Z","title":"FuzzCoder: Byte-level Fuzzing Test via Large Language Model","summary":" Fuzzing is an important dynamic program analysis technique designed for\nfinding vulnerabilities in complex software. Fuzzing involves presenting a\ntarget program with crafted malicious input to cause crashes, buffer overflows,\nmemory errors, and exceptions. Crafting malicious inputs in an efficient manner\nis a difficult open problem and the best approaches often apply uniform random\nmutations to pre-existing valid inputs. In this work, we propose to adopt\nfine-tuned large language models (FuzzCoder) to learn patterns in the input\nfiles from successful attacks to guide future fuzzing explorations.\nSpecifically, we develop a framework to leverage the code LLMs to guide the\nmutation process of inputs in fuzzing. The mutation process is formulated as\nsequence-to-sequence modeling, where the LLM receives a sequence of bytes and\nthen outputs the mutated byte sequence. FuzzCoder is fine-tuned on the created\ninstruction dataset (Fuzz-Instruct), where the successful fuzzing history is\ncollected from the heuristic fuzzing tool. FuzzCoder can predict mutation\nlocations and strategies in input files to trigger abnormal behaviors\nof the program. Experimental results show that FuzzCoder based on AFL (American\nFuzzy Lop) gains significant improvements in terms of effective proportion of\nmutation (EPM) and number of crashes (NC) for various input formats including\nELF, JPG, MP3, and XML.\n","authors":["Liqun Yang","Jian Yang","Chaoren Wei","Guanglin Niu","Ge Zhang","Yunli Wang","Linzheng ChaI","Wanxu Xia","Hongcheng Guo","Shun Zhang","Jiaheng Liu","Yuwei Yin","Junran Peng","Jiaxin Ma","Liang Sun","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2409.01944v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2409.01941v1","updated":"2024-09-03T14:38:29Z","published":"2024-09-03T14:38:29Z","title":"Towards Leveraging Large Language Models for Automated Medical Q&A\n Evaluation","summary":" This paper explores the potential of using Large Language Models (LLMs) to\nautomate the evaluation of responses in medical Question and Answer (Q\\&A)\nsystems, a crucial form of Natural Language Processing. Traditionally, human\nevaluation has been indispensable for assessing the quality of these responses.\nHowever, manual evaluation by medical professionals is time-consuming and\ncostly. Our study examines whether LLMs can reliably replicate human\nevaluations by using questions derived from patient data, thereby saving\nvaluable time for medical experts. While the findings suggest promising\nresults, further research is needed to address more specific or complex\nquestions that were beyond the scope of this initial investigation.\n","authors":["Jack Krolik","Herprit Mahal","Feroz Ahmad","Gaurav Trivedi","Bahador Saket"],"pdf_url":"https://arxiv.org/pdf/2409.01941v1.pdf","comment":"10 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.00267v3","updated":"2024-09-03T14:01:54Z","published":"2023-09-01T05:53:33Z","title":"RLAIF vs. 
RLHF: Scaling Reinforcement Learning from Human Feedback with\n AI Feedback","summary":" Reinforcement learning from human feedback (RLHF) has proven effective in\naligning large language models (LLMs) with human preferences, but gathering\nhigh-quality preference labels is expensive. RL from AI Feedback (RLAIF),\nintroduced in Bai et al., offers a promising alternative that trains the reward\nmodel (RM) on preferences generated by an off-the-shelf LLM. Across the tasks\nof summarization, helpful dialogue generation, and harmless dialogue\ngeneration, we show that RLAIF achieves comparable performance to RLHF.\nFurthermore, we take a step towards \"self-improvement\" by demonstrating that\nRLAIF can outperform a supervised fine-tuned baseline even when the AI labeler\nis the same size as the policy, or even the exact same checkpoint as the\ninitial policy. Finally, we introduce direct-RLAIF (d-RLAIF) - a technique that\ncircumvents RM training by obtaining rewards directly from an off-the-shelf LLM\nduring RL, which achieves superior performance to canonical RLAIF. Our results\nsuggest that RLAIF can achieve performance on-par with using human feedback,\noffering a potential solution to the scalability limitations of RLHF.\n","authors":["Harrison Lee","Samrat Phatale","Hassan Mansoor","Thomas Mesnard","Johan Ferret","Kellie Lu","Colton Bishop","Ethan Hall","Victor Carbune","Abhinav Rastogi","Sushant Prakash"],"pdf_url":"https://arxiv.org/pdf/2309.00267v3.pdf","comment":"Presented at ICML 2024"},{"id":"http://arxiv.org/abs/2409.01901v1","updated":"2024-09-03T13:44:56Z","published":"2024-09-03T13:44:56Z","title":"3D-LEX v1.0: 3D Lexicons for American Sign Language and Sign Language of\n the Netherlands","summary":" In this work, we present an efficient approach for capturing sign language in\n3D, introduce the 3D-LEX v1.0 dataset, and detail a method for semi-automatic\nannotation of phonetic properties. Our procedure integrates three motion\ncapture techniques encompassing high-resolution 3D poses, 3D handshapes, and\ndepth-aware facial features, and attains an average sampling rate of one sign\nevery 10 seconds. This includes the time for presenting a sign example,\nperforming and recording the sign, and archiving the capture. The 3D-LEX\ndataset includes 1,000 signs from American Sign Language and an additional\n1,000 signs from the Sign Language of the Netherlands. We showcase the dataset\nutility by presenting a simple method for generating handshape annotations\ndirectly from 3D-LEX. We produce handshape labels for 1,000 signs from American\nSign Language and evaluate the labels in a sign recognition task. The labels\nenhance gloss recognition accuracy by 5% over using no handshape annotations,\nand by 1% over expert annotations. Our motion capture data supports in-depth\nanalysis of sign features and facilitates the generation of 2D projections from\nany viewpoint. The 3D-LEX collection has been aligned with existing sign\nlanguage benchmarks and linguistic resources, to support studies in 3D-aware\nsign language processing.\n","authors":["Oline Ranum","Gomer Otterspeer","Jari I. Andersen","Robert G. Belleman","Floris Roelofsen"],"pdf_url":"https://arxiv.org/pdf/2409.01901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01893v1","updated":"2024-09-03T13:30:00Z","published":"2024-09-03T13:30:00Z","title":"What are the Essential Factors in Crafting Effective Long Context\n Multi-Hop Instruction Datasets? 
Insights and Best Practices","summary":" Recent advancements in large language models (LLMs) with extended context\nwindows have significantly improved tasks such as information extraction,\nquestion answering, and complex planning scenarios. In order to achieve success\nin long context tasks, a large amount of work has been done to enhance the long\ncontext capabilities of the model through synthetic data. Existing methods\ntypically utilize the Self-Instruct framework to generate instruction tuning\ndata for better long context capability improvement. However, our preliminary\nexperiments indicate that less than 35% of generated samples are multi-hop, and\nmore than 40% exhibit poor quality, limiting comprehensive understanding and\nfurther research. To improve the quality of synthetic data, we propose the\nMulti-agent Interactive Multi-hop Generation (MIMG) framework, incorporating a\nQuality Verification Agent, a Single-hop Question Generation Agent, a Multiple\nQuestion Sampling Strategy, and a Multi-hop Question Merger Agent. This\nframework improves the data quality, with the proportion of high-quality,\nmulti-hop, and diverse data exceeding 85%. Furthermore, we systematically\ninvestigate strategies for document selection, question merging, and validation\ntechniques through extensive experiments across various models. Our findings\nshow that our synthetic high-quality long-context instruction data\nsignificantly enhances model performance, even surpassing models trained on\nlarger amounts of human-annotated data. Our code is available at:\nhttps://github.com/WowCZ/LongMIT.\n","authors":["Zhi Chen","Qiguang Chen","Libo Qin","Qipeng Guo","Haijun Lv","Yicheng Zou","Wanxiang Che","Hang Yan","Kai Chen","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2409.01893v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2409.01882v1","updated":"2024-09-03T13:23:11Z","published":"2024-09-03T13:23:11Z","title":"Investigating Expert-in-the-Loop LLM Discourse Patterns for Ancient\n Intertextual Analysis","summary":" This study explores the potential of large language models (LLMs) for\nidentifying and examining intertextual relationships within biblical, Koine\nGreek texts. By evaluating the performance of LLMs on various intertextuality\nscenarios the study demonstrates that these models can detect direct\nquotations, allusions, and echoes between texts. The LLM's ability to generate\nnovel intertextual observations and connections highlights its potential to\nuncover new insights. However, the model also struggles with long query\npassages and the inclusion of false intertextual dependences, emphasizing the\nimportance of expert evaluation. The expert-in-the-loop methodology presented\noffers a scalable approach for intertextual research into the complex web of\nintertextuality within and beyond the biblical corpus.\n","authors":["Ray Umphrey","Jesse Roberts","Lindsey Roberts"],"pdf_url":"https://arxiv.org/pdf/2409.01882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01864v1","updated":"2024-09-03T13:05:38Z","published":"2024-09-03T13:05:38Z","title":"The Role of Large Language Models in Musicology: Are We Ready to Trust\n the Machines?","summary":" In this work, we explore the use and reliability of Large Language Models\n(LLMs) in musicology. From a discussion with experts and students, we assess\nthe current acceptance and concerns regarding this, nowadays ubiquitous,\ntechnology. 
We aim to go one step further, proposing a semi-automatic method to\ncreate an initial benchmark using retrieval-augmented generation models and\nmultiple-choice question generation, validated by human experts. Our evaluation\non 400 human-validated questions shows that current vanilla LLMs are less\nreliable than retrieval augmented generation from music dictionaries. This\npaper suggests that the potential of LLMs in musicology requires\nmusicology-driven research that can specialize LLMs by including accurate and\nreliable domain knowledge.\n","authors":["Pedro Ramoneda","Emilia Parada-Cabaleiro","Benno Weck","Xavier Serra"],"pdf_url":"https://arxiv.org/pdf/2409.01864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01854v1","updated":"2024-09-03T12:53:05Z","published":"2024-09-03T12:53:05Z","title":"AgentRE: An Agent-Based Framework for Navigating Complex Information\n Landscapes in Relation Extraction","summary":" Relation extraction (RE) in complex scenarios faces challenges such as\ndiverse relation types and ambiguous relations between entities within a single\nsentence, leading to the poor performance of pure \"text-in, text-out\" language\nmodels (LMs). To address these challenges, in this paper, we propose an\nagent-based RE framework, namely AgentRE, which fully leverages the potential\nof large language models (LLMs) including memory, retrieval and reflection, to\nachieve RE in complex scenarios. Specifically, three major modules are built in\nAgentRE serving as the tools to help the agent acquire and process various\nuseful information, thereby obtaining improved RE performance. Our extensive\nexperimental results on two datasets in English and Chinese demonstrate our\nAgentRE's superior performance, especially in low-resource scenarios.\nAdditionally, the trajectories generated by AgentRE can be refined to construct\na high-quality training dataset incorporating different reasoning methods,\nwhich can be used to fine-tune smaller models. Code is available at\nhttps://github.com/Lightblues/AgentRE.\n","authors":["Yuchen Shi","Guochao Jiang","Tian Qiu","Deqing Yang"],"pdf_url":"https://arxiv.org/pdf/2409.01854v1.pdf","comment":"Accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2409.01835v1","updated":"2024-09-03T12:34:21Z","published":"2024-09-03T12:34:21Z","title":"Towards Generative Class Prompt Learning for Few-shot Visual Recognition","summary":" Although foundational vision-language models (VLMs) have proven to be very\nsuccessful for various semantic discrimination tasks, they still struggle to\nperform faithfully for fine-grained categorization. Moreover, foundational\nmodels trained on one domain do not generalize well on a different domain\nwithout fine-tuning. We attribute these to the limitations of the VLM's\nsemantic representations and attempt to improve their fine-grained visual\nawareness using generative modeling. Specifically, we propose two novel\nmethods: Generative Class Prompt Learning (GCPL) and Contrastive Multi-class\nPrompt Learning (CoMPLe). Utilizing text-to-image diffusion models, GCPL\nsignificantly improves the visio-linguistic synergy in class embeddings by\nconditioning on few-shot exemplars with learnable class prompts. 
CoMPLe builds\non this foundation by introducing a contrastive learning component that\nencourages inter-class separation during the generative optimization process.\nOur empirical results demonstrate that such a generative class prompt learning\napproach substantially outperforms existing methods, offering a better\nalternative to few-shot image recognition challenges. The source code will be\nmade available at: https://github.com/soumitri2001/GCPL.\n","authors":["Soumitri Chattopadhyay","Sanket Biswas","Emanuele Vivoli","Josep Lladós"],"pdf_url":"https://arxiv.org/pdf/2409.01835v1.pdf","comment":"Accepted at BMVC 2024"},{"id":"http://arxiv.org/abs/2409.01808v1","updated":"2024-09-03T11:40:38Z","published":"2024-09-03T11:40:38Z","title":"Dialogue You Can Trust: Human and AI Perspectives on Generated\n Conversations","summary":" As dialogue systems and chatbots increasingly integrate into everyday\ninteractions, the need for efficient and accurate evaluation methods becomes\nparamount. This study explores the comparative performance of human and AI\nassessments across a range of dialogue scenarios, focusing on seven key\nperformance indicators (KPIs): Coherence, Innovation, Concreteness, Goal\nContribution, Commonsense Contradiction, Incorrect Fact, and Redundancy.\nUtilizing the GPT-4o API, we generated a diverse dataset of conversations and\nconducted a two-part experimental analysis. In Experiment 1, we evaluated\nmulti-party conversations on Coherence, Innovation, Concreteness, and Goal\nContribution, revealing that GPT models align closely with human judgments.\nNotably, both human and AI evaluators exhibited a tendency towards binary\njudgment rather than linear scaling, highlighting a shared challenge in these\nassessments. Experiment 2 extended the work of Finch et al. (2023) by focusing\non dyadic dialogues and assessing Commonsense Contradiction, Incorrect Fact,\nand Redundancy. The results indicate that while GPT-4o demonstrates strong\nperformance in maintaining factual accuracy and commonsense reasoning, it still\nstruggles with reducing redundancy and self-contradiction. Our findings\nunderscore the potential of GPT models to closely replicate human evaluation in\ndialogue systems, while also pointing to areas for improvement. This research\noffers valuable insights for advancing the development and implementation of\nmore refined dialogue evaluation methodologies, contributing to the evolution\nof more effective and human-like AI communication tools.\n","authors":["Ike Ebubechukwu","Johane Takeuchi","Antonello Ceravola","Frank Joublin"],"pdf_url":"https://arxiv.org/pdf/2409.01808v1.pdf","comment":"17 pages, 15 figures, shorter version submitted to 22nd Annual\n Workshop of the Australasian Language Technology Association (ALTA'24)"},{"id":"http://arxiv.org/abs/2409.01806v1","updated":"2024-09-03T11:39:52Z","published":"2024-09-03T11:39:52Z","title":"LASP: Surveying the State-of-the-Art in Large Language Model-Assisted AI\n Planning","summary":" Effective planning is essential for the success of any task, from organizing\na vacation to routing autonomous vehicles and developing corporate strategies.\nIt involves setting goals, formulating plans, and allocating resources to\nachieve them. LLMs are particularly well-suited for automated planning due to\ntheir strong capabilities in commonsense reasoning. They can deduce a sequence\nof actions needed to achieve a goal from a given state and identify an\neffective course of action. 
However, it is frequently observed that plans\ngenerated through direct prompting often fail upon execution. Our survey aims\nto highlight the existing challenges in planning with language models, focusing\non key areas such as embodied environments, optimal scheduling, competitive and\ncooperative games, task decomposition, reasoning, and planning. Through this\nstudy, we explore how LLMs transform AI planning and provide unique insights\ninto the future of LM-assisted planning.\n","authors":["Haoming Li","Zhaoliang Chen","Jonathan Zhang","Fei Liu"],"pdf_url":"https://arxiv.org/pdf/2409.01806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01790v1","updated":"2024-09-03T11:09:44Z","published":"2024-09-03T11:09:44Z","title":"Training on the Benchmark Is Not All You Need","summary":" The success of Large Language Models (LLMs) relies heavily on the huge amount\nof pre-training data learned in the pre-training phase. The opacity of the\npre-training process and the training data causes the results of many benchmark\ntests to become unreliable. If any model has been trained on a benchmark test\nset, it can seriously hinder the health of the field. In order to automate and\nefficiently test the capabilities of large language models, numerous mainstream\nbenchmarks adopt a multiple-choice format. As the swapping of the contents of\nmultiple-choice options does not affect the meaning of the question itself, we\npropose a simple and effective data leakage detection method based on this\nproperty. Specifically, we shuffle the contents of the options in the data to\ngenerate the corresponding derived data sets, and then detect data leakage\nbased on the model's log probability distribution over the derived data sets.\nIf there is a maximum and outlier in the set of log probabilities, it indicates\nthat the data is leaked. Our method is able to work under black-box conditions\nwithout access to model training data or weights, effectively identifying data\nleakage from benchmark test sets in model pre-training data, including both\nnormal scenarios and complex scenarios where options may have been shuffled\nintentionally or unintentionally. Through experiments based on two LLMs and\nbenchmark designs, we demonstrate the effectiveness of our method. In addition,\nwe evaluate the degree of data leakage of 31 mainstream open-source LLMs on\nfour benchmark datasets and give a ranking of the leaked LLMs for each\nbenchmark, and we find that the Qwen family of LLMs has the highest degree of\ndata leakage.\n","authors":["Shiwen Ni","Xiangtao Kong","Chengming Li","Xiping Hu","Ruifeng Xu","Jia Zhu","Min Yang"],"pdf_url":"https://arxiv.org/pdf/2409.01790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01787v1","updated":"2024-09-03T11:06:45Z","published":"2024-09-03T11:06:45Z","title":"LLM-GAN: Construct Generative Adversarial Network Through Large Language\n Models For Explainable Fake News Detection","summary":" Explainable fake news detection predicts the authenticity of news items with\nannotated explanations. Today, Large Language Models (LLMs) are known for their\npowerful natural language understanding and explanation generation abilities.\nHowever, applying LLMs to explainable fake news detection presents two main\nchallenges. 
Firstly, fake news appears reasonable and could easily mislead\nLLMs, leaving them unable to understand the complex news-faking process.\nSecondly, utilizing LLMs for this task would generate both correct and\nincorrect explanations, which necessitates abundant labor in the loop. In this\npaper, we propose LLM-GAN, a novel framework that utilizes prompting mechanisms\nto enable an LLM to act as both Generator and Detector for realistic fake news\ngeneration and detection. Our results demonstrate LLM-GAN's effectiveness in\nboth prediction performance and explanation quality. We further showcase the\nintegration of LLM-GAN into a cloud-native AI platform to provide better fake\nnews detection service in the cloud.\n","authors":["Yifeng Wang","Zhouhong Gu","Siwei Zhang","Suhang Zheng","Tao Wang","Tianyu Li","Hongwei Feng","Yanghua Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.01787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01780v1","updated":"2024-09-03T10:49:42Z","published":"2024-09-03T10:49:42Z","title":"State-of-the-art Advances of Deep-learning Linguistic Steganalysis\n Research","summary":" With the evolution of generative linguistic steganography techniques,\nconventional steganalysis falls short in robustly quantifying the alterations\ninduced by steganography, thereby complicating detection. Consequently, the\nresearch paradigm has pivoted towards deep-learning-based linguistic\nsteganalysis. This study offers a comprehensive review of existing\ncontributions and evaluates prevailing developmental trajectories.\nSpecifically, we first provided a formalized exposition of the general formulas\nfor linguistic steganalysis, while comparing the differences between this field\nand the domain of text classification. Subsequently, we classified the existing\nwork into two levels based on vector space mapping and feature extraction\nmodels, thereby comparing the research motivations, model advantages, and other\ndetails. A comparative analysis of the experiments is conducted to assess the\nperformances. Finally, the challenges faced by this field are discussed, and\nseveral directions for future development and key issues that urgently need to\nbe addressed are proposed.\n","authors":["Yihao Wang","Ru Zhang","Yifan Tang","Jianyi Liu"],"pdf_url":"https://arxiv.org/pdf/2409.01780v1.pdf","comment":"Accepted by 2023 International Conference on Data, Information and\n Computing Science"},{"id":"http://arxiv.org/abs/2409.01763v1","updated":"2024-09-03T10:16:43Z","published":"2024-09-03T10:16:43Z","title":"FC-KAN: Function Combinations in Kolmogorov-Arnold Networks","summary":" In this paper, we introduce FC-KAN, a Kolmogorov-Arnold Network (KAN) that\nleverages combinations of popular mathematical functions such as B-splines,\nwavelets, and radial basis functions on low-dimensional data through\nelement-wise operations. We explore several methods for combining the outputs\nof these functions, including sum, element-wise product, the addition of sum\nand element-wise product, quadratic function representation, and concatenation.\nIn our experiments, we compare FC-KAN with multi-layer perceptron network (MLP)\nand other existing KANs, such as BSRBF-KAN, EfficientKAN, FastKAN, and\nFasterKAN, on the MNIST and Fashion-MNIST datasets. A variant of FC-KAN, which\nuses a combination of outputs from B-splines and Difference of Gaussians (DoG)\nin the form of a quadratic function, outperformed all other models on the\naverage of 5 independent training runs. 
We expect that FC-KAN can leverage\nfunction combinations to design future KANs. Our repository is publicly\navailable at: https://github.com/hoangthangta/FC_KAN.\n","authors":["Hoang-Thang Ta","Duy-Quy Thai","Abu Bakar Siddiqur Rahman","Grigori Sidorov","Alexander Gelbukh"],"pdf_url":"https://arxiv.org/pdf/2409.01763v1.pdf","comment":"9 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.01754v1","updated":"2024-09-03T10:01:51Z","published":"2024-09-03T10:01:51Z","title":"Empirical evidence of Large Language Model's influence on human spoken\n communication","summary":" Artificial Intelligence (AI) agents now interact with billions of humans in\nnatural language, thanks to advances in Large Language Models (LLMs) like\nChatGPT. This raises the question of whether AI has the potential to shape a\nfundamental aspect of human culture: the way we speak. Recent analyses revealed\nthat scientific publications already exhibit evidence of AI-specific language.\nBut this evidence is inconclusive, since scientists may simply be using AI to\ncopy-edit their writing. To explore whether AI has influenced human spoken\ncommunication, we transcribed and analyzed about 280,000 English-language\nvideos of presentations, talks, and speeches from more than 20,000 YouTube\nchannels of academic institutions. We find a significant shift in the trend of\nword usage specific to words distinctively associated with ChatGPT following\nits release. These findings provide the first empirical evidence that humans\nincreasingly imitate LLMs in their spoken language. Our results raise societal\nand policy-relevant concerns about the potential of AI to unintentionally\nreduce linguistic diversity, or to be deliberately misused for mass\nmanipulation. They also highlight the need for further investigation into the\nfeedback loops between machine behavior and human culture.\n","authors":["Hiromu Yakura","Ezequiel Lopez-Lopez","Levin Brinkmann","Ignacio Serna","Prateek Gupta","Iyad Rahwan"],"pdf_url":"https://arxiv.org/pdf/2409.01754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01690v1","updated":"2024-09-03T08:13:06Z","published":"2024-09-03T08:13:06Z","title":"Taming CLIP for Fine-grained and Structured Visual Understanding of\n Museum Exhibits","summary":" CLIP is a powerful and widely used tool for understanding images in the\ncontext of natural language descriptions to perform nuanced tasks. However, it\ndoes not offer application-specific fine-grained and structured understanding,\ndue to its generic nature. In this work, we aim to adapt CLIP for fine-grained\nand structured -- in the form of tabular data -- visual understanding of museum\nexhibits. To facilitate such understanding we (a) collect, curate, and\nbenchmark a dataset of 200K+ image-table pairs, and (b) develop a method that\nallows predicting tabular outputs for input images. Our dataset is the first of\nits kind in the public domain. At the same time, the proposed method is novel\nin leveraging CLIP's powerful representations for fine-grained and tabular\nunderstanding. The proposed method (MUZE) learns to map CLIP's image embeddings\nto the tabular structure by means of a proposed transformer-based parsing\nnetwork (parseNet). More specifically, parseNet enables prediction of missing\nattribute values while integrating context from known attribute-value pairs for\nan input image. 
We show that this leads to significant improvement in accuracy.\nThrough exhaustive experiments, we show the effectiveness of the proposed\nmethod on fine-grained and structured understanding of museum exhibits, by\nachieving encouraging results in a newly established benchmark. Our dataset and\nsource-code can be found at: https://github.com/insait-institute/MUZE\n","authors":["Ada-Astrid Balauca","Danda Pani Paudel","Kristina Toutanova","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2409.01690v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2409.01666v1","updated":"2024-09-03T07:17:41Z","published":"2024-09-03T07:17:41Z","title":"In Defense of RAG in the Era of Long-Context Language Models","summary":" Overcoming the limited context limitations in early-generation LLMs,\nretrieval-augmented generation (RAG) has been a reliable solution for\ncontext-based answer generation in the past. Recently, the emergence of\nlong-context LLMs allows the models to incorporate much longer text sequences,\nmaking RAG less attractive. Recent studies show that long-context LLMs\nsignificantly outperform RAG in long-context applications. Unlike the existing\nworks favoring the long-context LLM over RAG, we argue that the extremely long\ncontext in LLMs suffers from a diminished focus on relevant information and\nleads to potential degradation in answer quality. This paper revisits the RAG\nin long-context answer generation. We propose an order-preserve\nretrieval-augmented generation (OP-RAG) mechanism, which significantly improves\nthe performance of RAG for long-context question-answer applications. With\nOP-RAG, as the number of retrieved chunks increases, the answer quality\ninitially rises, and then declines, forming an inverted U-shaped curve. There\nexist sweet points where OP-RAG could achieve higher answer quality with much\nless tokens than long-context LLM taking the whole context as input. Extensive\nexperiments on public benchmark demonstrate the superiority of our OP-RAG.\n","authors":["Tan Yu","Anbang Xu","Rama Akkiraju"],"pdf_url":"https://arxiv.org/pdf/2409.01666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01659v1","updated":"2024-09-03T07:01:46Z","published":"2024-09-03T07:01:46Z","title":"Interpreting and Improving Large Language Models in Arithmetic\n Calculation","summary":" Large language models (LLMs) have demonstrated remarkable potential across\nnumerous applications and have shown an emergent ability to tackle complex\nreasoning tasks, such as mathematical computations. However, even for the\nsimplest arithmetic calculations, the intrinsic mechanisms behind LLMs remain\nmysterious, making it challenging to ensure reliability. In this work, we delve\ninto uncovering a specific mechanism by which LLMs execute calculations.\nThrough comprehensive experiments, we find that LLMs frequently involve a small\nfraction (< 5%) of attention heads, which play a pivotal role in focusing on\noperands and operators during calculation processes. Subsequently, the\ninformation from these operands is processed through multi-layer perceptrons\n(MLPs), progressively leading to the final solution. These pivotal heads/MLPs,\nthough identified on a specific dataset, exhibit transferability across\ndifferent datasets and even distinct tasks. This insight prompted us to\ninvestigate the potential benefits of selectively fine-tuning these essential\nheads/MLPs to boost the LLMs' computational performance. 
We empirically find\nthat such precise tuning can yield notable enhancements on mathematical\nprowess, without compromising the performance on non-mathematical tasks. Our\nwork serves as a preliminary exploration into the arithmetic calculation\nabilities inherent in LLMs, laying a solid foundation to reveal more intricate\nmathematical tasks.\n","authors":["Wei Zhang","Chaoqun Wan","Yonggang Zhang","Yiu-ming Cheung","Xinmei Tian","Xu Shen","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2409.01659v1.pdf","comment":"Accepted by ICML 2024 (oral)"},{"id":"http://arxiv.org/abs/2409.01658v1","updated":"2024-09-03T07:01:37Z","published":"2024-09-03T07:01:37Z","title":"From Yes-Men to Truth-Tellers: Addressing Sycophancy in Large Language\n Models with Pinpoint Tuning","summary":" Large Language Models (LLMs) tend to prioritize adherence to user prompts\nover providing veracious responses, leading to the sycophancy issue. When\nchallenged by users, LLMs tend to admit mistakes and provide inaccurate\nresponses even if they initially provided the correct answer. Recent works\npropose to employ supervised fine-tuning (SFT) to mitigate the sycophancy\nissue, while it typically leads to the degeneration of LLMs' general\ncapability. To address the challenge, we propose a novel supervised pinpoint\ntuning (SPT), where the region-of-interest modules are tuned for a given\nobjective. Specifically, SPT first reveals and verifies a small percentage\n(<5%) of the basic modules, which significantly affect a particular behavior of\nLLMs. i.e., sycophancy. Subsequently, SPT merely fine-tunes these identified\nmodules while freezing the rest. To verify the effectiveness of the proposed\nSPT, we conduct comprehensive experiments, demonstrating that SPT significantly\nmitigates the sycophancy issue of LLMs (even better than SFT). Moreover, SPT\nintroduces limited or even no side effects on the general capability of LLMs.\nOur results shed light on how to precisely, effectively, and efficiently\nexplain and improve the targeted ability of LLMs.\n","authors":["Wei Chen","Zhen Huang","Liang Xie","Binbin Lin","Houqiang Li","Le Lu","Xinmei Tian","Deng Cai","Yonggang Zhang","Wenxiao Wan","Xu Shen","Jieping Ye"],"pdf_url":"https://arxiv.org/pdf/2409.01658v1.pdf","comment":"Accepted by ICML 2024"},{"id":"http://arxiv.org/abs/2409.01628v1","updated":"2024-09-03T05:53:57Z","published":"2024-09-03T05:53:57Z","title":"CTG-KrEW: Generating Synthetic Structured Contextually Correlated\n Content by Conditional Tabular GAN with K-Means Clustering and Efficient Word\n Embedding","summary":" Conditional Tabular Generative Adversarial Networks (CTGAN) and their various\nderivatives are attractive for their ability to efficiently and flexibly create\nsynthetic tabular data, showcasing strong performance and adaptability.\nHowever, there are certain critical limitations to such models. The first is\ntheir inability to preserve the semantic integrity of contextually correlated\nwords or phrases. For instance, skillset in freelancer profiles is one such\nattribute where individual skills are semantically interconnected and\nindicative of specific domain interests or qualifications. The second challenge\nof traditional approaches is that, when applied to generate contextually\ncorrelated tabular content, besides generating semantically shallow content,\nthey consume huge memory resources and CPU time during the training stage. 
To\naddress these problems, we introduce a novel framework, CTGKrEW (Conditional\nTabular GAN with KMeans Clustering and Word Embedding), which is adept at\ngenerating realistic synthetic tabular data where attributes are collections of\nsemantically and contextually coherent words. CTGKrEW is trained and evaluated\nusing a dataset from Upwork, a real-world freelancing platform. Comprehensive\nexperiments were conducted to analyze the variability, contextual similarity,\nfrequency distribution, and associativity of the generated data, along with\ntesting the framework's system feasibility. CTGKrEW also takes around 99\\% less\nCPU time and has a 33\\% smaller memory footprint than the conventional approach.\nFurthermore, we developed KrEW, a web application to facilitate the generation\nof realistic data containing skill-related information. This application,\navailable at https://riyasamanta.github.io/krew.html, is freely accessible to\nboth the general public and the research community.\n","authors":["Riya Samanta","Bidyut Saha","Soumya K. Ghosh","Sajal K. Das"],"pdf_url":"https://arxiv.org/pdf/2409.01628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01586v1","updated":"2024-09-03T03:59:22Z","published":"2024-09-03T03:59:22Z","title":"Booster: Tackling Harmful Fine-tuning for Large Language Models via\n Attenuating Harmful Perturbation","summary":" The harmful fine-tuning issue \\citep{qi2023fine} poses serious safety concerns\nfor Large language models' fine-tuning-as-a-service. While existing defenses\n\\citep{huang2024vaccine,rosati2024representation} have been proposed to\nmitigate the issue, their performances are still far away from satisfactory,\nand the root cause of the problem has not been fully uncovered. For the first\ntime in the literature, we in this paper show that \\textit{harmful\nperturbation} over the model weights should be the root cause of\nthe broken alignment caused by harmful fine-tuning. In order to attenuate the negative\nimpact of harmful perturbation, we propose an alignment-stage solution, dubbed\nBooster. Technically, along with the original alignment loss, we append a loss\nregularizer in the alignment stage's optimization. The regularizer ensures that\nthe model's harmful loss reduction before/after simulated harmful perturbation\nis attenuated, thereby mitigating the subsequent fine-tuning risk. Empirical\nresults show that Booster can effectively reduce the harmful score of the\nfine-tuned models while maintaining the performance of downstream tasks. Our\ncode is available at \\url{https://github.com/git-disl/Booster}.\n","authors":["Tiansheng Huang","Sihao Hu","Fatih Ilhan","Selim Furkan Tekin","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2409.01586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01584v1","updated":"2024-09-03T03:42:56Z","published":"2024-09-03T03:42:56Z","title":"Towards Cross-Lingual Explanation of Artwork in Large-scale Vision\n Language Models","summary":" As the performance of Large-scale Vision Language Models (LVLMs) improves,\nthey are increasingly capable of responding in multiple languages, and there is\nan expectation that the demand for explanations generated by LVLMs will grow.\nHowever, pre-training of Vision Encoder and the integrated training of LLMs\nwith Vision Encoder are mainly conducted using English training data, leaving\nit uncertain whether LVLMs can fully realize their potential when\ngenerating explanations in languages other than English. 
In addition,\nmultilingual QA benchmarks that create datasets using machine translation have\ncultural differences and biases, which remain issues for their use as evaluation tasks.\nTo address these challenges, this study created an extended dataset in multiple\nlanguages without relying on machine translation. This dataset, which takes into\naccount nuances and country-specific phrases, was then used to evaluate the\nexplanation generation abilities of LVLMs. Furthermore, this study examined\nwhether Instruction-Tuning in resource-rich English improves performance in\nother languages. Our findings indicate that LVLMs perform worse in languages\nother than English compared to English. In addition, it was observed that LVLMs\nstruggle to effectively manage the knowledge learned from English data.\n","authors":["Shintaro Ozaki","Kazuki Hayashi","Yusuke Sakai","Hidetaka Kamigaito","Katsuhiko Hayashi","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2409.01584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01579v1","updated":"2024-09-03T03:25:59Z","published":"2024-09-03T03:25:59Z","title":"AdaComp: Extractive Context Compression with Adaptive Predictor for\n Retrieval-Augmented Large Language Models","summary":" Retrieved documents containing noise will hinder RAG from detecting answer\nclues and make the inference process slow and expensive. Therefore, context\ncompression is necessary to enhance its accuracy and efficiency. Existing\ncontext compression methods use extractive or generative models to retain the\nmost query-relevant sentences or apply the information bottleneck theory to\npreserve sufficient information. However, these methods may face issues such as\nover-compression or high computational costs. We observe that the retriever\noften ranks relevant documents at the top, but the exact number of documents\nneeded to answer the query is uncertain due to the impact of query complexity\nand retrieval quality: complex queries like multi-hop questions may require\nretaining more documents than simpler queries, and a low-quality retrieval may\nneed to rely on more documents to generate accurate outputs. Therefore,\ndetermining the minimum number of required documents (compression rate) is\nstill a challenge for RAG. 
In this paper, we introduce AdaComp, a low-cost\nextractive context compression method that adaptively determines the\ncompression rate based on both query complexity and retrieval quality.\nSpecifically, we first annotate the minimum top-k documents necessary for the\nRAG system to answer the current query as the compression rate and then\nconstruct triplets of the query, retrieved documents, and its compression rate.\nThen, we use this triplet dataset to train a compression-rate predictor.\nExperiments on three QA datasets and one conversational Multi-doc QA dataset\nshow that AdaComp significantly reduces inference costs while maintaining\nperformance nearly identical to uncompressed models, achieving a balance\nbetween efficiency and performance.\n","authors":["Qianchi Zhang","Hainan Zhang","Liang Pang","Hongwei Zheng","Zhiming Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.01579v1.pdf","comment":"8 pages, 5 figures, code available at\n https://anonymous.4open.science/r/AdaComp-8C0C/"},{"id":"http://arxiv.org/abs/2409.01575v1","updated":"2024-09-03T03:16:03Z","published":"2024-09-03T03:16:03Z","title":"An Implementation of Werewolf Agent That does not Truly Trust LLMs","summary":" Werewolf is an incomplete information game, which has several challenges when\ncreating a computer agent as a player given the lack of understanding of the\nsituation and individuality of utterance (e.g., computer agents are not capable\nof characterful utterance or situational lying). We propose a werewolf agent\nthat solves some of those difficulties by combining a Large Language Model\n(LLM) and a rule-based algorithm. In particular, our agent uses a rule-based\nalgorithm to select an output either from an LLM or a template prepared\nbeforehand based on the results of analyzing conversation history using an LLM.\nIt allows the agent to refute in specific situations, identify when to end the\nconversation, and behave with persona. This approach mitigated conversational\ninconsistencies and facilitated logical utterance as a result. We also\nconducted a qualitative evaluation, which resulted in our agent being perceived\nas more human-like compared to an unmodified LLM. The agent is freely available\nto contribute to advancing research in the field of the Werewolf game.\n","authors":["Takehiro Sato","Shintaro Ozaki","Daisaku Yokoyama"],"pdf_url":"https://arxiv.org/pdf/2409.01575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01556v1","updated":"2024-09-03T02:50:04Z","published":"2024-09-03T02:50:04Z","title":"Benchmarking Cognitive Domains for LLMs: Insights from Taiwanese Hakka\n Culture","summary":" This study introduces a comprehensive benchmark designed to evaluate the\nperformance of large language models (LLMs) in understanding and processing\ncultural knowledge, with a specific focus on Hakka culture as a case study.\nLeveraging Bloom's Taxonomy, the study develops a multi-dimensional framework\nthat systematically assesses LLMs across six cognitive domains: Remembering,\nUnderstanding, Applying, Analyzing, Evaluating, and Creating. This benchmark\nextends beyond traditional single-dimensional evaluations by providing a deeper\nanalysis of LLMs' abilities to handle culturally specific content, ranging from\nbasic recall of facts to higher-order cognitive tasks such as creative\nsynthesis. 
Additionally, the study integrates Retrieval-Augmented Generation\n(RAG) technology to address the challenges of minority cultural knowledge\nrepresentation in LLMs, demonstrating how RAG enhances the models' performance\nby dynamically incorporating relevant external information. The results\nhighlight the effectiveness of RAG in improving accuracy across all cognitive\ndomains, particularly in tasks requiring precise retrieval and application of\ncultural knowledge. However, the findings also reveal the limitations of RAG in\ncreative tasks, underscoring the need for further optimization. This benchmark\nprovides a robust tool for evaluating and comparing LLMs in culturally diverse\ncontexts, offering valuable insights for future research and development in\nAI-driven cultural knowledge preservation and dissemination.\n","authors":["Chen-Chi Chang","Ching-Yuan Chen","Hung-Shin Lee","Chih-Cheng Lee"],"pdf_url":"https://arxiv.org/pdf/2409.01556v1.pdf","comment":"Submitted to O-COCOSDA 2024"},{"id":"http://arxiv.org/abs/2409.01552v1","updated":"2024-09-03T02:42:39Z","published":"2024-09-03T02:42:39Z","title":"Self-Instructed Derived Prompt Generation Meets In-Context Learning:\n Unlocking New Potential of Black-Box LLMs","summary":" Large language models (LLMs) have shown success in generating high-quality\nresponses. In order to achieve better alignment with LLMs with human\npreference, various works are proposed based on specific optimization process,\nwhich, however, is not suitable to Black-Box LLMs like GPT-4, due to\ninaccessible parameters. In Black-Box LLMs case, their performance is highly\ndependent on the quality of the provided prompts. Existing methods to enhance\nresponse quality often involve a prompt refinement model, yet these approaches\npotentially suffer from semantic inconsistencies between the refined and\noriginal prompts, and typically overlook the relationship between them. To\naddress these challenges, we introduce a self-instructed in-context learning\nframework that empowers LLMs to deliver more effective responses by generating\nreliable derived prompts to construct informative contextual environments. Our\napproach incorporates a self-instructed reinforcement learning mechanism,\nenabling direct interaction with the response model during derived prompt\ngeneration for better alignment. We then formulate querying as an in-context\nlearning task, using responses from LLMs combined with the derived prompts to\nestablish a contextual demonstration for the original prompt. This strategy\nensures alignment with the original query, reduces discrepancies from refined\nprompts, and maximizes the LLMs' in-context learning capability. 
Extensive\nexperiments demonstrate that the proposed method not only generates more\nreliable derived prompts but also significantly enhances LLMs' ability to\ndeliver more effective responses, including Black-Box models such as GPT-4.\n","authors":["Zhuo Li","Yuhao Du","Jinpeng Hu","Xiang Wan","Anningzhe Gao"],"pdf_url":"https://arxiv.org/pdf/2409.01552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01548v1","updated":"2024-09-03T02:37:34Z","published":"2024-09-03T02:37:34Z","title":"VoxHakka: A Dialectally Diverse Multi-speaker Text-to-Speech System for\n Taiwanese Hakka","summary":" This paper introduces VoxHakka, a text-to-speech (TTS) system designed for\nTaiwanese Hakka, a critically under-resourced language spoken in Taiwan.\nLeveraging the YourTTS framework, VoxHakka achieves high naturalness and\naccuracy and low real-time factor in speech synthesis while supporting six\ndistinct Hakka dialects. This is achieved by training the model with\ndialect-specific data, allowing for the generation of speaker-aware Hakka\nspeech. To address the scarcity of publicly available Hakka speech corpora, we\nemployed a cost-effective approach utilizing a web scraping pipeline coupled\nwith automatic speech recognition (ASR)-based data cleaning techniques. This\nprocess ensured the acquisition of a high-quality, multi-speaker, multi-dialect\ndataset suitable for TTS training. Subjective listening tests conducted using\ncomparative mean opinion scores (CMOS) demonstrate that VoxHakka significantly\noutperforms existing publicly available Hakka TTS systems in terms of\npronunciation accuracy, tone correctness, and overall naturalness. This work\nrepresents a significant advancement in Hakka language technology and provides\na valuable resource for language preservation and revitalization efforts.\n","authors":["Li-Wei Chen","Hung-Shin Lee","Chen-Chi Chang"],"pdf_url":"https://arxiv.org/pdf/2409.01548v1.pdf","comment":"Submitted to O-COCOSDA 2024"},{"id":"http://arxiv.org/abs/2409.01545v1","updated":"2024-09-03T02:29:01Z","published":"2024-09-03T02:29:01Z","title":"Effective Noise-aware Data Simulation for Domain-adaptive Speech\n Enhancement Leveraging Dynamic Stochastic Perturbation","summary":" Cross-domain speech enhancement (SE) is often faced with severe challenges\ndue to the scarcity of noise and background information in an unseen target\ndomain, leading to a mismatch between training and test conditions. This study\nputs forward a novel data simulation method to address this issue, leveraging\nnoise-extractive techniques and generative adversarial networks (GANs) with\nonly limited target noisy speech data. Notably, our method employs a noise\nencoder to extract noise embeddings from target-domain data. These embeddings\naptly guide the generator to synthesize utterances acoustically fitted to the\ntarget domain while authentically preserving the phonetic content of the input\nclean speech. Furthermore, we introduce the notion of dynamic stochastic\nperturbation, which can inject controlled perturbations into the noise\nembeddings during inference, thereby enabling the model to generalize well to\nunseen noise conditions. 
Experiments on the VoiceBank-DEMAND benchmark dataset\ndemonstrate that our domain-adaptive SE method outperforms an existing strong\nbaseline based on data simulation.\n","authors":["Chien-Chun Wang","Li-Wei Chen","Hung-Shin Lee","Berlin Chen","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2409.01545v1.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2409.01539v1","updated":"2024-09-03T02:15:34Z","published":"2024-09-03T02:15:34Z","title":"It is Time to Develop an Auditing Framework to Promote Value Aware\n Chatbots","summary":" The launch of ChatGPT in November 2022 marked the beginning of a new era in\nAI, the availability of generative AI tools for everyone to use. ChatGPT and\nother similar chatbots boast a wide range of capabilities from answering\nstudent homework questions to creating music and art. Given the large amounts\nof human data chatbots are built on, it is inevitable that they will inherit\nhuman errors and biases. These biases have the potential to inflict significant\nharm or increase inequity on different subpopulations. Because chatbots do not\nhave an inherent understanding of societal values, they may create new content\nthat is contrary to established norms. Examples of concerning generated content\nincludes child pornography, inaccurate facts, and discriminatory posts. In this\nposition paper, we argue that the speed of advancement of this technology\nrequires us, as computer and data scientists, to mobilize and develop a\nvalues-based auditing framework containing a community established standard set\nof measurements to monitor the health of different chatbots and LLMs. To\nsupport our argument, we use a simple audit template to share the results of\nbasic audits we conduct that are focused on measuring potential bias in search\nengine style tasks, code generation, and story generation. We identify\nresponses from GPT 3.5 and GPT 4 that are both consistent and not consistent\nwith values derived from existing law. While the findings come as no surprise,\nthey do underscore the urgency of developing a robust auditing framework for\nopenly sharing results in a consistent way so that mitigation strategies can be\ndeveloped by the academic community, government agencies, and companies when\nour values are not being adhered to. We conclude this paper with\nrecommendations for value-based strategies for improving the technologies.\n","authors":["Yanchen Wang","Lisa Singh"],"pdf_url":"https://arxiv.org/pdf/2409.01539v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2306.07500"},{"id":"http://arxiv.org/abs/2409.01524v1","updated":"2024-09-03T01:40:21Z","published":"2024-09-03T01:40:21Z","title":"S$^3$c-Math: Spontaneous Step-level Self-correction Makes Large Language\n Models Better Mathematical Reasoners","summary":" Self-correction is a novel method that can stimulate the potential reasoning\nabilities of large language models (LLMs). It involves detecting and correcting\nerrors during the inference process when LLMs solve reasoning problems.\nHowever, recent works do not regard self-correction as a spontaneous and\nintrinsic capability of LLMs. Instead, such correction is achieved through\npost-hoc generation, external knowledge introduction, multi-model\ncollaboration, and similar techniques. In this paper, we propose a series of\nmathematical LLMs called S$^3$c-Math, which are able to perform Spontaneous\nStep-level Self-correction for Mathematical reasoning. 
This capability helps\nLLMs to recognize whether their ongoing inference tends to contain errors and\nsimultaneously correct these errors to produce a more reliable response. We\nproposed a method, which employs a step-level sampling approach to construct\nstep-wise self-correction data for achieving such ability. Additionally, we\nimplement a training strategy that uses above constructed data to equip LLMs\nwith spontaneous step-level self-correction capacities. Our data and methods\nhave been demonstrated to be effective across various foundation LLMs,\nconsistently showing significant progress in evaluations on GSM8K, MATH, and\nother mathematical benchmarks. To the best of our knowledge, we are the first\nto introduce the spontaneous step-level self-correction ability of LLMs in\nmathematical reasoning.\n","authors":["Yuchen Yan","Jin Jiang","Yang Liu","Yixin Cao","Xin Xu","Mengdi zhang","Xunliang Cai","Jian Shao"],"pdf_url":"https://arxiv.org/pdf/2409.01524v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.17422v2","updated":"2024-09-03T16:00:58Z","published":"2024-08-30T17:12:14Z","title":"Open-vocabulary Temporal Action Localization using VLMs","summary":" Video action localization aims to find timings of a specific action from a\nlong video. Although existing learning-based approaches have been successful,\nthose require annotating videos that come with a considerable labor cost. This\npaper proposes a learning-free, open-vocabulary approach based on emerging\noff-the-shelf vision-language models (VLM). The challenge stems from the fact\nthat VLMs are neither designed to process long videos nor tailored for finding\nactions. We overcome these problems by extending an iterative visual prompting\ntechnique. Specifically, we sample video frames into a concatenated image with\nframe index labels, making a VLM guess a frame that is considered to be closest\nto the start/end of the action. Iterating this process by narrowing a sampling\ntime window results in finding a specific frame of start and end of an action.\nWe demonstrate that this sampling technique yields reasonable results,\nillustrating a practical extension of VLMs for understanding videos. A sample\ncode is available at\nhttps://microsoft.github.io/VLM-Video-Action-Localization/.\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2408.17422v2.pdf","comment":"7 pages, 5 figures, 4 tables. Last updated on September 3rd, 2024"},{"id":"http://arxiv.org/abs/2408.14013v2","updated":"2024-09-03T15:34:09Z","published":"2024-08-26T04:36:10Z","title":"A Multiscale Gradient Fusion Method for Edge Detection in Color Images\n Utilizing the CBM3D Filter","summary":" In this paper, a color edge detection strategy based on collaborative\nfiltering combined with multiscale gradient fusion is proposed. The\nblock-matching and 3D (BM3D) filter are used to enhance the sparse\nrepresentation in the transform domain and achieve the effect of denoising,\nwhereas the multiscale gradient fusion makes up for the defect of loss of\ndetails in single-scale edge detection and improves the edge detection\nresolution and quality. First, the RGB images in the dataset are converted to\nXYZ color space images through mathematical operations. Second, the colored\nblock-matching and 3D (CBM3D) filter are used on the sparse images and to\nremove noise interference. 
Then, the vector gradients of the color image and\nthe anisotropic Gaussian directional derivative of the two scale parameters are\ncalculated and averaged pixel-by-pixel to obtain a new edge strength map.\nFinally, the edge features are enhanced by image normalization and non-maximum\nsuppression technology, and on that basis, the edge contour is obtained by\ndouble threshold selection and a new morphological refinement method. Through\nan experimental analysis of the edge detection dataset, the method proposed has\ngood noise robustness and high edge quality, which is better than the Color\nSobel, Color Canny, SE and Color AGDD as shown by the PR curve, AUC, PSNR, MSE,\nand FOM indicators.\n","authors":["Zhuoyue Wang","Yiyi Tao","Danqing Ma","Jiajing Chen"],"pdf_url":"https://arxiv.org/pdf/2408.14013v2.pdf","comment":"1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2408.16395v2","updated":"2024-09-03T14:57:18Z","published":"2024-08-29T09:57:55Z","title":"IBO: Inpainting-Based Occlusion to Enhance Explainable Artificial\n Intelligence Evaluation in Histopathology","summary":" Histopathological image analysis is crucial for accurate cancer diagnosis and\ntreatment planning. While deep learning models, especially convolutional neural\nnetworks, have advanced this field, their \"black-box\" nature raises concerns\nabout interpretability and trustworthiness. Explainable Artificial Intelligence\n(XAI) techniques aim to address these concerns, but evaluating their\neffectiveness remains challenging. A significant issue with current\nocclusion-based XAI methods is that they often generate Out-of-Distribution\n(OoD) samples, leading to inaccurate evaluations. In this paper, we introduce\nInpainting-Based Occlusion (IBO), a novel occlusion strategy that utilizes a\nDenoising Diffusion Probabilistic Model to inpaint occluded regions in\nhistopathological images. By replacing cancerous areas with realistic,\nnon-cancerous tissue, IBO minimizes OoD artifacts and preserves data integrity.\nWe evaluate our method on the CAMELYON16 dataset through two phases: first, by\nassessing perceptual similarity using the Learned Perceptual Image Patch\nSimilarity (LPIPS) metric, and second, by quantifying the impact on model\npredictions through Area Under the Curve (AUC) analysis. Our results\ndemonstrate that IBO significantly improves perceptual fidelity, achieving\nnearly twice the improvement in LPIPS scores compared to the best existing\nocclusion strategy. Additionally, IBO increased the precision of XAI\nperformance prediction from 42% to 71% compared to traditional methods. These\nresults demonstrate IBO's potential to provide more reliable evaluations of XAI\ntechniques, benefiting histopathology and other applications. The source code\nfor this study is available at https://github.com/a-fsh-r/IBO.\n","authors":["Pardis Afshar","Sajjad Hashembeiki","Pouya Khani","Emad Fatemizadeh","Mohammad Hossein Rohban"],"pdf_url":"https://arxiv.org/pdf/2408.16395v2.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.19477v2","updated":"2024-09-03T14:24:49Z","published":"2024-02-29T18:59:31Z","title":"Learning a Generalized Physical Face Model From Data","summary":" Physically-based simulation is a powerful approach for 3D facial animation as\nthe resulting deformations are governed by physical constraints, allowing to\neasily resolve self-collisions, respond to external forces and perform\nrealistic anatomy edits. 
Today's methods are data-driven, where the actuations\nfor finite elements are inferred from captured skin geometry. Unfortunately,\nthese approaches have not been widely adopted due to the complexity of\ninitializing the material space and learning the deformation model for each\ncharacter separately, which often requires a skilled artist followed by lengthy\nnetwork training. In this work, we aim to make physics-based facial animation\nmore accessible by proposing a generalized physical face model that we learn\nfrom a large 3D face dataset. Once trained, our model can be quickly fit to any\nunseen identity and produce a ready-to-animate physical face model\nautomatically. Fitting is as easy as providing a single 3D face scan, or even a\nsingle face image. After fitting, we offer intuitive animation controls, as\nwell as the ability to retarget animations across characters. All the while,\nthe resulting animations allow for physical effects like collision avoidance,\ngravity, paralysis, bone reshaping and more.\n","authors":["Lingchen Yang","Gaspard Zoss","Prashanth Chandran","Markus Gross","Barbara Solenthaler","Eftychios Sifakis","Derek Bradley"],"pdf_url":"https://arxiv.org/pdf/2402.19477v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08557v3","updated":"2024-09-03T13:40:28Z","published":"2024-03-13T14:08:45Z","title":"$OC^4-ReID$: Occluded Cloth-Changing Person Re-Identification","summary":" The study of Cloth-Changing Person Re-identification (CC-ReID) focuses on\nretrieving specific pedestrians when their clothing has changed, typically\nunder the assumption that the entire pedestrian images are visible. Pedestrian\nimages in real-world scenarios, however, are often partially obscured by\nobstacles, presenting a significant challenge to existing CC-ReID systems. In\nthis paper, we introduce a more challenging task termed Occluded Cloth-Changing\nPerson Re-Identification ($OC^4-ReID$), which simultaneously addresses two\nchallenges of clothing changes and occlusion. Concretely, we construct two new\ndatasets, Occ-LTCC and Occ-PRCC, based on original CC-ReID datasets to include\nrandom occlusions of key pedestrians components (e.g., head, torso). Moreover,\na novel benchmark is proposed for $OC^4-ReID$ incorporating a Train-Test Micro\nGranularity Screening ($T^2MGS$) module to mitigate the influence of occlusion\nand proposing a Part-Robust Triplet (PRT) loss for partial features learning.\nComprehensive experiments on the proposed datasets, as well as on two CC-ReID\nbenchmark datasets demonstrate the superior performance of proposed method\nagainst other state-of-the-art methods. 
The codes and datasets are available\nat: https://github.com/1024AILab/OC4-ReID.\n","authors":["Zhihao Chen","Yiyuan Ge","Ziyang Wang","Jiaju Kang","Mingya Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.08557v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12587v2","updated":"2024-09-03T13:36:22Z","published":"2024-06-18T13:18:32Z","title":"Restorer: Removing Multi-Degradation with All-Axis Attention and Prompt\n Guidance","summary":" There are many excellent solutions in image restoration.However, most methods\nrequire on training separate models to restore images with different types of\ndegradation.Although existing all-in-one models effectively address multiple\ntypes of degradation simultaneously, their performance in real-world scenarios\nis still constrained by the task confusion problem.In this work, we attempt to\naddress this issue by introducing \\textbf{Restorer}, a novel Transformer-based\nall-in-one image restoration model.To effectively address the complex\ndegradation present in real-world images, we propose All-Axis Attention (AAA),\na mechanism that simultaneously models long-range dependencies across both\nspatial and channel dimensions, capturing potential correlations along all\naxes.Additionally, we introduce textual prompts in Restorer to incorporate\nexplicit task priors, enabling the removal of specific degradation types based\non user instructions. By iterating over these prompts, Restorer can handle\ncomposite degradation in real-world scenarios without requiring additional\ntraining.Based on these designs, Restorer with one set of parameters\ndemonstrates state-of-the-art performance in multiple image restoration tasks\ncompared to existing all-in-one and even single-task models.Additionally,\nRestorer is efficient during inference, suggesting the potential in real-world\napplications.\n","authors":["Jiawei Mao","Juncheng Wu","Yuyin Zhou","Xuesong Yin","Yuanqi Chang"],"pdf_url":"https://arxiv.org/pdf/2406.12587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17147v4","updated":"2024-09-03T12:55:47Z","published":"2024-04-26T04:34:45Z","title":"On the Federated Learning Framework for Cooperative Perception","summary":" Cooperative perception is essential to enhance the efficiency and safety of\nfuture transportation systems, requiring extensive data sharing among vehicles\non the road, which raises significant privacy concerns. Federated learning\noffers a promising solution by enabling data privacy-preserving collaborative\nenhancements in perception, decision-making, and planning among connected and\nautonomous vehicles (CAVs). However, federated learning is impeded by\nsignificant challenges arising from data heterogeneity across diverse clients,\npotentially diminishing model accuracy and prolonging convergence periods. This\nstudy introduces a specialized federated learning framework for CP, termed the\nfederated dynamic weighted aggregation (FedDWA) algorithm, facilitated by\ndynamic adjusting loss (DALoss) function. This framework employs dynamic client\nweighting to direct model convergence and integrates a novel loss function that\nutilizes Kullback-Leibler divergence (KLD) to counteract the detrimental\neffects of non-independently and identically distributed (Non-IID) and\nunbalanced data. Utilizing the BEV transformer as the primary model, our\nrigorous testing on the OpenV2V dataset, augmented with FedBEVT data,\ndemonstrates significant improvements in the average intersection over union\n(IoU). 
These results highlight the substantial potential of our federated\nlearning framework to address data heterogeneity challenges in CP, thereby\nenhancing the accuracy of environmental perception models and facilitating more\nrobust and efficient collaborative learning solutions in the transportation\nsector.\n","authors":["Zhenrong Zhang","Jianan Liu","Xi Zhou","Tao Huang","Qing-Long Han","Jingxin Liu","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.17147v4.pdf","comment":"accepted by IEEE RA-L"},{"id":"http://arxiv.org/abs/2407.09510v3","updated":"2024-09-03T11:54:52Z","published":"2024-06-17T11:43:38Z","title":"3DGS.zip: A survey on 3D Gaussian Splatting Compression Methods","summary":" We present a work-in-progress survey on 3D Gaussian Splatting compression\nmethods, focusing on their statistical performance across various benchmarks.\nThis survey aims to facilitate comparability by summarizing key statistics of\ndifferent compression approaches in a tabulated format. The datasets evaluated\ninclude TanksAndTemples, MipNeRF360, DeepBlending, and SyntheticNeRF. For each\nmethod, we report the Peak Signal-to-Noise Ratio (PSNR), Structural Similarity\nIndex (SSIM), Learned Perceptual Image Patch Similarity (LPIPS), and the\nresultant size in megabytes (MB), as provided by the respective authors. This\nis an ongoing, open project, and we invite contributions from the research\ncommunity as GitHub issues or pull requests. Please visit\nhttp://w-m.github.io/3dgs-compression-survey/ for more information and a\nsortable version of the table.\n","authors":["Milena T. Bagdasarian","Paul Knoll","Florian Barthel","Anna Hilsmann","Peter Eisert","Wieland Morgenstern"],"pdf_url":"https://arxiv.org/pdf/2407.09510v3.pdf","comment":"3D Gaussian Splatting compression survey; 3DGS compression; new\n approaches added"},{"id":"http://arxiv.org/abs/2407.07805v3","updated":"2024-09-03T10:46:10Z","published":"2024-07-10T16:25:26Z","title":"SUMix: Mixup with Semantic and Uncertain Information","summary":" Mixup data augmentation approaches have been applied for various tasks of\ndeep learning to improve the generalization ability of deep neural networks.\nSome existing approaches CutMix, SaliencyMix, etc. randomly replace a patch in\none image with patches from another to generate the mixed image. Similarly, the\ncorresponding labels are linearly combined by a fixed ratio $\\lambda$ by l. The\nobjects in two images may be overlapped during the mixing process, so some\nsemantic information is corrupted in the mixed samples. In this case, the mixed\nimage does not match the mixed label information. Besides, such a label may\nmislead the deep learning model training, which results in poor performance. To\nsolve this problem, we proposed a novel approach named SUMix to learn the\nmixing ratio as well as the uncertainty for the mixed samples during the\ntraining process. First, we design a learnable similarity function to compute\nan accurate mix ratio. Second, an approach is investigated as a regularized\nterm to model the uncertainty of the mixed samples. We conduct experiments on\nfive image benchmarks, and extensive experimental results imply that our method\nis capable of improving the performance of classifiers with different\ncutting-based mixup approaches. The source code is available at\nhttps://github.com/JinXins/SUMix.\n","authors":["Huafeng Qin","Xin Jin","Hongyu Zhu","Hongchao Liao","Mounîm A. 
El-Yacoubi","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2407.07805v3.pdf","comment":"Accepted by ECCV2024 [Camera Ready] (19 pages, 7 figures) with the\n source code at https://github.com/JinXins/SUMix"},{"id":"http://arxiv.org/abs/2404.12501v3","updated":"2024-09-03T10:12:34Z","published":"2024-04-18T20:43:33Z","title":"SPIdepth: Strengthened Pose Information for Self-supervised Monocular\n Depth Estimation","summary":" Self-supervised monocular depth estimation has garnered considerable\nattention for its applications in autonomous driving and robotics. While recent\nmethods have made strides in leveraging techniques like the Self Query Layer\n(SQL) to infer depth from motion, they often overlook the potential of\nstrengthening pose information. In this paper, we introduce SPIdepth, a novel\napproach that prioritizes enhancing the pose network for improved depth\nestimation. Building upon the foundation laid by SQL, SPIdepth emphasizes the\nimportance of pose information in capturing fine-grained scene structures. By\nenhancing the pose network's capabilities, SPIdepth achieves remarkable\nadvancements in scene understanding and depth estimation. Experimental results\non benchmark datasets such as KITTI, Cityscapes, and Make3D showcase SPIdepth's\nstate-of-the-art performance, surpassing previous methods by significant\nmargins. Specifically, SPIdepth tops the self-supervised KITTI benchmark.\nAdditionally, SPIdepth achieves the lowest AbsRel (0.029), SqRel (0.069), and\nRMSE (1.394) on KITTI, establishing new state-of-the-art results. On\nCityscapes, SPIdepth shows improvements over SQLdepth of 21.7% in AbsRel, 36.8%\nin SqRel, and 16.5% in RMSE, even without using motion masks. On Make3D,\nSPIdepth in zero-shot outperforms all other models. Remarkably, SPIdepth\nachieves these results using only a single image for inference, surpassing even\nmethods that utilize video sequences for inference, thus demonstrating its\nefficacy and efficiency in real-world applications. Our approach represents a\nsignificant leap forward in self-supervised monocular depth estimation,\nunderscoring the importance of strengthening pose information for advancing\nscene understanding in real-world applications. The code and pre-trained models\nare publicly available at https://github.com/Lavreniuk/SPIdepth.\n","authors":["Mykola Lavreniuk"],"pdf_url":"https://arxiv.org/pdf/2404.12501v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15656v2","updated":"2024-09-03T09:45:59Z","published":"2024-08-28T09:17:25Z","title":"Realigned Softmax Warping for Deep Metric Learning","summary":" Deep Metric Learning (DML) loss functions traditionally aim to control the\nforces of separability and compactness within an embedding space so that the\nsame class data points are pulled together and different class ones are pushed\napart. Within the context of DML, a softmax operation will typically normalize\ndistances into a probability for optimization, thus coupling all the push/pull\nforces together. This paper proposes a potential new class of loss functions\nthat operate within a euclidean domain and aim to take full advantage of the\ncoupled forces governing embedding space formation under a softmax. These\nforces of compactness and separability can be boosted or mitigated within\ncontrolled locations at will by using a warping function. 
In this work, we\nprovide a simple example of a warping function and use it to achieve\ncompetitive, state-of-the-art results on various metric learning benchmarks.\n","authors":["Michael G. DeMoor","John J. Prevost"],"pdf_url":"https://arxiv.org/pdf/2408.15656v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2407.19430v2","updated":"2024-09-03T09:42:24Z","published":"2024-07-28T08:43:16Z","title":"Progressive Domain Adaptation for Thermal Infrared Object Tracking","summary":" Due to the lack of large-scale labeled Thermal InfraRed (TIR) training\ndatasets, most existing TIR trackers are trained directly on RGB datasets.\nHowever, tracking methods trained on RGB datasets suffer a significant drop-off\nin TIR data due to the domain shift issue. To this end, in this work, we\npropose a Progressive Domain Adaptation framework for TIR Tracking (PDAT),\nwhich transfers useful knowledge learned from RGB tracking to TIR tracking. The\nframework makes full use of large-scale labeled RGB datasets without requiring\ntime-consuming and labor-intensive labeling of large-scale TIR data.\nSpecifically, we first propose an adversarial-based global domain adaptation\nmodule to reduce domain gap on the feature level coarsely. Second, we design a\nclustering-based subdomain adaptation method to further align the feature\ndistributions of the RGB and TIR datasets finely. These two domain adaptation\nmodules gradually eliminate the discrepancy between the two domains, and thus\nlearn domain-invariant fine-grained features through progressive training.\nAdditionally, we collect a largescale TIR dataset with over 1.48 million\nunlabeled TIR images for training the proposed domain adaptation framework.\nExperimental results on five TIR tracking benchmarks show that the proposed\nmethod gains a nearly 6% success rate, demonstrating its effectiveness.\n","authors":["Qiao Li","Kanlun Tan","Qiao Liu","Di Yuan","Xin Li","Yunpeng Liu"],"pdf_url":"https://arxiv.org/pdf/2407.19430v2.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.08174v4","updated":"2024-09-03T09:16:03Z","published":"2024-01-16T07:33:22Z","title":"An Efficient Instance Segmentation Framework Using Segmentation\n Foundation Models with Oriented Bounding Box Prompts","summary":" Instance segmentation in unmanned aerial vehicle measurement is a\nlong-standing challenge. Since horizontal bounding boxes introduce many\ninterference objects, oriented bounding boxes (OBBs) are usually used for\ninstance identification. However, based on ``segmentation within bounding box''\nparadigm, current instance segmentation methods using OBBs are overly dependent\non bounding box detection performance. To tackle this, this paper proposes\nOBSeg, an efficient instance segmentation framework using OBBs. OBSeg is based\non box prompt-based segmentation foundation models (BSMs), e.g., Segment\nAnything Model. Specifically, OBSeg first detects OBBs to distinguish instances\nand provide coarse localization information. Then, it predicts OBB\nprompt-related masks for fine segmentation. Since OBBs only serve as prompts,\nOBSeg alleviates the over-dependence on bounding box detection performance of\ncurrent instance segmentation methods using OBBs. In addition, to enable BSMs\nto handle OBB prompts, we propose a novel OBB prompt encoder. 
To make OBSeg\nmore lightweight and further improve the performance of lightweight distilled\nBSMs, a Gaussian smoothing-based knowledge distillation method is introduced.\nExperiments demonstrate that OBSeg outperforms current instance segmentation\nmethods on multiple public datasets. The code is available at\nhttps://github.com/zhen6618/OBBInstanceSegmentation.\n","authors":["Zhen Zhou","Junfeng Fan","Yunkai Ma","Sihan Zhao","Fengshui Jing","Min Tan"],"pdf_url":"https://arxiv.org/pdf/2401.08174v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02092v2","updated":"2024-09-03T08:25:11Z","published":"2023-06-03T11:50:44Z","title":"Collaborative Group: Composed Image Retrieval via Consensus Learning\n from Noisy Annotations","summary":" Composed image retrieval extends content-based image retrieval systems by\nenabling users to search using reference images and captions that describe\ntheir intention. Despite great progress in developing image-text compositors to\nextract discriminative visual-linguistic features, we identify a hitherto\noverlooked issue, triplet ambiguity, which impedes robust feature extraction.\nTriplet ambiguity refers to a type of semantic ambiguity that arises between\nthe reference image, the relative caption, and the target image. It is mainly\ndue to the limited representation of the annotated text, resulting in many\nnoisy triplets where multiple visually dissimilar candidate images can be\nmatched to an identical reference pair (i.e., a reference image + a relative\ncaption). To address this challenge, we propose the Consensus Network\n(Css-Net), inspired by the psychological concept that groups outperform\nindividuals. Css-Net comprises two core components: (1) a consensus module with\nfour diverse compositors, each generating distinct image-text embeddings,\nfostering complementary feature extraction and mitigating dependence on any\nsingle, potentially biased compositor; (2) a Kullback-Leibler divergence loss\nthat encourages learning of inter-compositor interactions to promote consensual\noutputs. During evaluation, the decisions of the four compositors are combined\nthrough a weighting scheme, enhancing overall agreement. On benchmark datasets,\nparticularly FashionIQ, Css-Net demonstrates marked improvements. Notably, it\nachieves significant recall gains, with a 2.77% increase in R@10 and 6.67%\nboost in R@50, underscoring its competitiveness in addressing the fundamental\nlimitations of existing methods.\n","authors":["Xu Zhang","Zhedong Zheng","Linchao Zhu","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2306.02092v2.pdf","comment":"Accepted by Knowledge-Based Systems (KBS)"},{"id":"http://arxiv.org/abs/2311.18825v2","updated":"2024-09-03T08:16:32Z","published":"2023-11-30T18:58:51Z","title":"CAST: Cross-Attention in Space and Time for Video Action Recognition","summary":" Recognizing human actions in videos requires spatial and temporal\nunderstanding. Most existing action recognition models lack a balanced\nspatio-temporal understanding of videos. In this work, we propose a novel\ntwo-stream architecture, called Cross-Attention in Space and Time (CAST), that\nachieves a balanced spatio-temporal understanding of videos using only RGB\ninput. Our proposed bottleneck cross-attention mechanism enables the spatial\nand temporal expert models to exchange information and make synergistic\npredictions, leading to improved performance. 
We validate the proposed method\nwith extensive experiments on public benchmarks with different characteristics:\nEPIC-KITCHENS-100, Something-Something-V2, and Kinetics-400. Our method\nconsistently shows favorable performance across these datasets, while the\nperformance of existing methods fluctuates depending on the dataset\ncharacteristics.\n","authors":["Dongho Lee","Jongseo Lee","Jinwoo Choi"],"pdf_url":"https://arxiv.org/pdf/2311.18825v2.pdf","comment":"This is an accepted NeurIPS 2023. Project webpage is available at\n https://jong980812.github.io/CAST.github.io/ Code is available at\n https://github.com/KHU-VLL/CAST"},{"id":"http://arxiv.org/abs/2312.05449v2","updated":"2024-09-03T08:01:47Z","published":"2023-12-09T03:33:14Z","title":"TALDS-Net: Task-Aware Adaptive Local Descriptors Selection for Few-shot\n Image Classification","summary":" Few-shot image classification aims to classify images from unseen novel\nclasses with few samples. Recent works demonstrate that deep local descriptors\nexhibit enhanced representational capabilities compared to image-level\nfeatures. However, most existing methods solely rely on either employing all\nlocal descriptors or directly utilizing partial descriptors, potentially\nresulting in the loss of crucial information. Moreover, these methods primarily\nemphasize the selection of query descriptors while overlooking support\ndescriptors. In this paper, we propose a novel Task-Aware Adaptive Local\nDescriptors Selection Network (TALDS-Net), which exhibits the capacity for\nadaptive selection of task-aware support descriptors and query descriptors.\nSpecifically, we compare the similarity of each local support descriptor with\nother local support descriptors to obtain the optimal support descriptor subset\nand then compare the query descriptors with the optimal support subset to\nobtain discriminative query descriptors. Extensive experiments demonstrate that\nour TALDS-Net outperforms state-of-the-art methods on both general and\nfine-grained datasets.\n","authors":["Qian Qiao","Yu Xie","Ziyin Zeng","Fanzhang Li"],"pdf_url":"https://arxiv.org/pdf/2312.05449v2.pdf","comment":"4 pages, 1 figures, is accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2307.10593v2","updated":"2024-09-03T07:50:05Z","published":"2023-07-20T05:15:03Z","title":"Asynchronous Blob Tracker for Event Cameras","summary":" Event-based cameras are popular for tracking fast-moving objects due to their\nhigh temporal resolution, low latency, and high dynamic range. In this paper,\nwe propose a novel algorithm for tracking event blobs using raw events\nasynchronously in real time. We introduce the concept of an event blob as a\nspatio-temporal likelihood of event occurrence where the conditional spatial\nlikelihood is blob-like. Many real-world objects such as car headlights or any\nquickly moving foreground objects generate event blob data. The proposed\nalgorithm uses a nearest neighbour classifier with a dynamic threshold criteria\nfor data association coupled with an extended Kalman filter to track the event\nblob state. Our algorithm achieves highly accurate blob tracking, velocity\nestimation, and shape estimation even under challenging lighting conditions and\nhigh-speed motions (> 11000 pixels/s). 
The microsecond time resolution achieved\nmeans that the filter output can be used to derive secondary information such\nas time-to-contact or range estimation, that will enable applications to\nreal-world problems such as collision avoidance in autonomous driving.\n","authors":["Ziwei Wang","Timothy Molloy","Pieter van Goor","Robert Mahony"],"pdf_url":"https://arxiv.org/pdf/2307.10593v2.pdf","comment":"18 pages, 16 figures, Manuscript was accepted on August 7, 2024, by\n IEEE Transactions on Robotics"},{"id":"http://arxiv.org/abs/2405.09777v2","updated":"2024-09-03T07:46:17Z","published":"2024-05-16T02:46:19Z","title":"Rethinking Barely-Supervised Volumetric Medical Image Segmentation from\n an Unsupervised Domain Adaptation Perspective","summary":" This paper investigates an extremely challenging problem: barely-supervised\nvolumetric medical image segmentation (BSS). A BSS training dataset consists of\ntwo parts: 1) a barely-annotated labeled set, where each labeled image contains\nonly a single-slice annotation, and 2) an unlabeled set comprising numerous\nunlabeled volumetric images. State-of-the-art BSS methods employ a\nregistration-based paradigm, which uses inter-slice image registration to\npropagate single-slice annotations into volumetric pseudo labels, constructing\na completely annotated labeled set, to which a semi-supervised segmentation\nscheme can be applied. However, the paradigm has a critical limitation: the\npseudo-labels generated by image registration are unreliable and noisy.\nMotivated by this, we propose a new perspective: instead of solving BSS within\na semi-supervised learning scheme, this work formulates BSS as an unsupervised\ndomain adaptation problem. To this end, we propose a novel BSS framework,\n\\textbf{B}arely-supervised learning \\textbf{via} unsupervised domain\n\\textbf{A}daptation (BvA), as an alternative to the dominant registration\nparadigm. Specifically, we first design a novel noise-free labeled data\nconstruction algorithm (NFC) for slice-to-volume labeled data synthesis. Then,\nwe introduce a frequency and spatial Mix-Up strategy (FSX) to mitigate the\ndomain shifts. Extensive experiments demonstrate that our method provides a\npromising alternative for BSS. Remarkably, the proposed method, trained on the\nleft atrial segmentation dataset with \\textbf{only one} barely-labeled image,\nachieves a Dice score of 81.20%, outperforming the state-of-the-art by 61.71%.\nThe code is available at\n\\href{https://github.com/Senyh/BvA}{\\textit{\\texttt{https://github.com/Senyh/BvA}}}.\n","authors":["Zhiqiang Shen","Peng Cao","Junming Su","Jinzhu Yang","Osmar R. Zaiane"],"pdf_url":"https://arxiv.org/pdf/2405.09777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08822v2","updated":"2024-09-03T07:42:44Z","published":"2023-12-14T11:11:50Z","title":"Planning and Rendering: Towards Product Poster Generation with Diffusion\n Models","summary":" Product poster generation significantly optimizes design efficiency and\nreduces production costs. Prevailing methods predominantly rely on\nimage-inpainting methods to generate clean background images for given\nproducts. Subsequently, poster layout generation methods are employed to\nproduce corresponding layout results. However, the background images may not be\nsuitable for accommodating textual content due to their complexity, and the\nfixed location of products limits the diversity of layout results. 
To alleviate\nthese issues, we propose a novel product poster generation framework based on\ndiffusion models named P\\&R. The P\\&R draws inspiration from the workflow of\ndesigners in creating posters, which consists of two stages: Planning and\nRendering. At the planning stage, we propose a PlanNet to generate the layout\nof the product and other visual components considering both the appearance\nfeatures of the product and semantic features of the text, which improves the\ndiversity and rationality of the layouts. At the rendering stage, we propose a\nRenderNet to generate the background for the product while considering the\ngenerated layout, where a spatial fusion module is introduced to fuse the\nlayout of different visual components. To foster the advancement of this field,\nwe propose the first product poster generation dataset PPG30k, comprising 30k\nexquisite product poster images along with comprehensive image and text\nannotations. Our method outperforms the state-of-the-art product poster\ngeneration methods on PPG30k. The PPG30k will be released soon.\n","authors":["Zhaochen Li","Fengheng Li","Wei Feng","Honghe Zhu","Yaoyu Li","Zheng Zhang","Jingjing Lv","Junjie Shen","Zhangang Lin","Jingping Shao","Zhenglu Yang"],"pdf_url":"https://arxiv.org/pdf/2312.08822v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11835v3","updated":"2024-09-03T07:39:39Z","published":"2024-01-22T10:52:02Z","title":"Unveiling the Human-like Similarities of Automatic Facial Expression\n Recognition: An Empirical Exploration through Explainable AI","summary":" Facial expression recognition is vital for human behavior analysis, and deep\nlearning has enabled models that can outperform humans. However, it is unclear\nhow closely they mimic human processing. This study aims to explore the\nsimilarity between deep neural networks and human perception by comparing\ntwelve different networks, including both general object classifiers and\nFER-specific models. We employ an innovative global explainable AI method to\ngenerate heatmaps, revealing crucial facial regions for the twelve networks\ntrained on six facial expressions. We assess these results both quantitatively\nand qualitatively, comparing them to ground truth masks based on Friesen and\nEkman's description and among them. We use Intersection over Union (IoU) and\nnormalized correlation coefficients for comparisons. We generate 72 heatmaps to\nhighlight critical regions for each expression and architecture. Qualitatively,\nmodels with pre-trained weights show more similarity in heatmaps compared to\nthose without pre-training. Specifically, eye and nose areas influence certain\nfacial expressions, while the mouth is consistently important across all models\nand expressions. Quantitatively, we find low average IoU values (avg. 0.2702)\nacross all expressions and architectures. The best-performing architecture\naverages 0.3269, while the worst-performing one averages 0.2066. Dendrograms,\nbuilt with the normalized correlation coefficient, reveal two main clusters for\nmost expressions: models with pre-training and models without pre-training.\nFindings suggest limited alignment between human and AI facial expression\nrecognition, with network architectures influencing the similarity, as similar\narchitectures prioritize similar facial regions.\n","authors":["F. Xavier Gaya-Morey","Silvia Ramis-Guarinos","Cristina Manresa-Yee","Jose M. 
Buades-Rubio"],"pdf_url":"https://arxiv.org/pdf/2401.11835v3.pdf","comment":"Multimed Tools Appl (2024)"},{"id":"http://arxiv.org/abs/2402.17296v3","updated":"2024-09-03T07:38:14Z","published":"2024-02-27T08:19:51Z","title":"Learning Exposure Correction in Dynamic Scenes","summary":" Exposure correction aims to enhance visual data suffering from improper\nexposures, which can greatly improve satisfactory visual effects. However,\nprevious methods mainly focus on the image modality, and the video counterpart\nis less explored in the literature. Directly applying prior image-based methods\nto videos results in temporal incoherence with low visual quality. Through\nthorough investigation, we find that the development of relevant communities is\nlimited by the absence of a benchmark dataset. Therefore, in this paper, we\nconstruct the first real-world paired video dataset, including both\nunderexposure and overexposure dynamic scenes. To achieve spatial alignment, we\nutilize two DSLR cameras and a beam splitter to simultaneously capture improper\nand normal exposure videos. Additionally, we propose an end-to-end video\nexposure correction network, in which a dual-stream module is designed to deal\nwith both underexposure and overexposure factors, enhancing the illumination\nbased on Retinex theory. The extensive experiments based on various metrics and\nuser studies demonstrate the significance of our dataset and the effectiveness\nof our method. The code and dataset are available at\nhttps://github.com/kravrolens/VECNet.\n","authors":["Jin Liu","Bo Wang","Chuanming Wang","Huiyuan Fu","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2402.17296v3.pdf","comment":"To be published at ACM Multimedia 2024"},{"id":"http://arxiv.org/abs/2401.11790v3","updated":"2024-09-03T07:34:44Z","published":"2024-01-22T09:40:52Z","title":"Deep Learning for Computer Vision based Activity Recognition and Fall\n Detection of the Elderly: a Systematic Review","summary":" As the percentage of elderly people in developed countries increases\nworldwide, the healthcare of this collective is a worrying matter, especially\nif it includes the preservation of their autonomy. In this direction, many\nstudies are being published on Ambient Assisted Living (AAL) systems, which\nhelp to reduce the preoccupations raised by the independent living of the\nelderly. In this study, a systematic review of the literature is presented on\nfall detection and Human Activity Recognition (HAR) for the elderly, as the two\nmain tasks to solve to guarantee the safety of elderly people living alone. To\naddress the current tendency to perform these two tasks, the review focuses on\nthe use of Deep Learning (DL) based approaches on computer vision data. In\naddition, different collections of data like DL models, datasets or hardware\n(e.g. depth or thermal cameras) are gathered from the reviewed studies and\nprovided for reference in future studies. Strengths and weaknesses of existing\napproaches are also discussed and, based on them, our recommendations for\nfuture works are provided.\n","authors":["F. Xavier Gaya-Morey","Cristina Manresa-Yee","Jose M. 
Buades-Rubio"],"pdf_url":"https://arxiv.org/pdf/2401.11790v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00997v3","updated":"2024-09-03T07:25:51Z","published":"2023-07-03T13:21:58Z","title":"RefSAM: Efficiently Adapting Segmenting Anything Model for Referring\n Video Object Segmentation","summary":" The Segment Anything Model (SAM) has gained significant attention for its\nimpressive performance in image segmentation. However, it lacks proficiency in\nreferring video object segmentation (RVOS) due to the need for precise\nuser-interactive prompts and a limited understanding of different modalities,\nsuch as language and vision. This paper presents the RefSAM model, which\nexplores the potential of SAM for RVOS by incorporating multi-view information\nfrom diverse modalities and successive frames at different timestamps in an\nonline manner. Our proposed approach adapts the original SAM model to enhance\ncross-modality learning by employing a lightweight Cross-Modal MLP that\nprojects the text embedding of the referring expression into sparse and dense\nembeddings, serving as user-interactive prompts. Additionally, we have\nintroduced the hierarchical dense attention module to fuse hierarchical visual\nsemantic information with sparse embeddings to obtain fine-grained dense\nembeddings, and an implicit tracking module to generate a tracking token and\nprovide historical information for the mask decoder. Furthermore, we employ a\nparameter-efficient tuning strategy to align and fuse the language and vision\nfeatures effectively. Through comprehensive ablation studies, we demonstrate\nour model's practical and effective design choices. Extensive experiments\nconducted on Refer-Youtube-VOS, Ref-DAVIS17, and three referring image\nsegmentation datasets validate the superiority and effectiveness of our RefSAM\nmodel over existing methods.\n","authors":["Yonglin Li","Jing Zhang","Xiao Teng","Long Lan","Xinwang Liu"],"pdf_url":"https://arxiv.org/pdf/2307.00997v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15214v2","updated":"2024-09-03T06:40:37Z","published":"2024-05-24T05:02:51Z","title":"PointRWKV: Efficient RWKV-Like Model for Hierarchical Point Cloud\n Learning","summary":" Transformers have revolutionized the point cloud learning task, but the\nquadratic complexity hinders its extension to long sequence and makes a burden\non limited computational resources. The recent advent of RWKV, a fresh breed of\ndeep sequence models, has shown immense potential for sequence modeling in NLP\ntasks. In this paper, we present PointRWKV, a model of linear complexity\nderived from the RWKV model in the NLP field with necessary modifications for\npoint cloud learning tasks. Specifically, taking the embedded point patches as\ninput, we first propose to explore the global processing capabilities within\nPointRWKV blocks using modified multi-headed matrix-valued states and a dynamic\nattention recurrence mechanism. 
To extract local geometric features\nsimultaneously, we design a parallel branch to encode the point cloud\nefficiently in a fixed radius near-neighbors graph with a graph stabilizer.\nFurthermore, we design PointRWKV as a multi-scale framework for hierarchical\nfeature learning of 3D point clouds, facilitating various downstream tasks.\nExtensive experiments on different point cloud learning tasks show our proposed\nPointRWKV outperforms the transformer- and mamba-based counterparts, while\nsignificantly saving about 42\\% FLOPs, demonstrating the potential option for\nconstructing foundational 3D models.\n","authors":["Qingdong He","Jiangning Zhang","Jinlong Peng","Haoyang He","Xiangtai Li","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2405.15214v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13110v3","updated":"2024-09-03T06:31:48Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v3.pdf","comment":"Accepted at Journal of Machine Learning Research. This paper\n integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete\n story. In this paper, we improve the writing and organization, and also add\n conceptual, empirical, and theoretical improvements over the previous work.\n V2: small typo fixes and formatting improvements. 
V3: improvements from\n journal revisions"},{"id":"http://arxiv.org/abs/2401.12743v2","updated":"2024-09-03T06:13:37Z","published":"2024-01-23T13:20:57Z","title":"Correlation-Embedded Transformer Tracking: A Single-Branch Framework","summary":" Developing robust and discriminative appearance models has been a\nlong-standing research challenge in visual object tracking. In the prevalent\nSiamese-based paradigm, the features extracted by the Siamese-like networks are\noften insufficient to model the tracked targets and distractor objects, thereby\nhindering them from being robust and discriminative simultaneously. While most\nSiamese trackers focus on designing robust correlation operations, we propose a\nnovel single-branch tracking framework inspired by the transformer. Unlike the\nSiamese-like feature extraction, our tracker deeply embeds cross-image feature\ncorrelation in multiple layers of the feature network. By extensively matching\nthe features of the two images through multiple layers, it can suppress\nnon-target features, resulting in target-aware feature extraction. The output\nfeatures can be directly used for predicting target locations without\nadditional correlation steps. Thus, we reformulate the two-branch Siamese\ntracking as a conceptually simple, fully transformer-based Single-Branch\nTracking pipeline, dubbed SBT. After conducting an in-depth analysis of the SBT\nbaseline, we summarize many effective design principles and propose an improved\ntracker dubbed SuperSBT. SuperSBT adopts a hierarchical architecture with a\nlocal modeling layer to enhance shallow-level features. A unified relation\nmodeling is proposed to remove complex handcrafted layer pattern designs.\nSuperSBT is further improved by masked image modeling pre-training, integrating\ntemporal modeling, and equipping with dedicated prediction heads. Thus,\nSuperSBT outperforms the SBT baseline by 4.7%,3.0%, and 4.5% AUC scores in\nLaSOT, TrackingNet, and GOT-10K. Notably, SuperSBT greatly raises the speed of\nSBT from 37 FPS to 81 FPS. Extensive experiments show that our method achieves\nsuperior results on eight VOT benchmarks.\n","authors":["Fei Xie","Wankou Yang","Chunyu Wang","Lei Chu","Yue Cao","Chao Ma","Wenjun Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.12743v2.pdf","comment":"Extension of SBT paper, accepted by TPAMI"},{"id":"http://arxiv.org/abs/2407.13363v2","updated":"2024-09-03T05:10:50Z","published":"2024-07-18T10:14:49Z","title":"Learning from the Web: Language Drives Weakly-Supervised Incremental\n Learning for Semantic Segmentation","summary":" Current weakly-supervised incremental learning for semantic segmentation\n(WILSS) approaches only consider replacing pixel-level annotations with\nimage-level labels, while the training images are still from well-designed\ndatasets. In this work, we argue that widely available web images can also be\nconsidered for the learning of new classes. To achieve this, firstly we\nintroduce a strategy to select web images which are similar to previously seen\nexamples in the latent space using a Fourier-based domain discriminator. Then,\nan effective caption-driven reharsal strategy is proposed to preserve\npreviously learnt classes. To our knowledge, this is the first work to rely\nsolely on web images for both the learning of new concepts and the preservation\nof the already learned ones in WILSS. 
Experimental results show that the\nproposed approach can reach state-of-the-art performances without using\nmanually selected and annotated data in the incremental steps.\n","authors":["Chang Liu","Giulia Rizzoli","Pietro Zanuttigh","Fu Li","Yi Niu"],"pdf_url":"https://arxiv.org/pdf/2407.13363v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2311.04811v4","updated":"2024-09-03T03:54:59Z","published":"2023-11-08T16:34:18Z","title":"Image-Based Virtual Try-On: A Survey","summary":" Image-based virtual try-on aims to synthesize a naturally dressed person\nimage with a clothing image, which revolutionizes online shopping and inspires\nrelated topics within image generation, showing both research significance and\ncommercial potential. However, there is a gap between current research progress\nand commercial applications and an absence of comprehensive overview of this\nfield to accelerate the development.In this survey, we provide a comprehensive\nanalysis of the state-of-the-art techniques and methodologies in aspects of\npipeline architecture, person representation and key modules such as try-on\nindication, clothing warping and try-on stage. We additionally apply CLIP to\nassess the semantic alignment of try-on results, and evaluate representative\nmethods with uniformly implemented evaluation metrics on the same dataset.In\naddition to quantitative and qualitative evaluation of current open-source\nmethods, unresolved issues are highlighted and future research directions are\nprospected to identify key trends and inspire further exploration. The\nuniformly implemented evaluation metrics, dataset and collected methods will be\nmade public available at\nhttps://github.com/little-misfit/Survey-Of-Virtual-Try-On.\n","authors":["Dan Song","Xuanpu Zhang","Juan Zhou","Weizhi Nie","Ruofeng Tong","Mohan Kankanhalli","An-An Liu"],"pdf_url":"https://arxiv.org/pdf/2311.04811v4.pdf","comment":"30 pages, 20 figures"},{"id":"http://arxiv.org/abs/2406.19101v2","updated":"2024-09-03T03:51:37Z","published":"2024-06-27T11:28:36Z","title":"DocKylin: A Large Multimodal Model for Visual Document Understanding\n with Efficient Visual Slimming","summary":" Current multimodal large language models (MLLMs) face significant challenges\nin visual document understanding (VDU) tasks due to the high resolution, dense\ntext, and complex layouts typical of document images. These characteristics\ndemand a high level of detail perception ability from MLLMs. While increasing\ninput resolution improves detail perception capability, it also leads to longer\nsequences of visual tokens, increasing computational costs and straining the\nmodels' ability to handle long contexts. To address these challenges, we\nintroduce DocKylin, a document-centric MLLM that performs visual content\nslimming at both the pixel and token levels, thereby reducing token sequence\nlength in VDU scenarios. We introduce an Adaptive Pixel Slimming (APS)\npreprocessing module to perform pixel-level slimming, increasing the proportion\nof informative pixels. 
Moreover, we propose a novel Dynamic Token Slimming\n(DTS) module to conduct token-level slimming, filtering essential tokens and\nremoving others to adaptively create a more compact visual sequence.\nExperiments demonstrate DocKylin's promising performance across various VDU\nbenchmarks and the effectiveness of each component.\n","authors":["Jiaxin Zhang","Wentao Yang","Songxuan Lai","Zecheng Xie","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2406.19101v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15667v2","updated":"2024-09-03T03:22:18Z","published":"2024-08-28T09:40:40Z","title":"Towards reliable respiratory disease diagnosis based on cough sounds and\n vision transformers","summary":" Recent advancements in deep learning techniques have sparked performance\nboosts in various real-world applications including disease diagnosis based on\nmulti-modal medical data. Cough sound data-based respiratory disease (e.g.,\nCOVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also\nattracted much attention. However, existing works usually utilise traditional\nmachine learning or deep models of moderate scales. On the other hand, the\ndeveloped approaches are trained and evaluated on small-scale data due to the\ndifficulty of curating and annotating clinical data on scale. To address these\nissues in prior works, we create a unified framework to evaluate various deep\nmodels from lightweight Convolutional Neural Networks (e.g., ResNet18) to\nmodern vision transformers and compare their performance in respiratory disease\nclassification. Based on the observations from such an extensive empirical\nstudy, we propose a novel approach to cough-based disease classification based\non both self-supervised and supervised learning on a large-scale cough data\nset. Experimental results demonstrate our proposed approach outperforms prior\narts consistently on two benchmark datasets for COVID-19 diagnosis and a\nproprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%.\n","authors":["Qian Wang","Zhaoyang Bu","Jiaxuan Mao","Wenyu Zhu","Jingya Zhao","Wei Du","Guochao Shi","Min Zhou","Si Chen","Jieming Qu"],"pdf_url":"https://arxiv.org/pdf/2408.15667v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01169v2","updated":"2024-09-03T03:21:30Z","published":"2024-03-02T10:42:47Z","title":"Learn Suspected Anomalies from Event Prompts for Video Anomaly Detection","summary":" Most models for weakly supervised video anomaly detection (WS-VAD) rely on\nmultiple instance learning, aiming to distinguish normal and abnormal snippets\nwithout specifying the type of anomaly. However, the ambiguous nature of\nanomaly definitions across contexts may introduce inaccuracy in discriminating\nabnormal and normal events. To show the model what is anomalous, a novel\nframework is proposed to guide the learning of suspected anomalies from event\nprompts. Given a textual prompt dictionary of potential anomaly events and the\ncaptions generated from anomaly videos, the semantic anomaly similarity between\nthem could be calculated to identify the suspected events for each video\nsnippet. It enables a new multi-prompt learning process to constrain the\nvisual-semantic features across all videos, as well as provides a new way to\nlabel pseudo anomalies for self-training. To demonstrate its effectiveness,\ncomprehensive experiments and detailed ablation studies are conducted on four\ndatasets, namely XD-Violence, UCF-Crime, TAD, and ShanghaiTech. 
Our proposed\nmodel outperforms most state-of-the-art methods in terms of AP or AUC (86.5\\%,\n90.4\\%, 94.4\\%, and 97.4\\%). Furthermore, it shows promising performance\nin open-set and cross-dataset cases. The data, code, and models can be found\nat: \\url{https://github.com/shiwoaz/lap}.\n","authors":["Chenchen Tao","Xiaohao Peng","Chong Wang","Jiafei Wu","Puning Zhao","Jun Wang","Jiangbo Qian"],"pdf_url":"https://arxiv.org/pdf/2403.01169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07547v2","updated":"2024-09-03T03:20:54Z","published":"2023-04-15T12:52:23Z","title":"TagCLIP: Improving Discrimination Ability of Open-Vocabulary Semantic\n Segmentation","summary":" Contrastive Language-Image Pre-training (CLIP) has recently shown great\npromise in pixel-level zero-shot learning tasks. However, existing approaches\nutilizing CLIP's text and patch embeddings to generate semantic masks often\nmisidentify input pixels from unseen classes, leading to confusion between\nnovel classes and semantically similar ones. In this work, we propose a novel\napproach, TagCLIP (Trusty-aware guided CLIP), to address this issue. We\ndisentangle the ill-posed optimization problem into two parallel processes:\nsemantic matching performed individually and reliability judgment for improving\ndiscrimination ability. Building on the idea of special tokens in language\nmodeling representing sentence-level embeddings, we introduce a trusty token\nthat enables distinguishing novel classes from known ones in prediction. To\nevaluate our approach, we conduct experiments on three benchmark datasets, PASCAL\nVOC 2012, COCO-Stuff 164K and PASCAL Context. Our results show that TagCLIP\nimproves the Intersection over Union (IoU) of unseen classes by 7.4%, 1.7% and\n2.1%, respectively, with negligible overheads. The code is available at\nhttps://github.com/dvlab-research/TagCLIP.\n","authors":["Jingyao Li","Pengguang Chen","Shengju Qian","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2304.07547v2.pdf","comment":"TPAMI2024"},{"id":"http://arxiv.org/abs/2408.07500v2","updated":"2024-09-03T02:50:56Z","published":"2024-08-14T12:29:49Z","title":"Cross-Platform Video Person ReID: A New Benchmark Dataset and Adaptation\n Approach","summary":" In this paper, we construct a large-scale benchmark dataset for\nGround-to-Aerial Video-based person Re-Identification, named G2A-VReID, which\ncomprises 185,907 images and 5,576 tracklets, featuring 2,788 distinct\nidentities. To our knowledge, this is the first dataset for video ReID under\nGround-to-Aerial scenarios. G2A-VReID dataset has the following\ncharacteristics: 1) Drastic view changes; 2) Large number of annotated\nidentities; 3) Rich outdoor scenarios; 4) Huge difference in resolution.\nAdditionally, we propose a new benchmark approach for cross-platform ReID by\ntransforming the cross-platform visual alignment problem into visual-semantic\nalignment through vision-language model (i.e., CLIP) and applying a\nparameter-efficient Video Set-Level-Adapter module to adapt image-based\nfoundation model to video ReID tasks, termed VSLA-CLIP. Besides, to further\nreduce the great discrepancy across the platforms, we also devise the\nplatform-bridge prompts for efficient visual feature alignment. 
Extensive\nexperiments demonstrate the superiority of the proposed method on all existing\nvideo ReID datasets and our proposed G2A-VReID dataset.\n","authors":["Shizhou Zhang","Wenlong Luo","De Cheng","Qingchun Yang","Lingyan Ran","Yinghui Xing","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.07500v2.pdf","comment":"Published at ECCV 2024"},{"id":"http://arxiv.org/abs/2404.06559v2","updated":"2024-09-03T01:57:04Z","published":"2024-04-09T18:23:34Z","title":"The Impact of Print-Scanning in Heterogeneous Morph Evaluation Scenarios","summary":" Face morphing attacks pose an increasing threat to face recognition (FR)\nsystems. A morphed photo contains biometric information from two different\nsubjects to take advantage of vulnerabilities in FRs. These systems are\nparticularly susceptible to attacks when the morphs are subjected to\nprint-scanning to mask the artifacts generated during the morphing process. We\ninvestigate the impact of print-scanning on morphing attack detection through a\nseries of evaluations on heterogeneous morphing attack scenarios. Our\nexperiments show that we can increase the Mated Morph Presentation Match Rate\n(MMPMR) by up to 8.48%. Furthermore, when a Single-image Morphing Attack\nDetection (S-MAD) algorithm is not trained to detect print-scanned morphs, the\nMorphing Attack Classification Error Rate (MACER) can increase by up to 96.12%,\nindicating significant vulnerability.\n","authors":["Richard E. Neddo","Zander W. Blasingame","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2404.06559v2.pdf","comment":"Accepted as a special sessions paper at IJCB 2024"},{"id":"http://arxiv.org/abs/2403.08542v2","updated":"2024-09-03T01:53:19Z","published":"2024-03-13T13:56:34Z","title":"AIGCs Confuse AI Too: Investigating and Explaining Synthetic\n Image-induced Hallucinations in Large Vision-Language Models","summary":" The evolution of Artificial Intelligence Generated Contents (AIGCs) is\nadvancing towards higher quality. The growing interactions with AIGCs present a\nnew challenge to the data-driven AI community: While AI-generated contents have\nplayed a crucial role in a wide range of AI models, the potential hidden risks\nthey introduce have not been thoroughly examined. Beyond human-oriented forgery\ndetection, AI-generated content poses potential issues for AI models originally\ndesigned to process natural data. In this study, we underscore the exacerbated\nhallucination phenomena in Large Vision-Language Models (LVLMs) caused by\nAI-synthetic images. Remarkably, our findings shed light on a consistent AIGC\n\\textbf{hallucination bias}: the object hallucinations induced by synthetic\nimages are characterized by a greater quantity and a more uniform position\ndistribution, even though these synthetic images do not manifest unrealistic or\nadditional relevant visual features compared to natural images. 
Moreover, our\ninvestigations on Q-former and Linear projector reveal that synthetic images\nmay present token deviations after visual projection, thereby amplifying the\nhallucination bias.\n","authors":["Yifei Gao","Jiaqi Wang","Zhiyu Lin","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2403.08542v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05904v3","updated":"2024-09-03T01:40:52Z","published":"2023-09-12T01:29:37Z","title":"Enhancing Representation in Radiography-Reports Foundation Model: A\n Granular Alignment Algorithm Using Masked Contrastive Learning","summary":" Recently, multi-modal vision-language foundation models have gained\nsignificant attention in the medical field. While these models offer great\nopportunities, they still face crucial challenges, such as the requirement for\nfine-grained knowledge understanding in computer-aided diagnosis and the\ncapability of utilizing very limited or even no task-specific labeled data in\nreal-world clinical applications. In this study, we present MaCo, a masked\ncontrastive chest X-ray foundation model that tackles these challenges. MaCo\nexplores masked contrastive learning to simultaneously achieve fine-grained\nimage understanding and zero-shot learning for a variety of medical imaging\ntasks. It designs a correlation weighting mechanism to adjust the correlation\nbetween masked chest X-ray image patches and their corresponding reports,\nthereby enhancing the model's representation learning capabilities. To evaluate\nthe performance of MaCo, we conducted extensive experiments using 6 well-known\nopen-source X-ray datasets. The experimental results demonstrate the\nsuperiority of MaCo over 10 state-of-the-art approaches across tasks such as\nclassification, segmentation, detection, and phrase grounding. These findings\nhighlight the significant potential of MaCo in advancing a wide range of\nmedical image analysis tasks.\n","authors":["Weijian Huang","Cheng Li","Hong-Yu Zhou","Hao Yang","Jiarun Liu","Yong Liang","Hairong Zheng","Shaoting Zhang","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05904v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02337v1","updated":"2024-09-03T23:52:33Z","published":"2024-09-03T23:52:33Z","title":"Coaching a Robotic Sonographer: Learning Robotic Ultrasound with Sparse\n Expert's Feedback","summary":" Ultrasound is widely employed for clinical intervention and diagnosis, due to\nits advantages of offering non-invasive, radiation-free, and real-time imaging.\nHowever, the accessibility of this dexterous procedure is limited due to the\nsubstantial training and expertise required of operators. The robotic\nultrasound (RUS) offers a viable solution to address this limitation;\nnonetheless, achieving human-level proficiency remains challenging. Learning\nfrom demonstrations (LfD) methods have been explored in RUS, which learns the\npolicy prior from a dataset of offline demonstrations to encode the mental\nmodel of the expert sonographer. However, active engagement of experts, i.e.\nCoaching, during the training of RUS has not been explored thus far. Coaching\nis known for enhancing efficiency and performance in human training. This paper\nproposes a coaching framework for RUS to amplify its performance. The framework\ncombines DRL (self-supervised practice) with sparse expert's feedback through\ncoaching. The DRL employs an off-policy Soft Actor-Critic (SAC) network, with a\nreward based on image quality rating. 
The coaching by experts is modeled as a\nPartially Observable Markov Decision Process (POMDP), which updates the policy\nparameters based on the correction by the expert. The validation study on\nphantoms showed that coaching increases the learning rate by $25\\%$ and the\nnumber of high-quality image acquisition by $74.5\\%$.\n","authors":["Deepak Raina","Mythra V. Balakuntala","Byung Wook Kim","Juan Wachs","Richard Voyles"],"pdf_url":"https://arxiv.org/pdf/2409.02337v1.pdf","comment":"Accepted in IEEE Transactions on Medical Robotics and Bionics (TMRB)\n 2024"},{"id":"http://arxiv.org/abs/2409.02335v1","updated":"2024-09-03T23:49:45Z","published":"2024-09-03T23:49:45Z","title":"What Do You See in Common? Learning Hierarchical Prototypes over\n Tree-of-Life to Discover Evolutionary Traits","summary":" A grand challenge in biology is to discover evolutionary traits - features of\norganisms common to a group of species with a shared ancestor in the tree of\nlife (also referred to as phylogenetic tree). With the growing availability of\nimage repositories in biology, there is a tremendous opportunity to discover\nevolutionary traits directly from images in the form of a hierarchy of\nprototypes. However, current prototype-based methods are mostly designed to\noperate over a flat structure of classes and face several challenges in\ndiscovering hierarchical prototypes, including the issue of learning\nover-specific features at internal nodes. To overcome these challenges, we\nintroduce the framework of Hierarchy aligned Commonality through Prototypical\nNetworks (HComP-Net). We empirically show that HComP-Net learns prototypes that\nare accurate, semantically consistent, and generalizable to unseen species in\ncomparison to baselines on birds, butterflies, and fishes datasets. The code\nand datasets are available at https://github.com/Imageomics/HComPNet.\n","authors":["Harish Babu Manogaran","M. Maruf","Arka Daw","Kazi Sajeed Mehrab","Caleb Patrick Charpentier","Josef C. Uyeda","Wasila Dahdul","Matthew J Thompson","Elizabeth G Campolongo","Kaiya L Provost","Paula M. Mabee","Hilmar Lapp","Anuj Karpatne"],"pdf_url":"https://arxiv.org/pdf/2409.02335v1.pdf","comment":"34 pages, 27 figures"},{"id":"http://arxiv.org/abs/2409.02334v1","updated":"2024-09-03T23:42:19Z","published":"2024-09-03T23:42:19Z","title":"YoloTag: Vision-based Robust UAV Navigation with Fiducial Markers","summary":" By harnessing fiducial markers as visual landmarks in the environment,\nUnmanned Aerial Vehicles (UAVs) can rapidly build precise maps and navigate\nspaces safely and efficiently, unlocking their potential for fluent\ncollaboration and coexistence with humans. Existing fiducial marker methods\nrely on handcrafted feature extraction, which sacrifices accuracy. On the other\nhand, deep learning pipelines for marker detection fail to meet real-time\nruntime constraints crucial for navigation applications. In this work, we\npropose YoloTag \\textemdash a real-time fiducial marker-based localization\nsystem. YoloTag uses a lightweight YOLO v8 object detector to accurately detect\nfiducial markers in images while meeting the runtime constraints needed for\nnavigation. The detected markers are then used by an efficient\nperspective-n-point algorithm to estimate UAV states. However, this\nlocalization system introduces noise, causing instability in trajectory\ntracking. To suppress noise, we design a higher-order Butterworth filter that\neffectively eliminates noise through frequency domain analysis. 
We evaluate our\nalgorithm through real-robot experiments in an indoor environment, comparing\nthe trajectory tracking performance of our method against other approaches in\nterms of several distance metrics.\n","authors":["Sourav Raxit","Simant Bahadur Singh","Abdullah Al Redwan Newaz"],"pdf_url":"https://arxiv.org/pdf/2409.02334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14871v3","updated":"2024-09-03T23:13:33Z","published":"2023-12-22T17:49:11Z","title":"BrainVis: Exploring the Bridge between Brain and Visual Signals via\n Image Reconstruction","summary":" Analyzing and reconstructing visual stimuli from brain signals effectively\nadvances the understanding of human visual system. However, the EEG signals are\ncomplex and contain significant noise. This leads to substantial limitations in\nexisting works of visual stimuli reconstruction from EEG, such as difficulties\nin aligning EEG embeddings with the fine-grained semantic information and a\nheavy reliance on additional large self-collected dataset for training. To\naddress these challenges, we propose a novel approach called BrainVis. Firstly,\nwe divide the EEG signals into various units and apply a self-supervised\napproach on them to obtain EEG time-domain features, in an attempt to ease the\ntraining difficulty. Additionally, we also propose to utilize the\nfrequency-domain features to enhance the EEG representations. Then, we\nsimultaneously align EEG time-frequency embeddings with the interpolation of\nthe coarse and fine-grained semantics in the CLIP space, to highlight the\nprimary visual components and reduce the cross-modal alignment difficulty.\nFinally, we adopt the cascaded diffusion models to reconstruct images. Using\nonly 10\\% training data of the previous work, our proposed BrainVis outperforms\nstate of the arts in both semantic fidelity reconstruction and generation\nquality. The code is available at https://github.com/RomGai/BrainVis.\n","authors":["Honghao Fu","Zhiqi Shen","Jing Jih Chin","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.14871v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04890v2","updated":"2024-09-03T22:56:42Z","published":"2024-05-08T08:39:25Z","title":"GISR: Geometric Initialization and Silhouette-based Refinement for\n Single-View Robot Pose and Configuration Estimation","summary":" In autonomous robotics, measurement of the robot's internal state and\nperception of its environment, including interaction with other agents such as\ncollaborative robots, are essential. Estimating the pose of the robot arm from\na single view has the potential to replace classical eye-to-hand calibration\napproaches and is particularly attractive for online estimation and dynamic\nenvironments. In addition to its pose, recovering the robot configuration\nprovides a complete spatial understanding of the observed robot that can be\nused to anticipate the actions of other agents in advanced robotics use cases.\nFurthermore, this additional redundancy enables the planning and execution of\nrecovery protocols in case of sensor failures or external disturbances. We\nintroduce GISR - a deep configuration and robot-to-camera pose estimation\nmethod that prioritizes execution in real-time. GISR consists of two modules:\n(i) a geometric initialization module that efficiently computes an approximate\nrobot pose and configuration, and (ii) a deep iterative silhouette-based\nrefinement module that arrives at a final solution in just a few iterations. 
We\nevaluate GISR on publicly available data and show that it outperforms existing\nmethods of the same class in terms of both speed and accuracy, and can compete\nwith approaches that rely on ground-truth proprioception and recover only the\npose.\n","authors":["Ivan Bilić","Filip Marić","Fabio Bonsignorio","Ivan Petrović"],"pdf_url":"https://arxiv.org/pdf/2405.04890v2.pdf","comment":"IEEE Robotics and Automation Letters (under revision), code available\n at http://github.com/iwhitey/GISR-robot"},{"id":"http://arxiv.org/abs/2409.02324v1","updated":"2024-09-03T22:36:11Z","published":"2024-09-03T22:36:11Z","title":"Visual Servoing for Robotic On-Orbit Servicing: A Survey","summary":" On-orbit servicing (OOS) activities will power the next big step for\nsustainable exploration and commercialization of space. Developing robotic\ncapabilities for autonomous OOS operations is a priority for the space\nindustry. Visual Servoing (VS) enables robots to achieve the precise manoeuvres\nneeded for critical OOS missions by utilizing visual information for motion\ncontrol. This article presents an overview of existing VS approaches for\nautonomous OOS operations with space manipulator systems (SMS). We divide the\napproaches according to their contribution to the typical phases of a robotic\nOOS mission: a) Recognition, b) Approach, and c) Contact. We also present a\ndiscussion on the reviewed VS approaches, identifying current trends. Finally,\nwe highlight the challenges and areas for future research on VS techniques for\nrobotic OOS.\n","authors":["Lina María Amaya-Mejía","Mohamed Ghita","Jan Dentler","Miguel Olivares-Mendez","Carol Martinez"],"pdf_url":"https://arxiv.org/pdf/2409.02324v1.pdf","comment":"Accepted for publication at the 2024 International Conference on\n Space Robotics (iSpaRo)"},{"id":"http://arxiv.org/abs/2408.01690v2","updated":"2024-09-03T22:30:34Z","published":"2024-08-03T07:05:40Z","title":"IDNet: A Novel Dataset for Identity Document Analysis and Fraud\n Detection","summary":" Effective fraud detection and analysis of government-issued identity\ndocuments, such as passports, driver's licenses, and identity cards, are\nessential in thwarting identity theft and bolstering security on online\nplatforms. The training of accurate fraud detection and analysis tools depends\non the availability of extensive identity document datasets. However, current\npublicly available benchmark datasets for identity document analysis, including\nMIDV-500, MIDV-2020, and FMIDV, fall short in several respects: they offer a\nlimited number of samples, cover insufficient varieties of fraud patterns, and\nseldom include alterations in critical personal identifying fields like\nportrait images, limiting their utility in training models capable of detecting\nrealistic frauds while preserving privacy.\n In response to these shortcomings, our research introduces a new benchmark\ndataset, IDNet, designed to advance privacy-preserving fraud detection efforts.\nThe IDNet dataset comprises 837,060 images of synthetically generated identity\ndocuments, totaling approximately 490 gigabytes, categorized into 20 types from\n$10$ U.S. states and 10 European countries. 
We evaluate the utility and present\nuse cases of the dataset, illustrating how it can aid in training\nprivacy-preserving fraud detection methods, facilitating the generation of\ncamera and video capturing of identity documents, and testing schema\nunification and other identity document management functionalities.\n","authors":["Hong Guan","Yancheng Wang","Lulu Xie","Soham Nag","Rajeev Goel","Niranjan Erappa Narayana Swamy","Yingzhen Yang","Chaowei Xiao","Jonathan Prisby","Ross Maciejewski","Jia Zou"],"pdf_url":"https://arxiv.org/pdf/2408.01690v2.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2409.02310v1","updated":"2024-09-03T21:41:35Z","published":"2024-09-03T21:41:35Z","title":"Geometry-aware Feature Matching for Large-Scale Structure from Motion","summary":" Establishing consistent and dense correspondences across multiple images is\ncrucial for Structure from Motion (SfM) systems. Significant view changes, such\nas air-to-ground with very sparse view overlap, pose an even greater challenge\nto the correspondence solvers. We present a novel optimization-based approach\nthat significantly enhances existing feature matching methods by introducing\ngeometry cues in addition to color cues. This helps fill gaps when there is\nless overlap in large-scale scenarios. Our method formulates geometric\nverification as an optimization problem, guiding feature matching within\ndetector-free methods and using sparse correspondences from detector-based\nmethods as anchor points. By enforcing geometric constraints via the Sampson\nDistance, our approach ensures that the denser correspondences from\ndetector-free methods are geometrically consistent and more accurate. This\nhybrid strategy significantly improves correspondence density and accuracy,\nmitigates multi-view inconsistencies, and leads to notable advancements in\ncamera pose accuracy and point cloud density. It outperforms state-of-the-art\nfeature matching methods on benchmark datasets and enables feature matching in\nchallenging extreme large-scale settings.\n","authors":["Gonglin Chen","Jinsen Wu","Haiwei Chen","Wenbin Teng","Zhiyuan Gao","Andrew Feng","Rongjun Qin","Yajie Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.02310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02309v1","updated":"2024-09-03T21:39:58Z","published":"2024-09-03T21:39:58Z","title":"QID$^2$: An Image-Conditioned Diffusion Model for Q-space Up-sampling of\n DWI Data","summary":" We propose an image-conditioned diffusion model to estimate high angular\nresolution diffusion weighted imaging (DWI) from a low angular resolution\nacquisition. Our model, which we call QID$^2$, takes as input a set of low\nangular resolution DWI data and uses this information to estimate the DWI data\nassociated with a target gradient direction. We leverage a U-Net architecture\nwith cross-attention to preserve the positional information of the reference\nimages, further guiding the target image generation. We train and evaluate\nQID$^2$ on single-shell DWI samples curated from the Human Connectome Project\n(HCP) dataset. Specifically, we sub-sample the HCP gradient directions to\nproduce low angular resolution DWI data and train QID$^2$ to reconstruct the\nmissing high angular resolution samples. We compare QID$^2$ with two\nstate-of-the-art GAN models. Our results demonstrate that QID$^2$ not only\nachieves higher-quality generated images, but it consistently outperforms the\nGAN models in downstream tensor estimation across multiple metrics. 
Taken\ntogether, this study highlights the potential of diffusion models, and QID$^2$\nin particular, for q-space up-sampling, thus offering a promising toolkit for\nclinical and research applications.\n","authors":["Zijian Chen","Jueqi Wang","Archana Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2409.02309v1.pdf","comment":"Accepted at MICCAI 2024 International Workshop on Computational\n Diffusion MRI. Zijian Chen and Jueqi Wang contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.02290v1","updated":"2024-09-03T20:58:56Z","published":"2024-09-03T20:58:56Z","title":"Unsupervised Welding Defect Detection Using Audio And Video","summary":" In this work we explore the application of AI to robotic welding. Robotic\nwelding is a widely used technology in many industries, but robots currently do\nnot have the capability to detect welding defects which get introduced due to\nvarious reasons in the welding process. We describe how deep-learning methods\ncan be applied to detect weld defects in real-time by recording the welding\nprocess with microphones and a camera. Our findings are based on a large\ndatabase with more than 4000 welding samples we collected which covers\ndifferent weld types, materials and various defect categories. All deep\nlearning models are trained in an unsupervised fashion because the space of\npossible defects is large and the defects in our data may contain biases. We\ndemonstrate that a reliable real-time detection of most categories of weld\ndefects is feasible both from audio and video, with improvements achieved by\ncombining both modalities. Specifically, the multi-modal approach achieves an\naverage Area-under-ROC-Curve (AUC) of 0.92 over all eleven defect types in our\ndata. We conclude the paper with an analysis of the results by defect type and\na discussion of future work.\n","authors":["Georg Stemmer","Jose A. Lopez","Juan A. Del Hoyo Ontiveros","Arvind Raju","Tara Thimmanaik","Sovan Biswas"],"pdf_url":"https://arxiv.org/pdf/2409.02290v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2409.02284v1","updated":"2024-09-03T20:37:43Z","published":"2024-09-03T20:37:43Z","title":"Biochemical Prostate Cancer Recurrence Prediction: Thinking Fast & Slow","summary":" Time to biochemical recurrence in prostate cancer is essential for prognostic\nmonitoring of the progression of patients after prostatectomy, which assesses\nthe efficacy of the surgery. In this work, we proposed to leverage multiple\ninstance learning through a two-stage ``thinking fast \\& slow'' strategy for\nthe time to recurrence (TTR) prediction. 
The first (``thinking fast'') stage\nfinds the most relevant WSI area for biochemical recurrence and the second\n(``thinking slow'') stage leverages higher resolution patches to predict TTR.\nOur approach reveals a mean C-index ($Ci$) of 0.733 ($\\theta=0.059$) on our\ninternal validation and $Ci=0.603$ on the LEOPARD challenge validation set.\nPost hoc attention visualization shows that the most attentive area contributes\nto the TTR prediction.\n","authors":["Suhang You","Sanyukta Adap","Siddhesh Thakur","Bhakti Baheti","Spyridon Bakas"],"pdf_url":"https://arxiv.org/pdf/2409.02284v1.pdf","comment":"8 pages, 3 figures, methodology paper for LEOPRARD Challenge"},{"id":"http://arxiv.org/abs/2409.02281v1","updated":"2024-09-03T20:28:30Z","published":"2024-09-03T20:28:30Z","title":"K-Origins: Better Colour Quantification for Neural Networks","summary":" K-Origins is a neural network layer designed to improve image-based network\nperformances when learning colour, or intensities, is beneficial. Over 250\nencoder-decoder convolutional networks are trained and tested on 16-bit\nsynthetic data, demonstrating that K-Origins improves semantic segmentation\naccuracy in two scenarios: object detection with low signal-to-noise ratios,\nand segmenting multiple objects that are identical in shape but vary in colour.\nK-Origins generates output features from the input features, $\\textbf{X}$, by\nthe equation $\\textbf{Y}_k = \\textbf{X}-\\textbf{J}\\cdot w_k$ for each trainable\nparameter $w_k$, where $\\textbf{J}$ is a matrix of ones. Additionally, networks\nwith varying receptive fields were trained to determine optimal network depths\nbased on the dimensions of target classes, suggesting that receptive field\nlengths should exceed object sizes. By ensuring a sufficient receptive field\nlength and incorporating K-Origins, we can achieve better semantic network\nperformance.\n","authors":["Lewis Mason","Mark Martinez"],"pdf_url":"https://arxiv.org/pdf/2409.02281v1.pdf","comment":"16 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.02278v1","updated":"2024-09-03T20:24:37Z","published":"2024-09-03T20:24:37Z","title":"Evaluation and Comparison of Visual Language Models for Transportation\n Engineering Problems","summary":" Recent developments in vision language models (VLM) have shown great\npotential for diverse applications related to image understanding. In this\nstudy, we have explored state-of-the-art VLM models for vision-based\ntransportation engineering tasks such as image classification and object\ndetection. The image classification task involves congestion detection and\ncrack identification, whereas, for object detection, helmet violations were\nidentified. We have applied open-source models such as CLIP, BLIP, OWL-ViT,\nLlava-Next, and closed-source GPT-4o to evaluate the performance of these\nstate-of-the-art VLM models to harness the capabilities of language\nunderstanding for vision-based transportation tasks. These tasks were performed\nby applying zero-shot prompting to the VLM models, as zero-shot prompting\ninvolves performing tasks without any training on those tasks. It eliminates\nthe need for annotated datasets or fine-tuning for specific tasks. Though these\nmodels gave comparative results with benchmark Convolutional Neural Networks\n(CNN) models in the image classification tasks, for object localization tasks,\nit still needs improvement. 
Therefore, this study provides a comprehensive\nevaluation of the state-of-the-art VLM models highlighting the advantages and\nlimitations of the models, which can be taken as the baseline for future\nimprovement and wide-scale implementation.\n","authors":["Sanjita Prajapati","Tanu Singh","Chinmay Hegde","Pranamesh Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2409.02278v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02274v1","updated":"2024-09-03T20:16:56Z","published":"2024-09-03T20:16:56Z","title":"ADHD diagnosis based on action characteristics recorded in videos using\n machine learning","summary":" Demand for ADHD diagnosis and treatment is increasing significantly and the\nexisting services are unable to meet the demand in a timely manner. In this\nwork, we introduce a novel action recognition method for ADHD diagnosis by\nidentifying and analysing raw video recordings. Our main contributions include\n1) designing and implementing a test focusing on the attention and\nhyperactivity/impulsivity of participants, recorded through three cameras; 2)\nimplementing a novel machine learning ADHD diagnosis system based on action\nrecognition neural networks for the first time; 3) proposing classification\ncriteria to provide diagnosis results and analysis of ADHD action\ncharacteristics.\n","authors":["Yichun Li","Syes Mohsen Naqvi","Rajesh Nair"],"pdf_url":"https://arxiv.org/pdf/2409.02274v1.pdf","comment":"Neuroscience Applied"},{"id":"http://arxiv.org/abs/2310.15128v2","updated":"2024-09-03T19:55:22Z","published":"2023-10-23T17:32:38Z","title":"Projected Stochastic Gradient Descent with Quantum Annealed Binary\n Gradients","summary":" We present, QP-SBGD, a novel layer-wise stochastic optimiser tailored towards\ntraining neural networks with binary weights, known as binary neural networks\n(BNNs), on quantum hardware. BNNs reduce the computational requirements and\nenergy consumption of deep learning models with minimal loss in accuracy.\nHowever, training them in practice remains to be an open challenge. Most known\nBNN-optimisers either rely on projected updates or binarise weights\npost-training. Instead, QP-SBGD approximately maps the gradient onto binary\nvariables, by solving a quadratic constrained binary optimisation. Under\npractically reasonable assumptions, we show that this update rule converges\nwith a rate of $\\mathcal{O}(1 / \\sqrt{T})$. Moreover, we show how the\n$\\mathcal{NP}$-hard projection can be effectively executed on an adiabatic\nquantum annealer, harnessing recent advancements in quantum computation. We\nalso introduce a projected version of this update rule and prove that if a\nfixed point exists in the binary variable space, the modified updates will\nconverge to it. Last but not least, our algorithm is implemented layer-wise,\nmaking it suitable to train larger networks on resource-limited quantum\nhardware. 
Through extensive evaluations, we show that QP-SBGD outperforms or is\non par with competitive and well-established baselines such as BinaryConnect,\nsignSGD and ProxQuant when optimising the Rosenbrock function, training BNNs as\nwell as binary graph neural networks.\n","authors":["Maximilian Krahn","Michele Sasdelli","Fengyi Yang","Vladislav Golyanik","Juho Kannala","Tat-Jun Chin","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2310.15128v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02261v1","updated":"2024-09-03T19:38:23Z","published":"2024-09-03T19:38:23Z","title":"Action-Based ADHD Diagnosis in Video","summary":" Attention Deficit Hyperactivity Disorder (ADHD) causes significant impairment\nin various domains. Early diagnosis of ADHD and treatment could significantly\nimprove the quality of life and functioning. Recently, machine learning methods\nhave improved the accuracy and efficiency of the ADHD diagnosis process.\nHowever, the cost of the equipment and trained staff required by the existing\nmethods is generally huge. Therefore, we introduce the video-based frame-level\naction recognition network to ADHD diagnosis for the first time. We also record\na real multi-modal ADHD dataset and extract three action classes from the video\nmodality for ADHD diagnosis. The whole process data have been reported to\nCNTW-NHS Foundation Trust, which would be reviewed by medical\nconsultants/professionals and will be made public in due course.\n","authors":["Yichun Li","Yuxing Yang","Syed Nohsen Naqvi"],"pdf_url":"https://arxiv.org/pdf/2409.02261v1.pdf","comment":"31st European Symposium on Artificial Neural Networks"},{"id":"http://arxiv.org/abs/2409.02259v1","updated":"2024-09-03T19:34:25Z","published":"2024-09-03T19:34:25Z","title":"Optimal L-Systems for Stochastic L-system Inference Problems","summary":" This paper presents two novel theorems that address two open problems in\nstochastic Lindenmayer-system (L-system) inference, specifically focusing on\nthe construction of an optimal stochastic L-system capable of generating a\ngiven sequence of strings. The first theorem delineates a method for crafting a\nstochastic L-system that maximizes the likelihood of producing a given sequence\nof words through a singular derivation. Furthermore, the second theorem\ndetermines the stochastic L-systems with the highest probability of producing a\ngiven sequence of words with multiple possible derivations. From these, we\nintroduce an algorithm to infer an optimal stochastic L-system from a given\nsequence. This algorithm incorporates sophisticated optimization techniques,\nsuch as interior point methods, ensuring production of a stochastically optimal\nstochastic L-system suitable for generating the given sequence. This allows for\nthe use of stochastic L-systems as models for machine learning using only\npositive data for training.\n","authors":["Ali Lotfi","Ian McQuillan"],"pdf_url":"https://arxiv.org/pdf/2409.02259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02253v1","updated":"2024-09-03T19:26:13Z","published":"2024-09-03T19:26:13Z","title":"How to Determine the Preferred Image Distribution of a Black-Box\n Vision-Language Model?","summary":" Large foundation models have revolutionized the field, yet challenges remain\nin optimizing multi-modal models for specialized visual tasks. 
We propose a\nnovel, generalizable methodology to identify preferred image distributions for\nblack-box Vision-Language Models (VLMs) by measuring output consistency across\nvaried input prompts. Applying this to different rendering types of 3D objects,\nwe demonstrate its efficacy across various domains requiring precise\ninterpretation of complex structures, with a focus on Computer-Aided Design\n(CAD) as an exemplar field. We further refine VLM outputs using in-context\nlearning with human feedback, significantly enhancing explanation quality. To\naddress the lack of benchmarks in specialized domains, we introduce CAD-VQA, a\nnew dataset for evaluating VLMs on CAD-related visual question answering tasks.\nOur evaluation of state-of-the-art VLMs on CAD-VQA establishes baseline\nperformance levels, providing a framework for advancing VLM capabilities in\ncomplex visual reasoning tasks across various fields requiring expert-level\nvisual interpretation. We release the dataset and evaluation codes at\n\\url{https://github.com/asgsaeid/cad_vqa}.\n","authors":["Saeid Asgari Taghanaki","Joseph Lambourne","Alana Mongkhounsavath"],"pdf_url":"https://arxiv.org/pdf/2409.02253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02251v1","updated":"2024-09-03T19:24:46Z","published":"2024-09-03T19:24:46Z","title":"NoiseAttack: An Evasive Sample-Specific Multi-Targeted Backdoor Attack\n Through White Gaussian Noise","summary":" Backdoor attacks pose a significant threat when using third-party data for\ndeep learning development. In these attacks, data can be manipulated to cause a\ntrained model to behave improperly when a specific trigger pattern is applied,\nproviding the adversary with unauthorized advantages. While most existing works\nfocus on designing both visible and invisible trigger patterns to poison the\nvictim class, they typically result in a single targeted class upon the success\nof the backdoor attack, meaning that the victim class can only be converted to\nanother class based on the adversary's predefined value. In this paper, we\naddress this issue by introducing a novel sample-specific multi-targeted\nbackdoor attack, namely NoiseAttack. Specifically, we adopt White Gaussian\nNoise (WGN) with various Power Spectral Densities (PSD) as our underlying\ntriggers, coupled with a unique training strategy to execute the backdoor\nattack. This work is the first of its kind to launch a vision backdoor attack\nwith the intent to generate multiple targeted classes with minimal input\nconfiguration. Furthermore, our extensive experimental results demonstrate that\nNoiseAttack can achieve a high attack success rate against popular network\narchitectures and datasets, as well as bypass state-of-the-art backdoor\ndetection methods. Our source code and experiments are available at\nhttps://github.com/SiSL-URI/NoiseAttack/tree/main.\n","authors":["Abdullah Arafat Miah","Kaan Icer","Resit Sendag","Yu Bi"],"pdf_url":"https://arxiv.org/pdf/2409.02251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02243v1","updated":"2024-09-03T19:16:36Z","published":"2024-09-03T19:16:36Z","title":"A Novel Audio-Visual Information Fusion System for Mental Disorders\n Detection","summary":" Mental disorders are among the foremost contributors to the global healthcare\nchallenge. Research indicates that timely diagnosis and intervention are vital\nin treating various mental disorders. 
However, the early somatization symptoms\nof certain mental disorders may not be immediately evident, often resulting in\ntheir oversight and misdiagnosis. Additionally, the traditional diagnosis\nmethods incur substantial time and financial costs. Deep learning methods based on fMRI and EEG\nhave improved the efficiency of the mental disorder detection process. However,\nthe cost of the equipment and trained staff is generally huge. Moreover, most\nsystems are only trained for a specific mental disorder and are not\ngeneral-purpose. Recently, physiological studies have shown that there are some\nspeech and facial-related symptoms in a few mental disorders (e.g., depression\nand ADHD). In this paper, we focus on the emotional expression features of\nmental disorders and introduce a multimodal mental disorder diagnosis system\nbased on audio-visual information input. Our proposed system is based on\nspatial-temporal attention networks and innovatively uses a less computationally\nintensive pre-trained audio recognition network to fine-tune the video\nrecognition module for better results. We also apply the unified system to\nmultiple mental disorders (ADHD and depression) for the first time. The\nproposed system achieves over 80\\% accuracy on the real multimodal ADHD dataset\nand achieves state-of-the-art results on the depression dataset AVEC 2014.\n","authors":["Yichun Li","Shuanglin Li","Syed Mohsen Naqvi"],"pdf_url":"https://arxiv.org/pdf/2409.02243v1.pdf","comment":"27th International Conference on Information (FUSION)"},{"id":"http://arxiv.org/abs/2409.02241v1","updated":"2024-09-03T19:14:01Z","published":"2024-09-03T19:14:01Z","title":"What makes a face looks like a hat: Decoupling low-level and high-level\n Visual Properties with Image Triplets","summary":" In visual decision making, high-level features, such as object categories,\nhave a strong influence on choice. However, the impact of low-level features on\nbehavior is less understood partly due to the high correlation between high-\nand low-level features in the stimuli presented (e.g., objects of the same\ncategory are more likely to share low-level features). To disentangle these\neffects, we propose a method that de-correlates low- and high-level visual\nproperties in a novel set of stimuli. Our method uses two Convolutional Neural\nNetworks (CNNs) as candidate models of the ventral visual stream: the CORnet-S\nthat has high neural predictivity in high-level, IT-like responses and the\nVGG-16 that has high neural predictivity in low-level responses. Triplets\n(root, image1, image2) of stimuli are parametrized by the level of low- and\nhigh-level similarity of images extracted from the different layers. These\nstimuli are then used in a decision-making task where participants are tasked\nto choose the most similar-to-the-root image. We found that different networks\nshow differing abilities to predict the effects of low-versus-high-level\nsimilarity: while CORnet-S outperforms VGG-16 in explaining human choices based\non high-level similarity, VGG-16 outperforms CORnet-S in explaining human\nchoices based on low-level similarity. Using Brain-Score, we observed that the\nbehavioral prediction abilities of different layers of these networks\nqualitatively corresponded to their ability to explain neural activity at\ndifferent levels of the visual hierarchy. 
In summary, our algorithm for\nstimulus set generation enables the study of how different representations in\nthe visual stream affect high-level cognitive behaviors.\n","authors":["Maytus Piriyajitakonkij","Sirawaj Itthipuripat","Ian Ballard","Ioannis Pappas"],"pdf_url":"https://arxiv.org/pdf/2409.02241v1.pdf","comment":"Accepted at Workshop on Human-inspired Computer Vision @ ECCV2024"},{"id":"http://arxiv.org/abs/2407.14434v2","updated":"2024-09-03T19:05:17Z","published":"2024-07-19T16:06:11Z","title":"Co-synthesis of Histopathology Nuclei Image-Label Pairs using a\n Context-Conditioned Joint Diffusion Model","summary":" In multi-class histopathology nuclei analysis tasks, the lack of training\ndata becomes a main bottleneck for the performance of learning-based methods.\nTo tackle this challenge, previous methods have utilized generative models to\nincrease data by generating synthetic samples. However, existing methods often\noverlook the importance of considering the context of biological tissues (e.g.,\nshape, spatial layout, and tissue type) in the synthetic data. Moreover, while\ngenerative models have shown superior performance in synthesizing realistic\nhistopathology images, none of the existing methods are capable of producing\nimage-label pairs at the same time. In this paper, we introduce a novel\nframework for co-synthesizing histopathology nuclei images and paired semantic\nlabels using a context-conditioned joint diffusion model. We propose\nconditioning of a diffusion model using nucleus centroid layouts with\nstructure-related text prompts to incorporate spatial and structural context\ninformation into the generation targets. Moreover, we enhance the granularity\nof our synthesized semantic labels by generating instance-wise nuclei labels\nusing distance maps synthesized concurrently in conjunction with the images and\nsemantic labels. We demonstrate the effectiveness of our framework in\ngenerating high-quality samples on multi-institutional, multi-organ, and\nmulti-modality datasets. Our synthetic data consistently outperforms existing\naugmentation methods in the downstream tasks of nuclei segmentation and\nclassification.\n","authors":["Seonghui Min","Hyun-Jic Oh","Won-Ki Jeong"],"pdf_url":"https://arxiv.org/pdf/2407.14434v2.pdf","comment":"ECCV 2024 accepted"},{"id":"http://arxiv.org/abs/2409.02224v1","updated":"2024-09-03T18:53:32Z","published":"2024-09-03T18:53:32Z","title":"EgoPressure: A Dataset for Hand Pressure and Pose Estimation in\n Egocentric Vision","summary":" Estimating touch contact and pressure in egocentric vision is a central task\nfor downstream applications in Augmented Reality, Virtual Reality, as well as\nmany robotic applications, because it provides precise physical insights into\nhand-object interaction and object manipulation. However, existing contact\npressure datasets lack egocentric views and hand poses, which are essential for\naccurate estimation during in-situ operation, both for AR/VR interaction and\nrobotic manipulation. In this paper, we introduce EgoPressure,a novel dataset\nof touch contact and pressure interaction from an egocentric perspective,\ncomplemented with hand pose meshes and fine-grained pressure intensities for\neach contact. The hand poses in our dataset are optimized using our proposed\nmulti-view sequence-based method that processes footage from our capture rig of\n8 accurately calibrated RGBD cameras. 
EgoPressure comprises 5.0 hours of touch\ncontact and pressure interaction from 21 participants captured by a moving\negocentric camera and 7 stationary Kinect cameras, which provided RGB images\nand depth maps at 30 Hz. In addition, we provide baselines for estimating\npressure with different modalities, which will enable future developments and\nbenchmarking on the dataset. Overall, we demonstrate that pressure and hand\nposes are complementary, which supports our intention to better facilitate the\nphysical understanding of hand-object interactions in AR/VR and robotics\nresearch.\n","authors":["Yiming Zhao","Taein Kwon","Paul Streli","Marc Pollefeys","Christian Holz"],"pdf_url":"https://arxiv.org/pdf/2409.02224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02865v1","updated":"2024-09-03T17:59:50Z","published":"2024-09-03T17:59:50Z","title":"Visually Grounded Speech Models for Low-resource Languages and Cognitive\n Modelling","summary":" This dissertation examines visually grounded speech (VGS) models that learn\nfrom unlabelled speech paired with images. It focuses on applications for\nlow-resource languages and understanding human language acquisition. We\nintroduce a task called visually prompted keyword localisation to detect and\nlocalise keywords in speech using images. We demonstrate the effectiveness of\nVGS models in few-shot learning scenarios for low-resource languages like\nYoruba. Additionally, we examine the mutual exclusivity bias in VGS models. Our\nmonolingual VGS model exhibits this bias, but we found that multilingualism\ndoes not affect the bias in this VGS model similarly to what is observed in\nchildren.\n","authors":["Leanne Nortje"],"pdf_url":"https://arxiv.org/pdf/2409.02865v1.pdf","comment":"PhD Dissertation"},{"id":"http://arxiv.org/abs/2409.02108v1","updated":"2024-09-03T17:59:05Z","published":"2024-09-03T17:59:05Z","title":"Unveiling Deep Shadows: A Survey on Image and Video Shadow Detection,\n Removal, and Generation in the Era of Deep Learning","summary":" Shadows are formed when light encounters obstacles, leading to areas of\ndiminished illumination. In computer vision, shadow detection, removal, and\ngeneration are crucial for enhancing scene understanding, refining image\nquality, ensuring visual consistency in video editing, and improving virtual\nenvironments. This paper presents a comprehensive survey of shadow detection,\nremoval, and generation in images and videos within the deep learning landscape\nover the past decade, covering tasks, deep models, datasets, and evaluation\nmetrics. 
Our key contributions include a comprehensive survey of shadow\nanalysis, standardization of experimental comparisons, exploration of the\nrelationships among model size, speed, and performance, a cross-dataset\ngeneralization study, identification of open issues and future directions, and\nprovision of publicly available resources to support further research.\n","authors":["Xiaowei Hu","Zhenghao Xing","Tianyu Wang","Chi-Wing Fu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2409.02108v1.pdf","comment":"Publicly available results, trained models, and evaluation metrics at\n https://github.com/xw-hu/Unveiling-Deep-Shadows"},{"id":"http://arxiv.org/abs/2409.02104v1","updated":"2024-09-03T17:58:03Z","published":"2024-09-03T17:58:03Z","title":"DynOMo: Online Point Tracking by Dynamic Online Monocular Gaussian\n Reconstruction","summary":" Reconstructing scenes and tracking motion are two sides of the same coin.\nTracking points allow for geometric reconstruction [14], while geometric\nreconstruction of (dynamic) scenes allows for 3D tracking of points over time\n[24, 39]. The latter was recently also exploited for 2D point tracking to\novercome occlusion ambiguities by lifting tracking directly into 3D [38].\nHowever, above approaches either require offline processing or multi-view\ncamera setups both unrealistic for real-world applications like robot\nnavigation or mixed reality. We target the challenge of online 2D and 3D point\ntracking from unposed monocular camera input introducing Dynamic Online\nMonocular Reconstruction (DynOMo). We leverage 3D Gaussian splatting to\nreconstruct dynamic scenes in an online fashion. Our approach extends 3D\nGaussians to capture new content and object motions while estimating camera\nmovements from a single RGB frame. DynOMo stands out by enabling emergence of\npoint trajectories through robust image feature reconstruction and a novel\nsimilarity-enhanced regularization term, without requiring any\ncorrespondence-level supervision. It sets the first baseline for online point\ntracking with monocular unposed cameras, achieving performance on par with\nexisting methods. We aim to inspire the community to advance online point\ntracking and reconstruction, expanding the applicability to diverse real-world\nscenarios.\n","authors":["Jenny Seidenschwarz","Qunjie Zhou","Bardienus Duisterhof","Deva Ramanan","Laura Leal-Taixé"],"pdf_url":"https://arxiv.org/pdf/2409.02104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02101v1","updated":"2024-09-03T17:56:51Z","published":"2024-09-03T17:56:51Z","title":"Towards Real-World Adverse Weather Image Restoration: Enhancing\n Clearness and Semantics with Vision-Language Models","summary":" This paper addresses the limitations of adverse weather image restoration\napproaches trained on synthetic data when applied to real-world scenarios. We\nformulate a semi-supervised learning framework employing vision-language models\nto enhance restoration performance across diverse adverse weather conditions in\nreal-world settings. Our approach involves assessing image clearness and\nproviding semantics using vision-language models on real data, serving as\nsupervision signals for training restoration models. For clearness enhancement,\nwe use real-world data, utilizing a dual-step strategy with pseudo-labels\nassessed by vision-language models and weather prompt learning. 
For semantic\nenhancement, we integrate real-world data by adjusting weather conditions in\nvision-language model descriptions while preserving semantic meaning.\nAdditionally, we introduce an effective training strategy to bootstrap\nrestoration performance. Our approach achieves superior results in real-world\nadverse weather image restoration, demonstrated through qualitative and\nquantitative comparisons with state-of-the-art works.\n","authors":["Jiaqi Xu","Mengyang Wu","Xiaowei Hu","Chi-Wing Fu","Qi Dou","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2409.02101v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2409.02097v1","updated":"2024-09-03T17:54:39Z","published":"2024-09-03T17:54:39Z","title":"LinFusion: 1 GPU, 1 Minute, 16K Image","summary":" Modern diffusion models, particularly those utilizing a Transformer-based\nUNet for denoising, rely heavily on self-attention operations to manage complex\nspatial relationships, thus achieving impressive generation performance.\nHowever, this existing paradigm faces significant challenges in generating\nhigh-resolution visual content due to its quadratic time and memory complexity\nwith respect to the number of spatial tokens. To address this limitation, we\naim at a novel linear attention mechanism as an alternative in this paper.\nSpecifically, we begin our exploration from recently introduced models with\nlinear complexity, e.g., Mamba, Mamba2, and Gated Linear Attention, and\nidentify two key features-attention normalization and non-causal inference-that\nenhance high-resolution visual generation performance. Building on these\ninsights, we introduce a generalized linear attention paradigm, which serves as\na low-rank approximation of a wide spectrum of popular linear token mixers. To\nsave the training cost and better leverage pre-trained models, we initialize\nour models and distill the knowledge from pre-trained StableDiffusion (SD). We\nfind that the distilled model, termed LinFusion, achieves performance on par\nwith or superior to the original SD after only modest training, while\nsignificantly reducing time and memory complexity. Extensive experiments on\nSD-v1.5, SD-v2.1, and SD-XL demonstrate that LinFusion delivers satisfactory\nzero-shot cross-resolution generation performance, generating high-resolution\nimages like 16K resolution. Moreover, it is highly compatible with pre-trained\nSD components, such as ControlNet and IP-Adapter, requiring no adaptation\nefforts. Codes are available at https://github.com/Huage001/LinFusion.\n","authors":["Songhua Liu","Weihao Yu","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02097v1.pdf","comment":"Work in Progress. Codes are available at\n https://github.com/Huage001/LinFusion"},{"id":"http://arxiv.org/abs/2409.02095v1","updated":"2024-09-03T17:52:03Z","published":"2024-09-03T17:52:03Z","title":"DepthCrafter: Generating Consistent Long Depth Sequences for Open-world\n Videos","summary":" Despite significant advancements in monocular depth estimation for static\nimages, estimating video depth in the open world remains challenging, since\nopen-world videos are extremely diverse in content, motion, camera movement,\nand length. We present DepthCrafter, an innovative method for generating\ntemporally consistent long depth sequences with intricate details for\nopen-world videos, without requiring any supplementary information such as\ncamera poses or optical flow. 
DepthCrafter achieves generalization ability to\nopen-world videos by training a video-to-depth model from a pre-trained\nimage-to-video diffusion model, through our meticulously designed three-stage\ntraining strategy with the compiled paired video-depth datasets. Our training\napproach enables the model to generate depth sequences with variable lengths at\none time, up to 110 frames, and harvest both precise depth details and rich\ncontent diversity from realistic and synthetic datasets. We also propose an\ninference strategy that processes extremely long videos through segment-wise\nestimation and seamless stitching. Comprehensive evaluations on multiple\ndatasets reveal that DepthCrafter achieves state-of-the-art performance in\nopen-world video depth estimation under zero-shot settings. Furthermore,\nDepthCrafter facilitates various downstream applications, including depth-based\nvisual effects and conditional video generation.\n","authors":["Wenbo Hu","Xiangjun Gao","Xiaoyu Li","Sijie Zhao","Xiaodong Cun","Yong Zhang","Long Quan","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2409.02095v1.pdf","comment":"Project webpage: https://depthcrafter.github.io"},{"id":"http://arxiv.org/abs/2409.02084v1","updated":"2024-09-03T17:35:48Z","published":"2024-09-03T17:35:48Z","title":"GraspSplats: Efficient Manipulation with 3D Feature Splatting","summary":" The ability for robots to perform efficient and zero-shot grasping of object\nparts is crucial for practical applications and is becoming prevalent with\nrecent advances in Vision-Language Models (VLMs). To bridge the 2D-to-3D gap\nfor representations to support such a capability, existing methods rely on\nneural fields (NeRFs) via differentiable rendering or point-based projection\nmethods. However, we demonstrate that NeRFs are inappropriate for scene changes\ndue to their implicitness and point-based methods are inaccurate for part\nlocalization without rendering-based optimization. To amend these issues, we\npropose GraspSplats. Using depth supervision and a novel reference feature\ncomputation method, GraspSplats generates high-quality scene representations in\nunder 60 seconds. We further validate the advantages of Gaussian-based\nrepresentation by showing that the explicit and optimized geometry in\nGraspSplats is sufficient to natively support (1) real-time grasp sampling and\n(2) dynamic and articulated object manipulation with point trackers. With\nextensive experiments on a Franka robot, we demonstrate that GraspSplats\nsignificantly outperforms existing methods under diverse task settings. In\nparticular, GraspSplats outperforms NeRF-based methods like F3RM and LERF-TOGO,\nand 2D detection methods.\n","authors":["Mazeyu Ji","Ri-Zhao Qiu","Xueyan Zou","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02084v1.pdf","comment":"Project webpage: https://graspsplats.github.io/"},{"id":"http://arxiv.org/abs/2409.02081v1","updated":"2024-09-03T17:32:35Z","published":"2024-09-03T17:32:35Z","title":"Physical Rule-Guided Convolutional Neural Network","summary":" The black-box nature of Convolutional Neural Networks (CNNs) and their\nreliance on large datasets limit their use in complex domains with limited\nlabeled data. Physics-Guided Neural Networks (PGNNs) have emerged to address\nthese limitations by integrating scientific principles and real-world\nknowledge, enhancing model interpretability and efficiency. 
This paper proposes\na novel Physics-Guided CNN (PGCNN) architecture that incorporates dynamic,\ntrainable, and automated LLM-generated, widely recognized rules integrated into\nthe model as custom layers to address challenges like limited data and low\nconfidence scores. The PGCNN is evaluated on multiple datasets, demonstrating\nsuperior performance compared to a baseline CNN model. Key improvements include\na significant reduction in false positives and enhanced confidence scores for\ntrue detection. The results highlight the potential of PGCNNs to improve CNN\nperformance for broader application areas.\n","authors":["Kishor Datta Gupta","Marufa Kamal","Rakib Hossain Rifat","Mohd Ariful Haque","Roy George"],"pdf_url":"https://arxiv.org/pdf/2409.02081v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2408.17344v2","updated":"2024-09-03T10:50:17Z","published":"2024-08-30T15:16:52Z","title":"rerankers: A Lightweight Python Library to Unify Ranking Methods","summary":" This paper presents rerankers, a Python library which provides an easy-to-use\ninterface to the most commonly used re-ranking approaches. Re-ranking is an\nintegral component of many retrieval pipelines; however, there exist numerous\napproaches to it, relying on different implementation methods. rerankers\nunifies these methods into a single user-friendly interface, allowing\npractitioners and researchers alike to explore different methods while only\nchanging a single line of Python code. Moreover ,rerankers ensures that its\nimplementations are done with the fewest dependencies possible, and re-uses the\noriginal implementation whenever possible, guaranteeing that our simplified\ninterface results in no performance degradation compared to more complex ones.\nThe full source code and list of supported models are updated regularly and\navailable at https://github.com/answerdotai/rerankers.\n","authors":["Benjamin Clavié"],"pdf_url":"https://arxiv.org/pdf/2408.17344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06242v2","updated":"2024-09-03T00:21:23Z","published":"2024-05-10T04:44:34Z","title":"Impedance vs. Power Side-channel Vulnerabilities: A Comparative Study","summary":" In recent times, impedance side-channel analysis has emerged as a potent\nstrategy for adversaries seeking to extract sensitive information from\ncomputing systems. It leverages variations in the intrinsic impedance of a\nchip's internal structure across different logic states. In this study, we\nconduct a comparative analysis between the newly explored impedance side\nchannel and the well-established power side channel. Through experimental\nevaluation, we investigate the efficacy of these two side channels in\nextracting the cryptographic key from the Advanced Encryption Standard (AES)\nand analyze their performance. Our results indicate that impedance analysis\ndemonstrates a higher potential for cryptographic key extraction compared to\npower side-channel analysis. Moreover, we identify scenarios where power\nside-channel analysis does not yield satisfactory results, whereas impedance\nanalysis proves to be more robust and effective. 
This work not only underscores\nthe significance of impedance side-channel analysis in enhancing cryptographic\nsecurity but also emphasizes the necessity for a deeper understanding of its\nmechanisms and implications.\n","authors":["Md Sadik Awal","Buddhipriya Gayanath","Md Tauhidur Rahman"],"pdf_url":"https://arxiv.org/pdf/2405.06242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01736v1","updated":"2024-09-03T09:25:26Z","published":"2024-09-03T09:25:26Z","title":"SpannerLib: Embedding Declarative Information Extraction in an\n Imperative Workflow","summary":" Document spanners have been proposed as a formal framework for declarative\nInformation Extraction (IE) from text, following IE products from the industry\nand academia. Over the past decade, the framework has been studied thoroughly\nin terms of expressive power, complexity, and the ability to naturally combine\ntext analysis with relational querying. This demonstration presents SpannerLib,\na library for embedding document spanners in Python code. SpannerLib\nfacilitates the development of IE programs by providing an implementation of\nSpannerlog (Datalog-based document spanners) that interacts with the Python code\nin two directions: rules can be embedded inside Python, and they can invoke\ncustom Python code (e.g., calls to ML-based NLP models) via user-defined\nfunctions. The demonstration scenarios showcase IE programs, with increasing\nlevels of complexity, within Jupyter Notebook.\n","authors":["Dean Light","Ahmad Aiashy","Mahmoud Diab","Daniel Nachmias","Stijn Vansummeren","Benny Kimelfeld"],"pdf_url":"https://arxiv.org/pdf/2409.01736v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2409.01605v1","updated":"2024-09-03T04:55:03Z","published":"2024-09-03T04:55:03Z","title":"Laser: Parameter-Efficient LLM Bi-Tuning for Sequential Recommendation\n with Collaborative Information","summary":" Sequential recommender systems are essential for discerning user preferences\nfrom historical interactions and facilitating targeted recommendations. Recent\ninnovations employing Large Language Models (LLMs) have advanced the field by\nencoding item semantics, yet they often necessitate substantial parameter\ntuning and are resource-demanding. Moreover, these works fail to consider the\ndiverse characteristics of different types of users and thus diminish the\nrecommendation accuracy. In this paper, we propose a parameter-efficient Large\nLanguage Model Bi-Tuning framework for sequential recommendation with\ncollaborative information (Laser). Specifically, Bi-Tuning works by inserting\ntrainable virtual tokens at both the prefix and suffix of the input sequence\nand freezing the LLM parameters, thus optimizing the LLM for the sequential\nrecommendation. In our Laser, the prefix is utilized to incorporate user-item\ncollaborative information and adapt the LLM to the recommendation task, while\nthe suffix converts the output embeddings of the LLM from the language space to\nthe recommendation space for the follow-up item recommendation. Furthermore, to\ncapture the characteristics of different types of users when integrating the\ncollaborative information via the prefix, we introduce M-Former, a lightweight\nMoE-based querying transformer that uses a set of query experts to integrate\ndiverse user-specific collaborative information encoded by frozen ID-based\nsequential recommender systems, significantly improving the accuracy of\nrecommendations. 
Extensive experiments on real-world datasets demonstrate that\nLaser can parameter-efficiently adapt LLMs to effective recommender systems,\nsignificantly outperforming state-of-the-art methods.\n","authors":["Xinyu Zhang","Linmei Hu","Luhao Zhang","Dandan Song","Heyan Huang","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2409.01605v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.01563v1","updated":"2024-09-03T03:00:59Z","published":"2024-09-03T03:00:59Z","title":"Blockchain-based Federated Recommendation with Incentive Mechanism","summary":" Nowadays, federated recommendation technology is rapidly evolving to help\nmultiple organisations share data and train models while meeting user privacy,\ndata security and government regulatory requirements. However, federated\nrecommendation increases customer system costs such as power, computational and\ncommunication resources. Besides, federated recommendation systems are also\nsusceptible to model attacks and data poisoning by participating malicious\nclients. Therefore, most customers are unwilling to participate in federated\nrecommendation without any incentive. To address these problems, we propose a\nblockchain-based federated recommendation system with an incentive mechanism to\npromote more trustworthy, secure, and efficient federated recommendation\nservice. First, we construct a federated recommendation system based on NeuMF\nand FedAvg. Then we introduce a reverse auction mechanism to select optimal\nclients that can maximize the social surplus. Finally, we employ blockchain for\non-chain evidence storage of models to ensure the safety of the federated\nrecommendation system. The experimental results show that our proposed\nincentive mechanism can attract clients with superior training data to engage\nin federated recommendation at a lower cost, which can increase the economic\nbenefit of federated recommendation by 54.9\\% while improving the recommendation\nperformance. Thus our work provides theoretical and technological support for\nthe construction of a harmonious and healthy ecological environment for the\napplication of federated recommendation.\n","authors":["Jianhai Chen","Yanlin Wu","Dazhong Rong","Guoyao Yu","Lingqi Jiang","Zhenguang Liu","Peng Zhou","Rui Shen"],"pdf_url":"https://arxiv.org/pdf/2409.01563v1.pdf","comment":"This paper has been accepted at the 2024 Blockchain and Web3 Technology\n Innovation and Application Exchange Conference (BWTAC 2024)"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.08803v2","updated":"2024-09-03T16:59:07Z","published":"2024-07-11T18:23:46Z","title":"PID Accelerated Temporal Difference Algorithms","summary":" Long-horizon tasks, which have a large discount factor, pose a challenge for\nmost conventional reinforcement learning (RL) algorithms. Algorithms such as\nValue Iteration and Temporal Difference (TD) learning have a slow convergence\nrate and become inefficient in these tasks. When the transition distributions\nare given, PID VI was recently introduced to accelerate the convergence of\nValue Iteration using ideas from control theory. Inspired by this, we introduce\nPID TD Learning and PID Q-Learning algorithms for the RL setting, in which only\nsamples from the environment are available. We give a theoretical analysis of\nthe convergence of PID TD Learning and its acceleration compared to the\nconventional TD Learning. 
We also introduce a method for adapting PID gains in\nthe presence of noise and empirically verify its effectiveness.\n","authors":["Mark Bedaywi","Amin Rakhsha","Amir-massoud Farahmand"],"pdf_url":"https://arxiv.org/pdf/2407.08803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09075v2","updated":"2024-09-03T16:47:09Z","published":"2024-08-17T02:26:29Z","title":"Improving Rare Word Translation With Dictionaries and Attention Masking","summary":" In machine translation, rare words continue to be a problem for the dominant\nencoder-decoder architecture, especially in low-resource and out-of-domain\ntranslation settings. Human translators solve this problem with monolingual or\nbilingual dictionaries. In this paper, we propose appending definitions from a\nbilingual dictionary to source sentences and using attention masking to link\ntogether rare words with their definitions. We find that including definitions\nfor rare words improves performance by up to 1.0 BLEU and 1.6 MacroF1.\n","authors":["Kenneth J. Sible","David Chiang"],"pdf_url":"https://arxiv.org/pdf/2408.09075v2.pdf","comment":"11 pages, 3 figures, 3 tables. Accepted at AMTA 2024"},{"id":"http://arxiv.org/abs/2406.06385v3","updated":"2024-09-03T16:36:06Z","published":"2024-06-10T15:44:22Z","title":"Low-Rank Quantization-Aware Training for LLMs","summary":" Large language models (LLMs) are omnipresent, however their practical\ndeployment is challenging due to their ever increasing computational and memory\ndemands. Quantization is one of the most effective ways to make them more\ncompute and memory efficient. Quantization-aware training (QAT) methods,\ngenerally produce the best quantized performance, however it comes at the cost\nof potentially long training time and excessive memory usage, making it\nimpractical when applying for LLMs. Inspired by parameter-efficient fine-tuning\n(PEFT) and low-rank adaptation (LoRA) literature, we propose LR-QAT -- a\nlightweight and memory-efficient QAT algorithm for LLMs. LR-QAT employs several\ncomponents to save memory without sacrificing predictive performance: (a)\nlow-rank auxiliary weights that are aware of the quantization grid; (b) a\ndowncasting operator using fixed-point or double-packed integers and (c)\ncheckpointing. Unlike most related work, our method (i) is inference-efficient,\nleading to no additional overhead compared to traditional PTQ; (ii) can be seen\nas a general extended pretraining framework, meaning that the resulting model\ncan still be utilized for any downstream task afterwards; (iii) can be applied\nacross a wide range of quantization settings, such as different choices\nquantization granularity, activation quantization, and seamlessly combined with\nmany PTQ techniques. We apply LR-QAT to LLaMA-1/2/3 and Mistral model families\nand validate its effectiveness on several downstream tasks. Our method\noutperforms common post-training quantization (PTQ) approaches and reaches the\nsame model performance as full-model QAT at the fraction of its memory usage.\nSpecifically, we can train a 7B LLM on a single consumer grade GPU with 24GB of\nmemory. 
Our source code is available at\nhttps://github.com/qualcomm-ai-research/LR-QAT\n","authors":["Yelysei Bondarenko","Riccardo Del Chiaro","Markus Nagel"],"pdf_url":"https://arxiv.org/pdf/2406.06385v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15126v3","updated":"2024-09-03T15:30:27Z","published":"2024-08-27T15:07:27Z","title":"Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of\n Peptides","summary":" Molecular Dynamics (MD) simulations are irreplaceable and ubiquitous in\nfields of materials science, chemistry, pharmacology just to name a few.\nConventional MD simulations are plagued by numerical stability as well as long\nequilibration time issues, which limits broader applications of MD simulations.\nRecently, a surge of deep learning approaches have been devised for\ntime-coarsened dynamics, which learns the state transition mechanism over much\nlarger time scales to overcome these limitations. However, only a few methods\ntarget the underlying Boltzmann distribution by resampling techniques, where\nproposals are rarely accepted as new states with low efficiency. In this work,\nwe propose a force-guided bridge matching model, FBM, a novel framework that\nfirst incorporates physical priors into bridge matching for full-atom\ntime-coarsened dynamics. With the guidance of our well-designed intermediate\nforce field, FBM is feasible to target the Boltzmann-like distribution by\ndirect inference without extra steps. Experiments on small peptides verify our\nsuperiority in terms of comprehensive metrics and demonstrate transferability\nto unseen peptide systems.\n","authors":["Ziyang Yu","Wenbing Huang","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15126v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13713v3","updated":"2024-09-03T15:28:37Z","published":"2024-08-25T03:26:00Z","title":"Verifiable cloud-based variational quantum algorithms","summary":" Variational quantum algorithms (VQAs) have shown potential for quantum\nadvantage with noisy intermediate-scale quantum (NISQ) devices for quantum\nmachine learning (QML). However, given the high cost and limited availability\nof quantum resources, delegating VQAs via cloud networks is a more practical\nsolution for clients with limited quantum capabilities. Recently, Shingu et\nal.[Physical Review A, 105, 022603 (2022)] proposed a variational secure cloud\nquantum computing protocol, utilizing ancilla-driven quantum computation (ADQC)\nfor cloud-based VQAs with minimal quantum resource consumption. However, their\nprotocol lacks verifiability, which exposes it to potential malicious behaviors\nby the server. Additionally, channel loss requires frequent re-delegation as\nthe size of the delegated variational circuit grows, complicating verification\ndue to increased circuit complexity. This paper introduces a new protocol to\naddress these challenges and enhance both verifiability and tolerance to\nchannel loss in cloud-based VQAs.\n","authors":["Junhong Yang","Banghai Wang","Junyu Quan","Qin Li"],"pdf_url":"https://arxiv.org/pdf/2408.13713v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06425v6","updated":"2024-09-03T15:07:13Z","published":"2024-08-12T18:04:59Z","title":"Bayesian Learning in a Nonlinear Multiscale State-Space Model","summary":" The ubiquity of multiscale interactions in complex systems is\nwell-recognized, with development and heredity serving as a prime example of\nhow processes at different temporal scales influence one another. 
This work\nintroduces a novel multiscale state-space model to explore the dynamic\ninterplay between systems interacting across different time scales, with\nfeedback between each scale. We propose a Bayesian learning framework to\nestimate unknown states by learning the unknown process noise covariances\nwithin this multiscale model. We develop a Particle Gibbs with Ancestor\nSampling (PGAS) algorithm for inference and demonstrate through simulations the\nefficacy of our approach.\n","authors":["Nayely Vélez-Cruz","Manfred D. Laubichler"],"pdf_url":"https://arxiv.org/pdf/2408.06425v6.pdf","comment":"Corrected a typo"},{"id":"http://arxiv.org/abs/2408.14340v3","updated":"2024-09-03T14:53:34Z","published":"2024-08-26T15:13:14Z","title":"Foundation Models for Music: A Survey","summary":" In recent years, foundation models (FMs) such as large language models (LLMs)\nand latent diffusion models (LDMs) have profoundly impacted diverse sectors,\nincluding music. This comprehensive review examines state-of-the-art (SOTA)\npre-trained models and foundation models in music, spanning from representation\nlearning, generative learning and multimodal learning. We first contextualise\nthe significance of music in various industries and trace the evolution of AI\nin music. By delineating the modalities targeted by foundation models, we\ndiscover many of the music representations are underexplored in FM development.\nThen, emphasis is placed on the lack of versatility of previous methods on\ndiverse music applications, along with the potential of FMs in music\nunderstanding, generation and medical application. By comprehensively exploring\nthe details of the model pre-training paradigm, architectural choices,\ntokenisation, finetuning methodologies and controllability, we emphasise the\nimportant topics that should have been well explored, like instruction tuning\nand in-context learning, scaling law and emergent ability, as well as\nlong-sequence modelling etc. A dedicated section presents insights into music\nagents, accompanied by a thorough analysis of datasets and evaluations\nessential for pre-training and downstream tasks. Finally, by underscoring the\nvital importance of ethical considerations, we advocate that following research\non FM for music should focus more on such issues as interpretability,\ntransparency, human responsibility, and copyright issues. The paper offers\ninsights into future challenges and trends on FMs for music, aiming to shape\nthe trajectory of human-AI collaboration in the music realm.\n","authors":["Yinghao Ma","Anders Øland","Anton Ragni","Bleiz MacSen Del Sette","Charalampos Saitis","Chris Donahue","Chenghua Lin","Christos Plachouras","Emmanouil Benetos","Elona Shatri","Fabio Morreale","Ge Zhang","György Fazekas","Gus Xia","Huan Zhang","Ilaria Manco","Jiawen Huang","Julien Guinot","Liwei Lin","Luca Marinelli","Max W. Y. Lam","Megha Sharma","Qiuqiang Kong","Roger B. 
Dannenberg","Ruibin Yuan","Shangda Wu","Shih-Lun Wu","Shuqi Dai","Shun Lei","Shiyin Kang","Simon Dixon","Wenhu Chen","Wenhao Huang","Xingjian Du","Xingwei Qu","Xu Tan","Yizhi Li","Zeyue Tian","Zhiyong Wu","Zhizheng Wu","Ziyang Ma","Ziyu Wang"],"pdf_url":"https://arxiv.org/pdf/2408.14340v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16945v2","updated":"2024-09-03T14:20:02Z","published":"2024-08-29T23:51:51Z","title":"Different Victims, Same Layout: Email Visual Similarity Detection for\n Enhanced Email Protection","summary":" In the pursuit of an effective spam detection system, the focus has often\nbeen on identifying known spam patterns either through rule-based detection\nsystems or machine learning (ML) solutions that rely on keywords. However, both\nsystems are susceptible to evasion techniques and zero-day attacks that can be\nachieved at low cost. Therefore, an email that bypassed the defense system once\ncan do it again in the following days, even though rules are updated or the ML\nmodels are retrained. The recurrence of failures to detect emails that exhibit\nlayout similarities to previously undetected spam is concerning for customers\nand can erode their trust in a company. Our observations show that threat\nactors reuse email kits extensively and can bypass detection with little\neffort, for example, by making changes to the content of emails. In this work,\nwe propose an email visual similarity detection approach, named Pisco, to\nimprove the detection capabilities of an email threat defense system. We apply\nour proof of concept to some real-world samples received from different\nsources. Our results show that email kits are being reused extensively and\nvisually similar emails are sent to our customers at various time intervals.\nTherefore, this method could be very helpful in situations where detection\nfeatures that rely on textual features and keywords are bypassed, an occurrence\nour observations show happens frequently.\n","authors":["Sachin Shukla","Omid Mirzaei"],"pdf_url":"https://arxiv.org/pdf/2408.16945v2.pdf","comment":"To be published in the proceedings of the ACM Conference on Computer\n and Communications Security (ACM CCS 2024)"},{"id":"http://arxiv.org/abs/2402.13108v2","updated":"2024-09-03T14:09:08Z","published":"2024-02-20T16:01:42Z","title":"On the Convergence of Gradient Descent for Large Learning Rates","summary":" A vast literature on convergence guarantees for gradient descent and derived\nmethods exists at the moment. However, a simple practical situation remains\nunexplored: when a fixed step size is used, can we expect gradient descent to\nconverge starting from any initialization? We provide fundamental impossibility\nresults showing that convergence becomes impossible no matter the\ninitialization if the step size gets too big. Looking at the asymptotic value\nof the gradient norm along the optimization trajectory, we see that there is a\nphase transition as the step size crosses a critical value. This has been\nobserved by practitioners, yet the true mechanisms through which this happens\nremain unclear beyond heuristics. Using results from dynamical systems theory,\nwe provide a proof of this in the case of linear neural networks with a squared\nloss. We also prove the impossibility of convergence for more general losses\nwithout requiring strong assumptions such as Lipschitz continuity for the\ngradient. 
We validate our findings through experiments with non-linear\nnetworks.\n","authors":["Alexandru Crăciun","Debarghya Ghoshdastidar"],"pdf_url":"https://arxiv.org/pdf/2402.13108v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17147v4","updated":"2024-09-03T12:55:47Z","published":"2024-04-26T04:34:45Z","title":"On the Federated Learning Framework for Cooperative Perception","summary":" Cooperative perception is essential to enhance the efficiency and safety of\nfuture transportation systems, requiring extensive data sharing among vehicles\non the road, which raises significant privacy concerns. Federated learning\noffers a promising solution by enabling data privacy-preserving collaborative\nenhancements in perception, decision-making, and planning among connected and\nautonomous vehicles (CAVs). However, federated learning is impeded by\nsignificant challenges arising from data heterogeneity across diverse clients,\npotentially diminishing model accuracy and prolonging convergence periods. This\nstudy introduces a specialized federated learning framework for CP, termed the\nfederated dynamic weighted aggregation (FedDWA) algorithm, facilitated by\ndynamic adjusting loss (DALoss) function. This framework employs dynamic client\nweighting to direct model convergence and integrates a novel loss function that\nutilizes Kullback-Leibler divergence (KLD) to counteract the detrimental\neffects of non-independently and identically distributed (Non-IID) and\nunbalanced data. Utilizing the BEV transformer as the primary model, our\nrigorous testing on the OpenV2V dataset, augmented with FedBEVT data,\ndemonstrates significant improvements in the average intersection over union\n(IoU). These results highlight the substantial potential of our federated\nlearning framework to address data heterogeneity challenges in CP, thereby\nenhancing the accuracy of environmental perception models and facilitating more\nrobust and efficient collaborative learning solutions in the transportation\nsector.\n","authors":["Zhenrong Zhang","Jianan Liu","Xi Zhou","Tao Huang","Qing-Long Han","Jingxin Liu","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2404.17147v4.pdf","comment":"accepted by IEEE RA-L"},{"id":"http://arxiv.org/abs/2401.12843v2","updated":"2024-09-03T12:49:16Z","published":"2024-01-23T15:25:21Z","title":"An embedding-based distance for temporal graphs","summary":" Temporal graphs are commonly used to represent time-resolved relations\nbetween entities in many natural and artificial systems. Many techniques were\ndevised to investigate the evolution of temporal graphs by comparing their\nstate at different time points. However, quantifying the similarity between\ntemporal graphs as a whole is an open problem. Here, we use embeddings based on\ntime-respecting random walks to introduce a new notion of distance between\ntemporal graphs. This distance is well-defined for pairs of temporal graphs\nwith different numbers of nodes and different time spans. We study the case of\na matched pair of graphs, when a known relation exists between their nodes, and\nthe case of unmatched graphs, when such a relation is unavailable and the\ngraphs may be of different sizes. We use empirical and synthetic temporal\nnetwork data to show that the distance we introduce discriminates graphs with\ndifferent topological and temporal properties. 
We provide an efficient\nimplementation of the distance computation suitable for large-scale temporal\ngraphs.\n","authors":["Lorenzo Dall'Amico","Alain Barrat","Ciro Cattuto"],"pdf_url":"https://arxiv.org/pdf/2401.12843v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10800v2","updated":"2024-09-03T12:43:37Z","published":"2024-05-17T14:10:34Z","title":"Heterogeneity-Informed Meta-Parameter Learning for Spatiotemporal Time\n Series Forecasting","summary":" Spatiotemporal time series forecasting plays a key role in a wide range of\nreal-world applications. While significant progress has been made in this area,\nfully capturing and leveraging spatiotemporal heterogeneity remains a\nfundamental challenge. Therefore, we propose a novel Heterogeneity-Informed\nMeta-Parameter Learning scheme. Specifically, our approach implicitly captures\nspatiotemporal heterogeneity through learning spatial and temporal embeddings,\nwhich can be viewed as a clustering process. Then, a novel spatiotemporal\nmeta-parameter learning paradigm is proposed to learn spatiotemporal-specific\nparameters from meta-parameter pools, which is informed by the captured\nheterogeneity. Based on these ideas, we develop a Heterogeneity-Informed\nSpatiotemporal Meta-Network (HimNet) for spatiotemporal time series\nforecasting. Extensive experiments on five widely-used benchmarks demonstrate\nour method achieves state-of-the-art performance while exhibiting superior\ninterpretability. Our code is available at\nhttps://github.com/XDZhelheim/HimNet.\n","authors":["Zheng Dong","Renhe Jiang","Haotian Gao","Hangchen Liu","Jinliang Deng","Qingsong Wen","Xuan Song"],"pdf_url":"https://arxiv.org/pdf/2405.10800v2.pdf","comment":"Published in KDD'24 Research Track"},{"id":"http://arxiv.org/abs/2406.14281v4","updated":"2024-09-03T12:38:22Z","published":"2024-06-20T13:07:06Z","title":"FairX: A comprehensive benchmarking tool for model analysis using\n fairness, utility, and explainability","summary":" We present FairX, an open-source Python-based benchmarking tool designed for\nthe comprehensive analysis of models under the umbrella of fairness, utility,\nand eXplainability (XAI). FairX enables users to train benchmarking\nbias-mitigation models and evaluate their fairness using a wide array of\nfairness metrics, data utility metrics, and generate explanations for model\npredictions, all within a unified framework. Existing benchmarking tools do not\nprovide a way to evaluate synthetic data generated from fair generative models,\nnor do they support training fair generative models.\nIn FairX, we add fair generative models to the collection of our fair-model\nlibrary (pre-processing, in-processing, post-processing) and evaluation metrics\nfor evaluating the quality of synthetic fair data. This version of FairX\nsupports both tabular and image datasets. It also allows users to provide their\nown custom datasets. The open-source FairX benchmarking package is publicly\navailable at \\url{https://github.com/fahim-sikder/FairX}.\n","authors":["Md Fahim Sikder","Resmi Ramachandranpillai","Daniel de Leng","Fredrik Heintz"],"pdf_url":"https://arxiv.org/pdf/2406.14281v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09360v2","updated":"2024-09-03T12:31:51Z","published":"2024-08-18T04:50:41Z","title":"Behavioral Learning of Dish Rinsing and Scrubbing based on Interruptive\n Direct Teaching Considering Assistance Rate","summary":" Robots are expected to manipulate objects in a safe and dexterous way. 
For\nexample, washing dishes is a dexterous operation that involves scrubbing the\ndishes with a sponge and rinsing them with water. It is necessary to learn it\nsafely without splashing water and without dropping the dishes. In this study,\nwe propose a safe and dexterous manipulation system. The robot learns a\ndynamics model of the object by estimating the state of the object and the\nrobot itself, the control input, and the amount of human assistance required\n(assistance rate) after the human corrects the initial trajectory of the\nrobot's hands by interruptive direct teaching. By backpropagating the error\nbetween the estimated and the reference value using the acquired dynamics\nmodel, the robot can generate a control input that approaches the reference\nvalue, for example, so that human assistance is not required and the dish does\nnot move excessively. This allows for adaptive rinsing and scrubbing of dishes\nwith unknown shapes and properties. As a result, it is possible to generate\nsafe actions that require less human assistance.\n","authors":["Shumpei Wakabayashi","Kento Kawaharazuka","Kei Okada","Masayuki Inaba"],"pdf_url":"https://arxiv.org/pdf/2408.09360v2.pdf","comment":"Accepted at Advanced Robotics"},{"id":"http://arxiv.org/abs/2404.02937v5","updated":"2024-09-03T11:32:50Z","published":"2024-04-03T07:14:15Z","title":"Towards Explainable Traffic Flow Prediction with Large Language Models","summary":" Traffic forecasting is crucial for intelligent transportation systems. It has\nexperienced significant advancements thanks to the power of deep learning in\ncapturing latent patterns of traffic data. However, recent deep-learning\narchitectures require intricate model designs and lack an intuitive\nunderstanding of the mapping from input data to predicted results. Achieving\nboth accuracy and explainability in traffic prediction models remains a\nchallenge due to the complexity of traffic data and the inherent opacity of\ndeep learning models. To tackle these challenges, we propose a Traffic flow\nPrediction model based on Large Language Models (LLMs) to generate explainable\ntraffic predictions, named xTP-LLM. By transferring multi-modal traffic data\ninto natural language descriptions, xTP-LLM captures complex time-series\npatterns and external factors from comprehensive traffic data. The LLM\nframework is fine-tuned using language-based instructions to align with\nspatial-temporal traffic flow data. Empirically, xTP-LLM shows competitive\naccuracy compared with deep learning baselines, while providing an intuitive\nand reliable explanation for predictions. This paper contributes to advancing\nexplainable traffic prediction models and lays a foundation for future\nexploration of LLM applications in transportation. To the best of our\nknowledge, this is the first study to use LLM for explainable prediction of\ntraffic flows.\n","authors":["Xusen Guo","Qiming Zhang","Junyue Jiang","Mingxing Peng","Meixin Zhu"," Hao"," Yang"],"pdf_url":"https://arxiv.org/pdf/2404.02937v5.pdf","comment":"31pages, 16 figures"},{"id":"http://arxiv.org/abs/2310.02031v8","updated":"2024-09-03T10:19:52Z","published":"2023-10-03T13:17:35Z","title":"OceanGPT: A Large Language Model for Ocean Science Tasks","summary":" Ocean science, which delves into the oceans that are reservoirs of life and\nbiodiversity, is of great significance given that oceans cover over 70% of our\nplanet's surface. Recently, advances in Large Language Models (LLMs) have\ntransformed the paradigm in science. 
Despite the success in other domains,\ncurrent LLMs often fall short in catering to the needs of domain experts like\noceanographers, and the potential of LLMs for ocean science is under-explored.\nThe intrinsic reasons are the immense and intricate nature of ocean data as\nwell as the necessity for higher granularity and richness in knowledge. To\nalleviate these issues, we introduce OceanGPT, the first-ever large language\nmodel in the ocean domain, which is expert in various ocean science tasks. We\nalso propose OceanGPT, a novel framework to automatically obtain a large volume\nof ocean domain instruction data, which generates instructions based on\nmulti-agent collaboration. Additionally, we construct the first oceanography\nbenchmark, OceanBench, to evaluate the capabilities of LLMs in the ocean\ndomain. Through comprehensive experiments, OceanGPT not only shows a higher\nlevel of knowledge expertise for ocean science tasks but also gains\npreliminary embodied intelligence capabilities in ocean technology.\n","authors":["Zhen Bi","Ningyu Zhang","Yida Xue","Yixin Ou","Daxiong Ji","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02031v8.pdf","comment":"ACL2024. Project Website: http://oceangpt.zjukg.cn/"},{"id":"http://arxiv.org/abs/2405.19047v2","updated":"2024-09-03T09:25:46Z","published":"2024-05-29T12:44:41Z","title":"Statistical Context Detection for Deep Lifelong Reinforcement Learning","summary":" Context detection involves labeling segments of an online stream of data as\nbelonging to different tasks. Task labels are used in lifelong learning\nalgorithms to perform consolidation or other procedures that prevent\ncatastrophic forgetting. Inferring task labels from online experiences remains\na challenging problem. Most approaches assume finite and low-dimensional\nobservation spaces or a preliminary training phase during which task labels are\nlearned. Moreover, changes in the transition or reward functions can be\ndetected only in combination with a policy, and therefore are more difficult to\ndetect than changes in the input distribution. This paper presents an approach\nto learning both policies and labels in an online deep reinforcement learning\nsetting. The key idea is to use distance metrics, obtained via optimal\ntransport methods, i.e., Wasserstein distance, on suitable latent action-reward\nspaces to measure distances between sets of data points from past and current\nstreams. Such distances can then be used for statistical tests based on an\nadapted Kolmogorov-Smirnov calculation to assign labels to sequences of\nexperiences. A rollback procedure is introduced to learn multiple policies by\nensuring that only the appropriate data is used to train the corresponding\npolicy. The combination of task detection and policy deployment allows for the\noptimization of lifelong reinforcement learning agents without an oracle that\nprovides task labels. The approach is tested using two benchmarks and the\nresults show promising performance when compared with related context detection\nalgorithms. The results suggest that optimal transport statistical methods\nprovide an explainable and justifiable procedure for online context detection\nand reward optimization in lifelong reinforcement learning.\n","authors":["Jeffery Dick","Saptarshi Nath","Christos Peridis","Eseoghene Benjamin","Soheil Kolouri","Andrea Soltoggio"],"pdf_url":"https://arxiv.org/pdf/2405.19047v2.pdf","comment":"10 pages excluding references and bibliography. 
Accepted at CoLLAs\n 2024"},{"id":"http://arxiv.org/abs/2308.03887v3","updated":"2024-09-03T08:53:32Z","published":"2023-08-04T15:57:28Z","title":"Enhancing Cell Tracking with a Time-Symmetric Deep Learning Approach","summary":" The accurate tracking of live cells using video microscopy recordings remains\na challenging task for popular state-of-the-art image processing based object\ntracking methods. In recent years, several existing and new applications have\nattempted to integrate deep-learning based frameworks for this task, but most\nof them still heavily rely on consecutive frame based tracking embedded in\ntheir architecture or other premises that hinder generalized learning. To\naddress this issue, we aimed to develop a new deep-learning based tracking\nmethod that relies solely on the assumption that cells can be tracked based on\ntheir spatio-temporal neighborhood, without restricting it to consecutive\nframes. The proposed method has the additional benefit that the motion patterns\nof the cells can be learned completely by the predictor without any prior\nassumptions, and it has the potential to handle a large number of video frames\nwith heavy artifacts. The efficacy of the proposed method is demonstrated\nthrough biologically motivated validation strategies and compared against\nmultiple state-of-the-art cell tracking methods.\n","authors":["Gergely Szabó","Paolo Bonaiuti","Andrea Ciliberto","András Horváth"],"pdf_url":"https://arxiv.org/pdf/2308.03887v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17129v2","updated":"2024-09-03T08:45:37Z","published":"2024-08-30T09:14:38Z","title":"Controllable Edge-Type-Specific Interpretation in Multi-Relational Graph\n Neural Networks for Drug Response Prediction","summary":" Graph Neural Networks have been widely applied in critical decision-making\nareas that demand interpretable predictions, leading to the flourishing\ndevelopment of interpretability algorithms. However, current graph\ninterpretability algorithms tend to emphasize generality and often overlook\nbiological significance, thereby limiting their applicability in predicting\ncancer drug responses. In this paper, we propose a novel post-hoc\ninterpretability algorithm for cancer drug response prediction, CETExplainer,\nwhich incorporates a controllable edge-type-specific weighting mechanism. It\nconsiders the mutual information between subgraphs and predictions, proposing a\nstructural scoring approach to provide fine-grained, biologically meaningful\nexplanations for predictive models. We also introduce a method for constructing\nground truth based on real-world datasets to quantitatively evaluate the\nproposed interpretability algorithm. Empirical analysis on the real-world\ndataset demonstrates that CETExplainer achieves superior stability and improves\nexplanation quality compared to leading algorithms, thereby offering a robust\nand insightful tool for cancer drug prediction.\n","authors":["Xiaodi Li","Jianfeng Gui","Qian Gao","Haoyuan Shi","Zhenyu Yue"],"pdf_url":"https://arxiv.org/pdf/2408.17129v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15640v3","updated":"2024-09-03T08:35:15Z","published":"2024-08-28T08:52:14Z","title":"GANs Conditioning Methods: A Survey","summary":" In recent years, Generative Adversarial Networks (GANs) have seen significant\nadvancements, leading to their widespread adoption across various fields. 
The\noriginal GAN architecture enables the generation of images without any specific\ncontrol over the content, making it an unconditional generation process.\nHowever, many practical applications require precise control over the generated\noutput, which has led to the development of conditional GANs (cGANs) that\nincorporate explicit conditioning to guide the generation process. cGANs extend\nthe original framework by incorporating additional information (conditions),\nenabling the generation of samples that adhere to that specific criteria.\nVarious conditioning methods have been proposed, each differing in how they\nintegrate the conditioning information into both the generator and the\ndiscriminator networks. In this work, we review the conditioning methods\nproposed for GANs, exploring the characteristics of each method and\nhighlighting their unique mechanisms and theoretical foundations. Furthermore,\nwe conduct a comparative analysis of these methods, evaluating their\nperformance on various image datasets. Through these analyses, we aim to\nprovide insights into the strengths and limitations of various conditioning\ntechniques, guiding future research and application in generative modeling.\n","authors":["Anis Bourou","Valérie Mezger","Auguste Genovesio"],"pdf_url":"https://arxiv.org/pdf/2408.15640v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01082v2","updated":"2024-09-03T07:59:58Z","published":"2023-12-02T09:20:10Z","title":"A Survey on Stability of Learning with Limited Labelled Data and its\n Sensitivity to the Effects of Randomness","summary":" Learning with limited labelled data, such as prompting, in-context learning,\nfine-tuning, meta-learning or few-shot learning, aims to effectively train a\nmodel using only a small amount of labelled samples. However, these approaches\nhave been observed to be excessively sensitive to the effects of uncontrolled\nrandomness caused by non-determinism in the training process. The randomness\nnegatively affects the stability of the models, leading to large variances in\nresults across training runs. When such sensitivity is disregarded, it can\nunintentionally, but unfortunately also intentionally, create an imaginary\nperception of research progress. Recently, this area started to attract\nresearch attention and the number of relevant studies is continuously growing.\nIn this survey, we provide a comprehensive overview of 415 papers addressing\nthe effects of randomness on the stability of learning with limited labelled\ndata. We distinguish between four main tasks addressed in the papers\n(investigate/evaluate; determine; mitigate; benchmark/compare/report randomness\neffects), providing findings for each one. Furthermore, we identify and discuss\nseven challenges and open problems together with possible directions to\nfacilitate further research. The ultimate goal of this survey is to emphasise\nthe importance of this growing research area, which so far has not received an\nappropriate level of attention, and reveal impactful directions for future\nresearch.\n","authors":["Branislav Pecher","Ivan Srba","Maria Bielikova"],"pdf_url":"https://arxiv.org/pdf/2312.01082v2.pdf","comment":"Accepted to ACM Comput. Surv. 2024"},{"id":"http://arxiv.org/abs/2310.14481v2","updated":"2024-09-03T07:46:24Z","published":"2023-10-23T01:25:44Z","title":"Efficient Heterogeneous Graph Learning via Random Projection","summary":" Heterogeneous Graph Neural Networks (HGNNs) are powerful tools for deep\nlearning on heterogeneous graphs. 
Typical HGNNs require repetitive message\npassing during training, limiting efficiency for large-scale real-world graphs.\nRecent pre-computation-based HGNNs use one-time message passing to transform a\nheterogeneous graph into regular-shaped tensors, enabling efficient mini-batch\ntraining. Existing pre-computation-based HGNNs can be mainly categorized into\ntwo styles, which differ in how much information loss is allowed and\nefficiency. We propose a hybrid pre-computation-based HGNN, named Random\nProjection Heterogeneous Graph Neural Network (RpHGNN), which combines the\nbenefits of one style's efficiency with the low information loss of the other\nstyle. To achieve efficiency, the main framework of RpHGNN consists of\npropagate-then-update iterations, where we introduce a Random Projection\nSquashing step to ensure that complexity increases only linearly. To achieve\nlow information loss, we introduce a Relation-wise Neighbor Collection\ncomponent with an Even-odd Propagation Scheme, which aims to collect\ninformation from neighbors in a finer-grained way. Experimental results\nindicate that our approach achieves state-of-the-art results on seven small and\nlarge benchmark datasets while also being 230% faster compared to the most\neffective baseline. Surprisingly, our approach not only surpasses\npre-processing-based baselines but also outperforms end-to-end methods.\n","authors":["Jun Hu","Bryan Hooi","Bingsheng He"],"pdf_url":"https://arxiv.org/pdf/2310.14481v2.pdf","comment":"Accepted by IEEE Transactions on Knowledge and Data Engineering\n (TKDE)"},{"id":"http://arxiv.org/abs/2402.01306v3","updated":"2024-09-03T07:41:51Z","published":"2024-02-02T10:53:36Z","title":"KTO: Model Alignment as Prospect Theoretic Optimization","summary":" Kahneman & Tversky's $\\textit{prospect theory}$ tells us that humans perceive\nrandom variables in a biased but well-defined manner (1992); for example,\nhumans are famously loss-averse. We show that objectives for aligning LLMs with\nhuman feedback implicitly incorporate many of these biases -- the success of\nthese objectives (e.g., DPO) over cross-entropy minimization can partly be\nascribed to them belonging to a family of loss functions that we call\n$\\textit{human-aware losses}$ (HALOs). However, the utility functions these\nmethods attribute to humans still differ from those in the prospect theory\nliterature. Using a Kahneman-Tversky model of human utility, we propose a HALO\nthat directly maximizes the utility of generations instead of maximizing the\nlog-likelihood of preferences, as current methods do. We call this approach\nKTO, and it matches or exceeds the performance of preference-based methods at\nscales from 1B to 30B, despite only learning from a binary signal of whether an\noutput is desirable. More broadly, our work suggests that there is no one HALO\nthat is universally superior; the best loss depends on the inductive biases\nmost appropriate for a given setting, an oft-overlooked consideration.\n","authors":["Kawin Ethayarajh","Winnie Xu","Niklas Muennighoff","Dan Jurafsky","Douwe Kiela"],"pdf_url":"https://arxiv.org/pdf/2402.01306v3.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2310.18542v2","updated":"2024-09-03T07:34:54Z","published":"2023-10-28T00:15:10Z","title":"End-to-end Feature Selection Approach for Learning Skinny Trees","summary":" We propose a new optimization-based approach for feature selection in tree\nensembles, an important problem in statistics and machine learning. 
Popular\ntree ensemble toolkits e.g., Gradient Boosted Trees and Random Forests support\nfeature selection post-training based on feature importance scores, while very\npopular, they are known to have drawbacks. We propose Skinny Trees: an\nend-to-end toolkit for feature selection in tree ensembles where we train a\ntree ensemble while controlling the number of selected features. Our\noptimization-based approach learns an ensemble of differentiable trees, and\nsimultaneously performs feature selection using a grouped $\\ell_0$-regularizer.\nWe use first-order methods for optimization and present convergence guarantees\nfor our approach. We use a dense-to-sparse regularization scheduling scheme\nthat can lead to more expressive and sparser tree ensembles. On 15 synthetic\nand real-world datasets, Skinny Trees can achieve $1.5\\!\\times\\!\n-~620~\\!\\times\\!$ feature compression rates, leading up to $10\\times$ faster\ninference over dense trees, without any loss in performance. Skinny Trees lead\nto superior feature selection than many existing toolkits e.g., in terms of AUC\nperformance for 25\\% feature budget, Skinny Trees outperforms LightGBM by\n$10.2\\%$ (up to $37.7\\%$), and Random Forests by $3\\%$ (up to $12.5\\%$).\n","authors":["Shibal Ibrahim","Kayhan Behdin","Rahul Mazumder"],"pdf_url":"https://arxiv.org/pdf/2310.18542v2.pdf","comment":"Accepted in AISTATS 2024"},{"id":"http://arxiv.org/abs/2405.06433v3","updated":"2024-09-03T07:10:31Z","published":"2024-05-10T12:25:06Z","title":"Fair Mixed Effects Support Vector Machine","summary":" To ensure unbiased and ethical automated predictions, fairness must be a core\nprinciple in machine learning applications. Fairness in machine learning aims\nto mitigate biases present in the training data and model imperfections that\ncould lead to discriminatory outcomes. This is achieved by preventing the model\nfrom making decisions based on sensitive characteristics like ethnicity or\nsexual orientation. A fundamental assumption in machine learning is the\nindependence of observations. However, this assumption often does not hold true\nfor data describing social phenomena, where data points are often clustered\nbased. Hence, if the machine learning models do not account for the cluster\ncorrelations, the results may be biased. Especially high is the bias in cases\nwhere the cluster assignment is correlated to the variable of interest. We\npresent a fair mixed effects support vector machine algorithm that can handle\nboth problems simultaneously. With a reproducible simulation study we\ndemonstrate the impact of clustered data on the quality of fair machine\nlearning predictions.\n","authors":["João Vitor Pamplona","Jan Pablo Burgard"],"pdf_url":"https://arxiv.org/pdf/2405.06433v3.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.13110v3","updated":"2024-09-03T06:31:48Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. 
From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v3.pdf","comment":"Accepted at Journal of Machine Learning Research. This paper\n integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete\n story. In this paper, we improve the writing and organization, and also add\n conceptual, empirical, and theoretical improvements over the previous work.\n V2: small typo fixes and formatting improvements. V3: improvements from\n journal revisions"},{"id":"http://arxiv.org/abs/2408.02247v4","updated":"2024-09-03T06:02:25Z","published":"2024-08-05T05:41:16Z","title":"Contrastive Learning and Abstract Concepts: The Case of Natural Numbers","summary":" Contrastive Learning (CL) has been successfully applied to classification and\nother downstream tasks related to concrete concepts, such as objects contained\nin the ImageNet dataset. No attempts seem to have been made so far in applying\nthis promising scheme to more abstract entities. A prominent example of these\ncould be the concept of (discrete) Quantity. CL can be frequently interpreted\nas a self-supervised scheme guided by some profound and ubiquitous conservation\nprinciple (e.g. conservation of identity in object classification tasks). In\nthis introductory work we apply a suitable conservation principle to the\nsemi-abstract concept of natural numbers by which discrete quantities can be\nestimated or predicted. We experimentally show, by means of a toy problem, that\ncontrastive learning can be trained to count at a glance with high accuracy\nboth at human as well as at super-human ranges.. We compare this with the\nresults of a trained-to-count at a glance supervised learning (SL) neural\nnetwork scheme of similar architecture. We show that both schemes exhibit\nsimilar good performance on baseline experiments, where the distributions of\nthe training and testing stages are equal. 
Importantly, we demonstrate that in\nsome generalization scenarios, where training and testing distributions differ,\nCL boasts more robust and much better error performance.\n","authors":["Daniel N. Nissani"],"pdf_url":"https://arxiv.org/pdf/2408.02247v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.01481v2","updated":"2024-09-03T05:47:42Z","published":"2024-05-02T17:13:40Z","title":"NeMo-Aligner: Scalable Toolkit for Efficient Model Alignment","summary":" Aligning Large Language Models (LLMs) with human values and preferences is\nessential for making them helpful and safe. However, building efficient tools\nto perform alignment can be challenging, especially for the largest and most\ncompetent LLMs which often contain tens or hundreds of billions of parameters.\nWe create NeMo-Aligner, a toolkit for model alignment that can efficiently\nscale to a thousand GPUs for training the largest open-source LLMs such as\nNemotron 4 340B and Llama 3.1 405B. NeMo-Aligner comes with highly optimized\nand scalable implementations for major paradigms of model alignment such as:\nReinforcement Learning from Human Feedback (RLHF), Direct Preference\nOptimization (DPO), SteerLM, and Self-Play Fine-Tuning (SPIN). Additionally,\nour toolkit supports running most of the alignment techniques in a Parameter\nEfficient Fine-Tuning (PEFT) setting. NeMo-Aligner is designed for\nextensibility, allowing support for other alignment techniques with minimal\neffort. It is open-sourced with Apache 2.0 License and we invite community\ncontributions at https://github.com/NVIDIA/NeMo-Aligner\n","authors":["Gerald Shen","Zhilin Wang","Olivier Delalleau","Jiaqi Zeng","Yi Dong","Daniel Egert","Shengyang Sun","Jimmy Zhang","Sahil Jain","Ali Taghibakhshi","Markel Sanz Ausin","Ashwath Aithal","Oleksii Kuchaiev"],"pdf_url":"https://arxiv.org/pdf/2405.01481v2.pdf","comment":"16 pages, 4 figures, Accepted to COLM 2024"},{"id":"http://arxiv.org/abs/2303.14942v3","updated":"2024-09-03T04:57:03Z","published":"2023-03-27T06:50:31Z","title":"On the Optimality of Misspecified Spectral Algorithms","summary":" In the misspecified spectral algorithms problem, researchers usually assume\nthe underground true function $f_{\\rho}^{*} \\in [\\mathcal{H}]^{s}$, a\nless-smooth interpolation space of a reproducing kernel Hilbert space (RKHS)\n$\\mathcal{H}$ for some $s\\in (0,1)$. The existing minimax optimal results\nrequire $\\|f_{\\rho}^{*}\\|_{L^{\\infty}}<\\infty$ which implicitly requires $s >\n\\alpha_{0}$ where $\\alpha_{0}\\in (0,1)$ is the embedding index, a constant\ndepending on $\\mathcal{H}$. Whether the spectral algorithms are optimal for all\n$s\\in (0,1)$ is an outstanding problem lasting for years. In this paper, we\nshow that spectral algorithms are minimax optimal for any\n$\\alpha_{0}-\\frac{1}{\\beta} < s < 1$, where $\\beta$ is the eigenvalue decay\nrate of $\\mathcal{H}$. We also give several classes of RKHSs whose embedding\nindex satisfies $ \\alpha_0 = \\frac{1}{\\beta} $. 
Thus, the spectral algorithms\nare minimax optimal for all $s\\in (0,1)$ on these RKHSs.\n","authors":["Haobo Zhang","Yicheng Li","Qian Lin"],"pdf_url":"https://arxiv.org/pdf/2303.14942v3.pdf","comment":"50 pages, 2 figures"},{"id":"http://arxiv.org/abs/2405.08443v2","updated":"2024-09-03T04:10:57Z","published":"2024-05-14T09:03:00Z","title":"Safety Constrained Multi-Agent Reinforcement Learning for Active Voltage\n Control","summary":" Active voltage control presents a promising avenue for relieving power\ncongestion and enhancing voltage quality, taking advantage of the distributed\ncontrollable generators in the power network, such as roof-top photovoltaics.\nWhile Multi-Agent Reinforcement Learning (MARL) has emerged as a compelling\napproach to address this challenge, existing MARL approaches tend to overlook\nthe constrained optimization nature of this problem, failing in guaranteeing\nsafety constraints. In this paper, we formalize the active voltage control\nproblem as a constrained Markov game and propose a safety-constrained MARL\nalgorithm. We expand the primal-dual optimization RL method to multi-agent\nsettings, and augment it with a novel approach of double safety estimation to\nlearn the policy and to update the Lagrange-multiplier. In addition, we\nproposed different cost functions and investigated their influences on the\nbehavior of our constrained MARL method. We evaluate our approach in the power\ndistribution network simulation environment with real-world scale scenarios.\nExperimental results demonstrate the effectiveness of the proposed method\ncompared with the state-of-the-art MARL methods. This paper is published at\n\\url{https://www.ijcai.org/Proceedings/2024/}.\n","authors":["Yang Qu","Jinming Ma","Feng Wu"],"pdf_url":"https://arxiv.org/pdf/2405.08443v2.pdf","comment":"Accepted by IJCAI2024"},{"id":"http://arxiv.org/abs/2408.15667v2","updated":"2024-09-03T03:22:18Z","published":"2024-08-28T09:40:40Z","title":"Towards reliable respiratory disease diagnosis based on cough sounds and\n vision transformers","summary":" Recent advancements in deep learning techniques have sparked performance\nboosts in various real-world applications including disease diagnosis based on\nmulti-modal medical data. Cough sound data-based respiratory disease (e.g.,\nCOVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also\nattracted much attention. However, existing works usually utilise traditional\nmachine learning or deep models of moderate scales. On the other hand, the\ndeveloped approaches are trained and evaluated on small-scale data due to the\ndifficulty of curating and annotating clinical data on scale. To address these\nissues in prior works, we create a unified framework to evaluate various deep\nmodels from lightweight Convolutional Neural Networks (e.g., ResNet18) to\nmodern vision transformers and compare their performance in respiratory disease\nclassification. Based on the observations from such an extensive empirical\nstudy, we propose a novel approach to cough-based disease classification based\non both self-supervised and supervised learning on a large-scale cough data\nset. 
Experimental results demonstrate that our proposed approach outperforms prior\nart consistently on two benchmark datasets for COVID-19 diagnosis and a\nproprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%.\n","authors":["Qian Wang","Zhaoyang Bu","Jiaxuan Mao","Wenyu Zhu","Jingya Zhao","Wei Du","Guochao Shi","Min Zhou","Si Chen","Jieming Qu"],"pdf_url":"https://arxiv.org/pdf/2408.15667v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10802v3","updated":"2024-09-03T02:37:48Z","published":"2024-02-16T16:25:20Z","title":"TimeSeriesBench: An Industrial-Grade Benchmark for Time Series Anomaly\n Detection Models","summary":" Time series anomaly detection (TSAD) has gained significant attention due to\nits real-world applications to improve the stability of modern software\nsystems. However, there is no effective way to verify whether existing methods can meet the\nrequirements for real-world deployment. Firstly, current algorithms typically\ntrain a specific model for each time series. Maintaining so many models is\nimpractical in a large-scale system with tens of thousands of curves. The\nperformance of using merely one unified model to detect anomalies remains\nunknown. Secondly, most TSAD models are trained on the historical part of a\ntime series and are tested on its future segment. In distributed systems,\nhowever, there are frequent system deployments and upgrades, with new,\npreviously unseen time series emerging daily. The performance of testing newly\nincoming unseen time series on current TSAD algorithms remains unknown. Lastly,\nthe assumptions of the evaluation metrics in existing benchmarks are far from\npractical demands. To solve the above-mentioned problems, we propose an\nindustrial-grade benchmark TimeSeriesBench. We assess the performance of\nexisting algorithms across more than 168 evaluation settings and provide\ncomprehensive analysis for the future design of anomaly detection algorithms.\nAn industrial dataset is also released along with TimeSeriesBench.\n","authors":["Haotian Si","Jianhui Li","Changhua Pei","Hang Cui","Jingwen Yang","Yongqian Sun","Shenglin Zhang","Jingjing Li","Haiming Zhang","Jing Han","Dan Pei","Gaogang Xie"],"pdf_url":"https://arxiv.org/pdf/2402.10802v3.pdf","comment":"Accepted by ISSRE'24"},{"id":"http://arxiv.org/abs/2402.00976v3","updated":"2024-09-03T02:35:52Z","published":"2024-02-01T19:47:31Z","title":"Investigating Recurrent Transformers with Dynamic Halt","summary":" In this paper, we comprehensively study the inductive biases of two major\napproaches to augmenting Transformers with a recurrent mechanism: (1) the\napproach of incorporating a depth-wise recurrence similar to Universal\nTransformers; and (2) the approach of incorporating a chunk-wise temporal\nrecurrence like Temporal Latent Bottleneck. Furthermore, we propose and\ninvestigate novel ways to extend and combine the above methods - for example,\nwe propose a global mean-based dynamic halting mechanism for Universal\nTransformers and an augmentation of Temporal Latent Bottleneck with elements\nfrom Universal Transformer. We compare the models and probe their inductive\nbiases in several diagnostic tasks, such as Long Range Arena (LRA), flip-flop\nlanguage modeling, ListOps, and Logical Inference. 
The code is released in:\nhttps://github.com/JRC1995/InvestigatingRecurrentTransformers/tree/main\n","authors":["Jishnu Ray Chowdhury","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2402.00976v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06576v4","updated":"2024-09-03T02:11:01Z","published":"2024-06-04T04:17:40Z","title":"OccamLLM: Fast and Exact Language Model Arithmetic in a Single Step","summary":" Despite significant advancements in text generation and reasoning, Large\nLanguage Models (LLMs) still face challenges in accurately performing complex\narithmetic operations. Language model systems often enable LLMs to generate\ncode for arithmetic operations to achieve accurate calculations. However, this\napproach compromises speed and security, and fine-tuning risks the language\nmodel losing prior capabilities. We propose a framework that enables exact\narithmetic in a single autoregressive step, providing faster, more secure, and\nmore interpretable LLM systems with arithmetic capabilities. We use the hidden\nstates of a LLM to control a symbolic architecture that performs arithmetic.\nOur implementation using Llama 3 with OccamNet as a symbolic model (OccamLlama)\nachieves 100\\% accuracy on single arithmetic operations\n($+,-,\\times,\\div,\\sin{},\\cos{},\\log{},\\exp{},\\sqrt{}$), outperforming GPT 4o\nwith and without a code interpreter. Furthermore, OccamLlama outperforms GPT 4o\nwith and without a code interpreter on average across a range of mathematical\nproblem solving benchmarks, demonstrating that OccamLLMs can excel in\narithmetic tasks, even surpassing much larger models. We will make our code\npublic shortly.\n","authors":["Owen Dugan","Donato Manuel Jimenez Beneto","Charlotte Loh","Zhuo Chen","Rumen Dangovski","Marin Soljačić"],"pdf_url":"https://arxiv.org/pdf/2406.06576v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13482v2","updated":"2024-09-03T00:48:37Z","published":"2024-08-24T05:54:47Z","title":"MPruner: Optimizing Neural Network Size with CKA-Based Mutual\n Information Pruning","summary":" Determining the optimal size of a neural network is critical, as it directly\nimpacts runtime performance and memory usage. Pruning is a well-established\nmodel compression technique that reduces the size of neural networks while\nmathematically guaranteeing accuracy preservation. However, many recent pruning\nmethods overlook the global contributions of individual model components,\nmaking it difficult to ensure that a pruned model meets the desired dataset and\nperformance requirements. To address these challenges, we developed a new\npruning algorithm, MPruner, that leverages mutual information through vector\nsimilarity. MPruner utilizes layer clustering with the Centered Kernel\nAlignment (CKA) similarity metric, allowing us to incorporate global\ninformation from the neural network for more precise and efficient layer-wise\npruning. We evaluated MPruner across various architectures and configurations,\ndemonstrating its versatility and providing practical guidelines. 
MPruner\nachieved up to a 50% reduction in parameters and memory usage for CNN and\ntransformer-based models, with minimal to no loss in accuracy.\n","authors":["Seungbeom Hu","ChanJun Park","Andrew Ferraiuolo","Sang-Ki Ko","Jinwoo Kim","Haein Song","Jieung Kim"],"pdf_url":"https://arxiv.org/pdf/2408.13482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.10733v3","updated":"2024-09-03T23:41:14Z","published":"2022-08-23T05:02:09Z","title":"Recursively Feasible Probabilistic Safe Online Learning with Control\n Barrier Functions","summary":" Learning-based control has recently shown great efficacy in performing\ncomplex tasks for various applications. However, to deploy it in real systems,\nit is of vital importance to guarantee the system will stay safe. Control\nBarrier Functions (CBFs) offer mathematical tools for designing\nsafety-preserving controllers for systems with known dynamics. In this article,\nwe first introduce a model-uncertainty-aware reformulation of CBF-based\nsafety-critical controllers using Gaussian Process (GP) regression to close the\ngap between an approximate mathematical model and the real system, which\nresults in a second-order cone program (SOCP)-based control design. We then\npresent the pointwise feasibility conditions of the resulting safety\ncontroller, highlighting the level of richness that the available system\ninformation must meet to ensure safety. We use these conditions to devise an\nevent-triggered online data collection strategy that ensures the recursive\nfeasibility of the learned safety controller. Our method works by constantly\nreasoning about whether the current information is sufficient to ensure safety\nor if new measurements under active safe exploration are required to reduce the\nuncertainty. As a result, our proposed framework can guarantee the forward\ninvariance of the safe set defined by the CBF with high probability, even if it\ncontains a priori unexplored regions. We validate the proposed framework in two\nnumerical simulation experiments.\n","authors":["Fernando Castañeda","Jason J. Choi","Wonsuhk Jung","Bike Zhang","Claire J. Tomlin","Koushil Sreenath"],"pdf_url":"https://arxiv.org/pdf/2208.10733v3.pdf","comment":"Journal article. Includes the results of the 2021 CDC paper titled\n \"Pointwise feasibility of gaussian process-based safety-critical control\n under model uncertainty\" and proposes a recursively feasible safe online\n learning algorithm as new contribution"},{"id":"http://arxiv.org/abs/2409.02332v1","updated":"2024-09-03T23:13:04Z","published":"2024-09-03T23:13:04Z","title":"Double Machine Learning at Scale to Predict Causal Impact of Customer\n Actions","summary":" Causal Impact (CI) of customer actions is broadly used across the industry\nto inform both short- and long-term investment decisions of various types. In\nthis paper, we apply the double machine learning (DML) methodology to estimate\nthe CI values across 100s of customer actions of business interest and 100s of\nmillions of customers. We operationalize DML through a causal ML library based\non Spark with a flexible, JSON-driven model configuration approach to estimate\nCI at scale (i.e., across hundreds of actions and millions of customers). We\noutline the DML methodology and implementation, and associated benefits over\nthe traditional potential outcomes based CI model. We show population-level as\nwell as customer-level CI values along with confidence intervals. 
The\nvalidation metrics show a 2.2% gain over the baseline methods and a 2.5X gain\nin the computational time. Our contribution is to advance the scalable\napplication of CI, while also providing an interface that allows faster\nexperimentation, cross-platform support, ability to onboard new use cases, and\nimproves accessibility of underlying code for partner teams.\n","authors":["Sushant More","Priya Kotwal","Sujith Chappidi","Dinesh Mandalapu","Chris Khawand"],"pdf_url":"https://arxiv.org/pdf/2409.02332v1.pdf","comment":"16 pages, 11 figures. Accepted at the European Conference on Machine\n Learning and Principles and Practice of Knowledge Discovery in Databases\n (ECML PKDD) 2023, Turin, Italy"},{"id":"http://arxiv.org/abs/2406.16746v3","updated":"2024-09-03T23:03:41Z","published":"2024-06-24T15:55:49Z","title":"The Responsible Foundation Model Development Cheatsheet: A Review of\n Tools & Resources","summary":" Foundation model development attracts a rapidly expanding body of\ncontributors, scientists, and applications. To help shape responsible\ndevelopment practices, we introduce the Foundation Model Development\nCheatsheet: a growing collection of 250+ tools and resources spanning text,\nvision, and speech modalities. We draw on a large body of prior work to survey\nresources (e.g. software, documentation, frameworks, guides, and practical\ntools) that support informed data selection, processing, and understanding,\nprecise and limitation-aware artifact documentation, efficient model training,\nadvance awareness of the environmental impact from training, careful model\nevaluation of capabilities, risks, and claims, as well as responsible model\nrelease, licensing and deployment practices. We hope this curated collection of\nresources helps guide more responsible development. The process of curating\nthis list, enabled us to review the AI development ecosystem, revealing what\ntools are critically missing, misused, or over-used in existing practices. We\nfind that (i) tools for data sourcing, model evaluation, and monitoring are\ncritically under-serving ethical and real-world needs, (ii) evaluations for\nmodel safety, capabilities, and environmental impact all lack reproducibility\nand transparency, (iii) text and particularly English-centric analyses continue\nto dominate over multilingual and multi-modal analyses, and (iv) evaluation of\nsystems, rather than just models, is needed so that capabilities and impact are\nassessed in context.\n","authors":["Shayne Longpre","Stella Biderman","Alon Albalak","Hailey Schoelkopf","Daniel McDuff","Sayash Kapoor","Kevin Klyman","Kyle Lo","Gabriel Ilharco","Nay San","Maribeth Rauh","Aviya Skowron","Bertie Vidgen","Laura Weidinger","Arvind Narayanan","Victor Sanh","David Adelani","Percy Liang","Rishi Bommasani","Peter Henderson","Sasha Luccioni","Yacine Jernite","Luca Soldaini"],"pdf_url":"https://arxiv.org/pdf/2406.16746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02327v1","updated":"2024-09-03T22:38:55Z","published":"2024-09-03T22:38:55Z","title":"Generative Principal Component Regression via Variational Inference","summary":" The ability to manipulate complex systems, such as the brain, to modify\nspecific outcomes has far-reaching implications, particularly in the treatment\nof psychiatric disorders. One approach to designing appropriate manipulations\nis to target key features of predictive models. 
While generative latent\nvariable models, such as probabilistic principal component analysis (PPCA), are\npowerful tools for identifying targets, they struggle to incorporate\ninformation relevant to low-variance outcomes into the latent space. When\nstimulation targets are designed on the latent space in such a scenario, the\nintervention can be suboptimal with minimal efficacy. To address this problem,\nwe develop a novel objective based on supervised variational autoencoders\n(SVAEs) that ensures that such information is represented in the latent space. The\nnovel objective can be used with linear models, such as PPCA, which we refer to\nas generative principal component regression (gPCR). We show in simulations\nthat gPCR dramatically improves target selection in manipulation as compared to\nstandard PCR and SVAEs. As part of these simulations, we develop a metric for\ndetecting when relevant information is not properly incorporated into the\nloadings. We then show in two neural datasets related to stress and social\nbehavior that gPCR dramatically outperforms PCR in predictive performance\nand that SVAEs exhibit low incorporation of relevant information into the\nloadings. Overall, this work suggests that our method significantly improves\ntarget selection for manipulation using latent variable models over competitor\ninference schemes.\n","authors":["Austin Talbot","Corey J Keller","David E Carlson","Alex V Kotlar"],"pdf_url":"https://arxiv.org/pdf/2409.02327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02322v1","updated":"2024-09-03T22:31:57Z","published":"2024-09-03T22:31:57Z","title":"TimeDiT: General-purpose Diffusion Transformers for Time Series\n Foundation Model","summary":" With recent advances in building foundation models for texts and video data,\nthere is a surge of interest in foundation models for time series. A family of\nmodels has been developed, utilizing a temporal auto-regressive generative\nTransformer architecture, whose effectiveness has been proven in Large Language\nModels. While the empirical results are promising, almost all existing time\nseries foundation models have only been tested on well-curated ``benchmark''\ndatasets very similar to texts. However, real-world time series exhibit unique\nchallenges, such as variable channel sizes across domains, missing values, and\nvarying signal sampling intervals due to the multi-resolution nature of\nreal-world data. Additionally, the uni-directional nature of temporally\nauto-regressive decoding limits the incorporation of domain knowledge, such as\nphysical laws expressed as partial differential equations (PDEs). To address\nthese challenges, we introduce the Time Diffusion Transformer (TimeDiT), a\ngeneral foundation model for time series that employs a denoising diffusion\nparadigm instead of temporal auto-regressive generation. TimeDiT leverages the\nTransformer architecture to capture temporal dependencies and employs diffusion\nprocesses to generate high-quality candidate samples without imposing stringent\nassumptions on the target distribution via novel masking schemes and a channel\nalignment strategy. Furthermore, we propose a finetuning-free model editing\nstrategy that allows the seamless integration of external knowledge during the\nsampling process without updating any model parameters. 
Extensive experiments,\nconducted on a variety of tasks such as forecasting, imputation, and anomaly\ndetection, demonstrate the effectiveness of TimeDiT.\n","authors":["Defu Cao","Wen Ye","Yizhou Zhang","Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02322v1.pdf","comment":"23 Pages, 6 Figures, 11 Tables. First presented at ICML 2024 Workshop\n on Foundation Models in the Wild"},{"id":"http://arxiv.org/abs/2408.10263v2","updated":"2024-09-03T22:23:06Z","published":"2024-08-15T18:58:21Z","title":"Kolmogorov Arnold Networks in Fraud Detection: Bridging the Gap Between\n Theory and Practice","summary":" This study evaluates the applicability of Kolmogorov-Arnold Networks (KAN) in\nfraud detection, finding that their effectiveness is context-dependent. We\npropose a quick decision rule using Principal Component Analysis (PCA) to\nassess the suitability of KAN: if data can be effectively separated in two\ndimensions using splines, KAN may outperform traditional models; otherwise,\nother methods could be more appropriate. We also introduce a heuristic approach\nto hyperparameter tuning, significantly reducing computational costs. These\nfindings suggest that while KAN has potential, its use should be guided by\ndata-specific assessments.\n","authors":["Yang Lu","Felix Zhan"],"pdf_url":"https://arxiv.org/pdf/2408.10263v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02313v1","updated":"2024-09-03T21:56:13Z","published":"2024-09-03T21:56:13Z","title":"On the Benefits of Memory for Modeling Time-Dependent PDEs","summary":" Data-driven techniques have emerged as a promising alternative to traditional\nnumerical methods for solving partial differential equations (PDEs). These\ntechniques frequently offer a better trade-off between computational cost and\naccuracy for many PDE families of interest. For time-dependent PDEs, existing\nmethodologies typically treat PDEs as Markovian systems, i.e., the evolution of\nthe system only depends on the ``current state'', and not the past states.\nHowever, distortion of the input signals -- e.g., due to discretization or\nlow-pass filtering -- can render the evolution of the distorted signals\nnon-Markovian. In this work, motivated by the Mori-Zwanzig theory of model\nreduction, we investigate the impact of architectures with memory for modeling\nPDEs: that is, when past states are explicitly used to predict the future. We\nintroduce Memory Neural Operator (MemNO), a network based on the recent SSM\narchitectures and Fourier Neural Operator (FNO). We empirically demonstrate on\na variety of PDE families of interest that when the input is given on a\nlow-resolution grid, MemNO significantly outperforms the baselines without\nmemory, achieving more than 6 times less error on unseen PDEs. 
Via a\ncombination of theory and experiments, we show that the effect of memory is\nparticularly significant when the solution of the PDE has high frequency\nFourier components (e.g., low-viscosity fluid dynamics), and it also increases\nrobustness to observation noise.\n","authors":["Ricardo Buitrago Ruiz","Tanya Marwah","Albert Gu","Andrej Risteski"],"pdf_url":"https://arxiv.org/pdf/2409.02313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10368v2","updated":"2024-09-03T21:55:35Z","published":"2024-08-19T19:26:07Z","title":"Deep-MacroFin: Informed Equilibrium Neural Network for Continuous Time\n Economic Models","summary":" In this paper, we present Deep-MacroFin, a comprehensive framework designed\nto solve partial differential equations, with a particular focus on models in\ncontinuous time economics. This framework leverages deep learning\nmethodologies, including conventional Multi-Layer Perceptrons and the newly\ndeveloped Kolmogorov-Arnold Networks. It is optimized using economic\ninformation encapsulated by Hamilton-Jacobi-Bellman equations and coupled\nalgebraic equations. The application of neural networks holds the promise of\naccurately resolving high-dimensional problems with fewer computational demands\nand limitations compared to standard numerical methods. This versatile\nframework can be readily adapted for elementary differential equations, and\nsystems of differential equations, even in cases where the solutions may\nexhibit discontinuities. Importantly, it offers a more straightforward and\nuser-friendly implementation than existing libraries.\n","authors":["Yuntao Wu","Jiayuan Guo","Goutham Gopalakrishna","Zisis Poulos"],"pdf_url":"https://arxiv.org/pdf/2408.10368v2.pdf","comment":"25 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.02309v1","updated":"2024-09-03T21:39:58Z","published":"2024-09-03T21:39:58Z","title":"QID$^2$: An Image-Conditioned Diffusion Model for Q-space Up-sampling of\n DWI Data","summary":" We propose an image-conditioned diffusion model to estimate high angular\nresolution diffusion weighted imaging (DWI) from a low angular resolution\nacquisition. Our model, which we call QID$^2$, takes as input a set of low\nangular resolution DWI data and uses this information to estimate the DWI data\nassociated with a target gradient direction. We leverage a U-Net architecture\nwith cross-attention to preserve the positional information of the reference\nimages, further guiding the target image generation. We train and evaluate\nQID$^2$ on single-shell DWI samples curated from the Human Connectome Project\n(HCP) dataset. Specifically, we sub-sample the HCP gradient directions to\nproduce low angular resolution DWI data and train QID$^2$ to reconstruct the\nmissing high angular resolution samples. We compare QID$^2$ with two\nstate-of-the-art GAN models. Our results demonstrate that QID$^2$ not only\nachieves higher-quality generated images, but it consistently outperforms the\nGAN models in downstream tensor estimation across multiple metrics. Taken\ntogether, this study highlights the potential of diffusion models, and QID$^2$\nin particular, for q-space up-sampling, thus offering a promising toolkit for\nclinical and research applications.\n","authors":["Zijian Chen","Jueqi Wang","Archana Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2409.02309v1.pdf","comment":"Accepted at MICCAI 2024 International Workshop on Computational\n Diffusion MRI. 
Zijian Chen and Jueqi Wang contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.02303v1","updated":"2024-09-03T21:28:48Z","published":"2024-09-03T21:28:48Z","title":"A Lesion-aware Edge-based Graph Neural Network for Predicting Language\n Ability in Patients with Post-stroke Aphasia","summary":" We propose a lesion-aware graph neural network (LEGNet) to predict language\nability from resting-state fMRI (rs-fMRI) connectivity in patients with\npost-stroke aphasia. Our model integrates three components: an edge-based\nlearning module that encodes functional connectivity between brain regions, a\nlesion encoding module, and a subgraph learning module that leverages\nfunctional similarities for prediction. We use synthetic data derived from the\nHuman Connectome Project (HCP) for hyperparameter tuning and model pretraining.\nWe then evaluate the performance using repeated 10-fold cross-validation on an\nin-house neuroimaging dataset of post-stroke aphasia. Our results demonstrate\nthat LEGNet outperforms baseline deep learning methods in predicting language\nability. LEGNet also exhibits superior generalization ability when tested on a\nsecond in-house dataset that was acquired under a slightly different\nneuroimaging protocol. Taken together, the results of this study highlight the\npotential of LEGNet in effectively learning the relationships between rs-fMRI\nconnectivity and language ability in a patient cohort with brain lesions for\nimproved post-stroke aphasia evaluation.\n","authors":["Zijian Chen","Maria Varkanitsa","Prakash Ishwar","Janusz Konrad","Margrit Betke","Swathi Kiran","Archana Venkataraman"],"pdf_url":"https://arxiv.org/pdf/2409.02303v1.pdf","comment":"Accepted at MICCAI 2024 International Workshop on Machine Learning in\n Clinical Neuroimaging (MLCN)"},{"id":"http://arxiv.org/abs/2409.02281v1","updated":"2024-09-03T20:28:30Z","published":"2024-09-03T20:28:30Z","title":"K-Origins: Better Colour Quantification for Neural Networks","summary":" K-Origins is a neural network layer designed to improve image-based network\nperformances when learning colour, or intensities, is beneficial. Over 250\nencoder-decoder convolutional networks are trained and tested on 16-bit\nsynthetic data, demonstrating that K-Origins improves semantic segmentation\naccuracy in two scenarios: object detection with low signal-to-noise ratios,\nand segmenting multiple objects that are identical in shape but vary in colour.\nK-Origins generates output features from the input features, $\\textbf{X}$, by\nthe equation $\\textbf{Y}_k = \\textbf{X}-\\textbf{J}\\cdot w_k$ for each trainable\nparameter $w_k$, where $\\textbf{J}$ is a matrix of ones. Additionally, networks\nwith varying receptive fields were trained to determine optimal network depths\nbased on the dimensions of target classes, suggesting that receptive field\nlengths should exceed object sizes. By ensuring a sufficient receptive field\nlength and incorporating K-Origins, we can achieve better semantic network\nperformance.\n","authors":["Lewis Mason","Mark Martinez"],"pdf_url":"https://arxiv.org/pdf/2409.02281v1.pdf","comment":"16 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2406.08205v2","updated":"2024-09-03T20:26:56Z","published":"2024-06-12T13:38:48Z","title":"What do we know about Hugging Face? A systematic literature review and\n quantitative validation of qualitative claims","summary":" Background: Collaborative Software Package Registries (SPRs) are an integral\npart of the software supply chain. 
Much engineering work synthesizes SPR\npackage into applications. Prior research has examined SPRs for traditional\nsoftware, such as NPM (JavaScript) and PyPI (Python). Pre-Trained Model (PTM)\nRegistries are an emerging class of SPR of increasing importance, because they\nsupport the deep learning supply chain.\n Aims: Recent empirical research has examined PTM registries in ways such as\nvulnerabilities, reuse processes, and evolution. However, no existing research\nsynthesizes them to provide a systematic understanding of the current\nknowledge. Some of the existing research includes qualitative claims lacking\nquantitative analysis. Our research fills these gaps by providing a knowledge\nsynthesis and quantitative analyses.\n Methods: We first conduct a systematic literature review (SLR). We then\nobserve that some of the claims are qualitative. We identify quantifiable\nmetrics associated with those claims, and measure in order to substantiate\nthese claims.\n Results: From our SLR, we identify 12 claims about PTM reuse on the\nHuggingFace platform, 4 of which lack quantitative validation. We successfully\ntest 3 of these claims through a quantitative analysis, and directly compare\none with traditional software. Our findings corroborate qualitative claims with\nquantitative measurements. Our findings are: (1) PTMs have a much higher\nturnover rate than traditional software, indicating a dynamic and rapidly\nevolving reuse environment within the PTM ecosystem; and (2) There is a strong\ncorrelation between documentation quality and PTM popularity.\n Conclusions: We confirm qualitative research claims with concrete metrics,\nsupporting prior qualitative and case study research. Our measures show further\ndynamics of PTM reuse, inspiring research infrastructure and new measures.\n","authors":["Jason Jones","Wenxin Jiang","Nicholas Synovic","George K. Thiruvathukal","James C. Davis"],"pdf_url":"https://arxiv.org/pdf/2406.08205v2.pdf","comment":"[ESEM'24] Proceedings of the 18th ACM/IEEE International Symposium on\n Empirical Software Engineering and Measurement (ESEM) 2024"},{"id":"http://arxiv.org/abs/2406.10131v2","updated":"2024-09-03T20:13:24Z","published":"2024-06-14T15:41:21Z","title":"Linear Contextual Bandits with Hybrid Payoff: Revisited","summary":" We study the Linear Contextual Bandit problem in the hybrid reward setting.\nIn this setting every arm's reward model contains arm specific parameters in\naddition to parameters shared across the reward models of all the arms. We can\nreduce this setting to two closely related settings (a) Shared - no arm\nspecific parameters, and (b) Disjoint - only arm specific parameters, enabling\nthe application of two popular state of the art algorithms - $\\texttt{LinUCB}$\nand $\\texttt{DisLinUCB}$ (Algorithm 1 in (Li et al. 2010)). When the arm\nfeatures are stochastic and satisfy a popular diversity condition, we provide\nnew regret analyses for both algorithms, significantly improving on the known\nregret guarantees of these algorithms. Our novel analysis critically exploits\nthe hybrid reward structure and the diversity condition. Moreover, we introduce\na new algorithm $\\texttt{HyLinUCB}$ that crucially modifies $\\texttt{LinUCB}$\n(using a new exploration coefficient) to account for sparsity in the hybrid\nsetting. Under the same diversity assumptions, we prove that\n$\\texttt{HyLinUCB}$ also incurs only $O(\\sqrt{T})$ regret for $T$ rounds. 
We\nperform extensive experiments on synthetic and real-world datasets\ndemonstrating strong empirical performance of $\\texttt{HyLinUCB}$. When the number of\narm-specific parameters is much larger than the number of shared parameters, we\nobserve that $\\texttt{DisLinUCB}$ incurs the lowest regret. In this case,\nthe regret of $\\texttt{HyLinUCB}$ is the second best and extremely competitive with\n$\\texttt{DisLinUCB}$. In all other situations, including our real-world\ndataset, $\\texttt{HyLinUCB}$ has significantly lower regret than\n$\\texttt{LinUCB}$, $\\texttt{DisLinUCB}$ and other SOTA baselines we considered.\nWe also empirically observe that the regret of $\\texttt{HyLinUCB}$ grows much\nslower with the number of arms compared to baselines, making it suitable even\nfor very large action spaces.\n","authors":["Nirjhar Das","Gaurav Sinha"],"pdf_url":"https://arxiv.org/pdf/2406.10131v2.pdf","comment":"Accepted at ECML PKDD 2024 as a Research Track Paper"},{"id":"http://arxiv.org/abs/2403.06023v2","updated":"2024-09-03T20:09:57Z","published":"2024-03-09T22:18:26Z","title":"Persian Slang Text Conversion to Formal and Deep Learning of Persian\n Short Texts on Social Media for Sentiment Classification","summary":" The lack of a suitable tool for the analysis of conversational texts in the\nPersian language has made various analyses of these texts, including Sentiment\nAnalysis, difficult. In this research, we tried to make the understanding of\nthese texts easier for the machine by providing PSC, Persian Slang Converter, a\ntool for converting conversational texts into formal ones, and by using the\nmost up-to-date and best deep learning methods along with the PSC to improve\nthe sentiment learning of short Persian-language texts for the machine. More\nthan 10 million unlabeled texts from various social networks\nand movie subtitles (as conversational texts) and about 10 million news texts\n(as formal texts) have been used for training unsupervised models and the formal\nimplementation of the tool. 60,000 texts from the comments of Instagram social\nnetwork users with positive, negative, and neutral labels are considered\nsupervised data for training the emotion classification model of short texts.\nUsing the formal tool, 57% of the words of the corpus of conversation were\nconverted. Finally, by using the formalizer, the FastText model, and a deep LSTM\nnetwork, an accuracy of 81.91 was obtained on the test data.\n","authors":["Mohsen Khazeni","Mohammad Heydari","Amir Albadvi"],"pdf_url":"https://arxiv.org/pdf/2403.06023v2.pdf","comment":"16 pages, 4 figures, 14 tables"},{"id":"http://arxiv.org/abs/2409.02270v1","updated":"2024-09-03T20:01:56Z","published":"2024-09-03T20:01:56Z","title":"Reinforcement Learning-enabled Satellite Constellation Reconfiguration\n and Retasking for Mission-Critical Applications","summary":" The development of satellite constellation applications is rapidly advancing\ndue to increasing user demands, reduced operational costs, and technological\nadvancements. However, a significant gap in the existing literature concerns\nreconfiguration and retasking issues within satellite constellations, which is\nthe primary focus of our research. In this work, we critically assess the\nimpact of satellite failures on constellation performance and the associated\ntask requirements. 
To facilitate this analysis, we introduce a system modeling\napproach for GPS satellite constellations, enabling an investigation into\nperformance dynamics and task distribution strategies, particularly in\nscenarios where satellite failures occur during mission-critical operations.\nAdditionally, we introduce reinforcement learning (RL) techniques, specifically\nQ-learning, Policy Gradient, Deep Q-Network (DQN), and Proximal Policy\nOptimization (PPO), for managing satellite constellations, addressing the\nchallenges posed by reconfiguration and retasking following satellite failures.\nOur results demonstrate that DQN and PPO achieve effective outcomes in terms of\naverage rewards, task completion rates, and response times.\n","authors":["Hassan El Alami","Danda B. Rawat"],"pdf_url":"https://arxiv.org/pdf/2409.02270v1.pdf","comment":"Accepted for publication in the IEEE Military Communications\n Conference (IEEE MILCOM 2024)"},{"id":"http://arxiv.org/abs/2310.15128v2","updated":"2024-09-03T19:55:22Z","published":"2023-10-23T17:32:38Z","title":"Projected Stochastic Gradient Descent with Quantum Annealed Binary\n Gradients","summary":" We present, QP-SBGD, a novel layer-wise stochastic optimiser tailored towards\ntraining neural networks with binary weights, known as binary neural networks\n(BNNs), on quantum hardware. BNNs reduce the computational requirements and\nenergy consumption of deep learning models with minimal loss in accuracy.\nHowever, training them in practice remains to be an open challenge. Most known\nBNN-optimisers either rely on projected updates or binarise weights\npost-training. Instead, QP-SBGD approximately maps the gradient onto binary\nvariables, by solving a quadratic constrained binary optimisation. Under\npractically reasonable assumptions, we show that this update rule converges\nwith a rate of $\\mathcal{O}(1 / \\sqrt{T})$. Moreover, we show how the\n$\\mathcal{NP}$-hard projection can be effectively executed on an adiabatic\nquantum annealer, harnessing recent advancements in quantum computation. We\nalso introduce a projected version of this update rule and prove that if a\nfixed point exists in the binary variable space, the modified updates will\nconverge to it. Last but not least, our algorithm is implemented layer-wise,\nmaking it suitable to train larger networks on resource-limited quantum\nhardware. Through extensive evaluations, we show that QP-SBGD outperforms or is\non par with competitive and well-established baselines such as BinaryConnect,\nsignSGD and ProxQuant when optimising the Rosenbrock function, training BNNs as\nwell as binary graph neural networks.\n","authors":["Maximilian Krahn","Michele Sasdelli","Fengyi Yang","Vladislav Golyanik","Juho Kannala","Tat-Jun Chin","Tolga Birdal"],"pdf_url":"https://arxiv.org/pdf/2310.15128v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.05449v2","updated":"2024-09-03T08:01:47Z","published":"2023-12-09T03:33:14Z","title":"TALDS-Net: Task-Aware Adaptive Local Descriptors Selection for Few-shot\n Image Classification","summary":" Few-shot image classification aims to classify images from unseen novel\nclasses with few samples. Recent works demonstrate that deep local descriptors\nexhibit enhanced representational capabilities compared to image-level\nfeatures. However, most existing methods solely rely on either employing all\nlocal descriptors or directly utilizing partial descriptors, potentially\nresulting in the loss of crucial information. 
Moreover, these methods primarily\nemphasize the selection of query descriptors while overlooking support\ndescriptors. In this paper, we propose a novel Task-Aware Adaptive Local\nDescriptors Selection Network (TALDS-Net), which exhibits the capacity for\nadaptive selection of task-aware support descriptors and query descriptors.\nSpecifically, we compare the similarity of each local support descriptor with\nother local support descriptors to obtain the optimal support descriptor subset\nand then compare the query descriptors with the optimal support subset to\nobtain discriminative query descriptors. Extensive experiments demonstrate that\nour TALDS-Net outperforms state-of-the-art methods on both general and\nfine-grained datasets.\n","authors":["Qian Qiao","Yu Xie","Ziyin Zeng","Fanzhang Li"],"pdf_url":"https://arxiv.org/pdf/2312.05449v2.pdf","comment":"4 pages, 1 figures, is accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2408.01690v2","updated":"2024-09-03T22:30:34Z","published":"2024-08-03T07:05:40Z","title":"IDNet: A Novel Dataset for Identity Document Analysis and Fraud\n Detection","summary":" Effective fraud detection and analysis of government-issued identity\ndocuments, such as passports, driver's licenses, and identity cards, are\nessential in thwarting identity theft and bolstering security on online\nplatforms. The training of accurate fraud detection and analysis tools depends\non the availability of extensive identity document datasets. However, current\npublicly available benchmark datasets for identity document analysis, including\nMIDV-500, MIDV-2020, and FMIDV, fall short in several respects: they offer a\nlimited number of samples, cover insufficient varieties of fraud patterns, and\nseldom include alterations in critical personal identifying fields like\nportrait images, limiting their utility in training models capable of detecting\nrealistic frauds while preserving privacy.\n In response to these shortcomings, our research introduces a new benchmark\ndataset, IDNet, designed to advance privacy-preserving fraud detection efforts.\nThe IDNet dataset comprises 837,060 images of synthetically generated identity\ndocuments, totaling approximately 490 gigabytes, categorized into 20 types from\n$10$ U.S. states and 10 European countries. We evaluate the utility and present\nuse cases of the dataset, illustrating how it can aid in training\nprivacy-preserving fraud detection methods, facilitating the generation of\ncamera and video capturing of identity documents, and testing schema\nunification and other identity document management functionalities.\n","authors":["Hong Guan","Yancheng Wang","Lulu Xie","Soham Nag","Rajeev Goel","Niranjan Erappa Narayana Swamy","Yingzhen Yang","Chaowei Xiao","Jonathan Prisby","Ross Maciejewski","Jia Zou"],"pdf_url":"https://arxiv.org/pdf/2408.01690v2.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2409.02266v1","updated":"2024-09-03T19:52:49Z","published":"2024-09-03T19:52:49Z","title":"LSTMSE-Net: Long Short Term Speech Enhancement Network for Audio-visual\n Speech Enhancement","summary":" In this paper, we propose long short term memory speech enhancement network\n(LSTMSE-Net), an audio-visual speech enhancement (AVSE) method. This innovative\nmethod leverages the complementary nature of visual and audio information to\nboost the quality of speech signals. Visual features are extracted with\nVisualFeatNet (VFN), and audio features are processed through an encoder and\ndecoder. 
The system scales and concatenates visual and audio features, then\nprocesses them through a separator network for optimized speech enhancement.\nThe architecture highlights advancements in leveraging multi-modal data and\ninterpolation techniques for robust AVSE challenge systems. The performance of\nLSTMSE-Net surpasses that of the baseline model from the COG-MHEAR AVSE\nChallenge 2024 by a margin of 0.06 in scale-invariant signal-to-distortion\nratio (SISDR), $0.03$ in short-time objective intelligibility (STOI), and\n$1.32$ in perceptual evaluation of speech quality (PESQ). The source code of\nthe proposed LSTMSE-Net is available at\n\\url{https://github.com/mtanveer1/AVSEC-3-Challenge}.\n","authors":["Arnav Jain","Jasmer Singh Sanjotra","Harshvardhan Choudhary","Krish Agrawal","Rupal Shah","Rohan Jha","M. Sajid","Amir Hussain","M. Tanveer"],"pdf_url":"https://arxiv.org/pdf/2409.02266v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02108v1","updated":"2024-09-03T17:59:05Z","published":"2024-09-03T17:59:05Z","title":"Unveiling Deep Shadows: A Survey on Image and Video Shadow Detection,\n Removal, and Generation in the Era of Deep Learning","summary":" Shadows are formed when light encounters obstacles, leading to areas of\ndiminished illumination. In computer vision, shadow detection, removal, and\ngeneration are crucial for enhancing scene understanding, refining image\nquality, ensuring visual consistency in video editing, and improving virtual\nenvironments. This paper presents a comprehensive survey of shadow detection,\nremoval, and generation in images and videos within the deep learning landscape\nover the past decade, covering tasks, deep models, datasets, and evaluation\nmetrics. Our key contributions include a comprehensive survey of shadow\nanalysis, standardization of experimental comparisons, exploration of the\nrelationships among model size, speed, and performance, a cross-dataset\ngeneralization study, identification of open issues and future directions, and\nprovision of publicly available resources to support further research.\n","authors":["Xiaowei Hu","Zhenghao Xing","Tianyu Wang","Chi-Wing Fu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2409.02108v1.pdf","comment":"Publicly available results, trained models, and evaluation metrics at\n https://github.com/xw-hu/Unveiling-Deep-Shadows"},{"id":"http://arxiv.org/abs/2409.02101v1","updated":"2024-09-03T17:56:51Z","published":"2024-09-03T17:56:51Z","title":"Towards Real-World Adverse Weather Image Restoration: Enhancing\n Clearness and Semantics with Vision-Language Models","summary":" This paper addresses the limitations of adverse weather image restoration\napproaches trained on synthetic data when applied to real-world scenarios. We\nformulate a semi-supervised learning framework employing vision-language models\nto enhance restoration performance across diverse adverse weather conditions in\nreal-world settings. Our approach involves assessing image clearness and\nproviding semantics using vision-language models on real data, serving as\nsupervision signals for training restoration models. For clearness enhancement,\nwe use real-world data, utilizing a dual-step strategy with pseudo-labels\nassessed by vision-language models and weather prompt learning. 
For semantic\nenhancement, we integrate real-world data by adjusting weather conditions in\nvision-language model descriptions while preserving semantic meaning.\nAdditionally, we introduce an effective training strategy to bootstrap\nrestoration performance. Our approach achieves superior results in real-world\nadverse weather image restoration, demonstrated through qualitative and\nquantitative comparisons with state-of-the-art works.\n","authors":["Jiaqi Xu","Mengyang Wu","Xiaowei Hu","Chi-Wing Fu","Qi Dou","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2409.02101v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2409.02049v1","updated":"2024-09-03T16:53:34Z","published":"2024-09-03T16:53:34Z","title":"Low-Resolution Face Recognition via Adaptable Instance-Relation\n Distillation","summary":" Low-resolution face recognition is a challenging task due to the lack of\ninformative details. Recent approaches based on knowledge distillation have\nproven that high-resolution clues can well guide low-resolution face\nrecognition via proper knowledge transfer. However, due to the distribution\ndifference between training and testing faces, the learned models often suffer\nfrom poor adaptability. To address that, we split the knowledge transfer\nprocess into distillation and adaptation steps, and propose an adaptable\ninstance-relation distillation approach to facilitate low-resolution face\nrecognition. In the approach, the student distills knowledge from\nthe high-resolution teacher at both the instance and relation levels, providing\nsufficient cross-resolution knowledge transfer. Then, the learned student can\nbe adapted to recognize low-resolution faces with adaptive batch\nnormalization at inference. In this manner, the capability of recovering\nmissing details of familiar low-resolution faces can be effectively enhanced,\nleading to a better knowledge transfer. Extensive experiments on low-resolution\nface recognition clearly demonstrate the effectiveness and adaptability of our\napproach.\n","authors":["Ruixin Shi","Weijia Guo","Shiming Ge"],"pdf_url":"https://arxiv.org/pdf/2409.02049v1.pdf","comment":"Accepted by IJCNN 2024"},{"id":"http://arxiv.org/abs/2409.01761v1","updated":"2024-09-03T10:15:30Z","published":"2024-09-03T10:15:30Z","title":"PRoGS: Progressive Rendering of Gaussian Splats","summary":" Over the past year, 3D Gaussian Splatting (3DGS) has received significant\nattention for its ability to represent 3D scenes in a perceptually accurate\nmanner. However, it can require a substantial amount of storage since each\nsplat's individual data must be stored. While compression techniques offer a\npotential solution by reducing the memory footprint, they still necessitate\nretrieving the entire scene before any part of it can be rendered. In this\nwork, we introduce a novel approach for progressively rendering such scenes,\naiming to display visible content that closely approximates the final scene as\nearly as possible without loading the entire scene into memory. This approach\nbenefits both on-device rendering applications limited by memory constraints\nand streaming applications where minimal bandwidth usage is preferred. To\nachieve this, we approximate the contribution of each Gaussian to the final\nscene and construct an order of prioritization on their inclusion in the\nrendering process. 
Additionally, we demonstrate that our approach can be\ncombined with existing compression methods to progressively render (and stream)\n3DGS scenes, optimizing bandwidth usage by focusing on the most important\nsplats within a scene. Overall, our work establishes a foundation for making\nremotely hosted 3DGS content more quickly accessible to end-users in\nover-the-top consumption scenarios, with our results showing significant\nimprovements in quality across all metrics compared to existing methods.\n","authors":["Brent Zoomers","Maarten Wijnants","Ivan Molenaers","Joni Vanherck","Jeroen Put","Lode Jorissen","Nick Michiels"],"pdf_url":"https://arxiv.org/pdf/2409.01761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01710v1","updated":"2024-09-03T08:47:17Z","published":"2024-09-03T08:47:17Z","title":"Privacy-Preserving Multimedia Mobile Cloud Computing Using Protective\n Perturbation","summary":" Mobile cloud computing has been adopted in many multimedia applications,\nwhere the resource-constrained mobile device sends multimedia data (e.g.,\nimages) to remote cloud servers to request computation-intensive multimedia\nservices (e.g., image recognition). While significantly improving the\nperformance of the mobile applications, the cloud-based mechanism often causes\nprivacy concerns as the multimedia data and services are offloaded from the\ntrusted user device to untrusted cloud servers. Several recent studies have\nproposed perturbation-based privacy preserving mechanisms, which obfuscate the\noffloaded multimedia data to eliminate privacy exposures without affecting the\nfunctionality of the remote multimedia services. However, the existing privacy\nprotection approaches require the deployment of computation-intensive\nperturbation generation on the resource-constrained mobile devices. Also, the\nobfuscated images are typically not compliant with the standard image\ncompression algorithms and suffer from significant bandwidth consumption. In\nthis paper, we develop a novel privacy-preserving multimedia mobile cloud\ncomputing framework, namely $PMC^2$, to address the resource and bandwidth\nchallenges. $PMC^2$ employs secure confidential computing in the cloud to\ndeploy the perturbation generator, which addresses the resource challenge while\nmaintaining the privacy. Furthermore, we develop a neural compressor\nspecifically trained to compress the perturbed images in order to address the\nbandwidth challenge. We implement $PMC^2$ in an end-to-end mobile cloud\ncomputing system, based on which our evaluations demonstrate superior latency,\npower efficiency, and bandwidth consumption achieved by $PMC^2$ while\nmaintaining high accuracy in the target multimedia service.\n","authors":["Zhongze Tang","Mengmei Ye","Yao Liu","Sheng Wei"],"pdf_url":"https://arxiv.org/pdf/2409.01710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01534v1","updated":"2024-09-03T02:08:47Z","published":"2024-09-03T02:08:47Z","title":"Think Twice Before Recognizing: Large Multimodal Models for General\n Fine-grained Traffic Sign Recognition","summary":" We propose a new strategy called think twice before recognizing to improve\nfine-grained traffic sign recognition (TSR). Fine-grained TSR in the wild is\ndifficult due to the complex road conditions, and existing approaches\nparticularly struggle with cross-country TSR when data is lacking. Our strategy\nachieves effective fine-grained TSR by stimulating the multiple-thinking\ncapability of large multimodal models (LMM). 
We introduce context,\ncharacteristic, and differential descriptions to design multiple thinking\nprocesses for the LMM. The context descriptions with center coordinate prompt\noptimization help the LMM to locate the target traffic sign in the original\nroad images containing multiple traffic signs and filter irrelevant answers\nthrough the proposed prior traffic sign hypothesis. The characteristic\ndescription is based on few-shot in-context learning of template traffic signs,\nwhich decreases the cross-domain difference and enhances the fine-grained\nrecognition capability of the LMM. The differential descriptions of similar\ntraffic signs optimize the multimodal thinking capability of the LMM. The\nproposed method is independent of training data and requires only simple and\nuniform instructions. We conducted extensive experiments on three benchmark\ndatasets and two real-world datasets from different countries, and the proposed\nmethod achieves state-of-the-art TSR results on all five datasets.\n","authors":["Yaozong Gan","Guang Li","Ren Togo","Keisuke Maeda","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2409.01534v1.pdf","comment":null}]},"2024-09-02T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2405.15092v2","updated":"2024-09-02T22:40:20Z","published":"2024-05-23T22:38:58Z","title":"Dissociation of Faithful and Unfaithful Reasoning in LLMs","summary":" Large language models (LLMs) often improve their performance in downstream\ntasks when they generate Chain of Thought reasoning text before producing an\nanswer. We investigate how LLMs recover from errors in Chain of Thought.\nThrough analysis of error recovery behaviors, we find evidence for\nunfaithfulness in Chain of Thought, which occurs when models arrive at the\ncorrect answer despite invalid reasoning text. We identify factors that shift\nLLM recovery behavior: LLMs recover more frequently from obvious errors and in\ncontexts that provide more evidence for the correct answer. Critically, these\nfactors have divergent effects on faithful and unfaithful recoveries. Our\nresults indicate that there are distinct mechanisms driving faithful and\nunfaithful error recoveries. Selective targeting of these mechanisms may be\nable to drive down the rate of unfaithful reasoning and improve model\ninterpretability.\n","authors":["Evelyn Yee","Alice Li","Chenyu Tang","Yeon Ho Jung","Ramamohan Paturi","Leon Bergen"],"pdf_url":"https://arxiv.org/pdf/2405.15092v2.pdf","comment":"code published at\n https://github.com/CoTErrorRecovery/CoTErrorRecovery"},{"id":"http://arxiv.org/abs/2404.07981v2","updated":"2024-09-02T21:29:04Z","published":"2024-04-11T17:57:32Z","title":"Manipulating Large Language Models to Increase Product Visibility","summary":" Large language models (LLMs) are increasingly being integrated into search\nengines to provide natural language responses tailored to user queries.\nCustomers and end-users are also becoming more dependent on these models for\nquick and easy purchase decisions. In this work, we investigate whether\nrecommendations from LLMs can be manipulated to enhance a product's visibility.\nWe demonstrate that adding a strategic text sequence (STS) -- a carefully\ncrafted message -- to a product's information page can significantly increase\nits likelihood of being listed as the LLM's top recommendation. 
To understand\nthe impact of STS, we use a catalog of fictitious coffee machines and analyze\nits effect on two target products: one that seldom appears in the LLM's\nrecommendations and another that usually ranks second. We observe that the\nstrategic text sequence significantly enhances the visibility of both products\nby increasing their chances of appearing as the top recommendation. This\nability to manipulate LLM-generated search responses provides vendors with a\nconsiderable competitive advantage and has the potential to disrupt fair market\ncompetition. Just as search engine optimization (SEO) revolutionized how\nwebpages are customized to rank higher in search engine results, influencing\nLLM recommendations could profoundly impact content optimization for AI-driven\nsearch services. Code for our experiments is available at\nhttps://github.com/aounon/llm-rank-optimizer.\n","authors":["Aounon Kumar","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2404.07981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10999v2","updated":"2024-09-02T20:26:30Z","published":"2024-06-16T16:25:22Z","title":"Balancing Rigor and Utility: Mitigating Cognitive Biases in Large\n Language Models for Multiple-Choice Questions","summary":" This paper examines the role of cognitive biases in the decision-making\nprocesses of large language models (LLMs), challenging the conventional goal of\neliminating all biases. We show that certain cognitive biases when properly\nbalanced, can enhance decision-making efficiency through rational deviations\nand heuristic shortcuts. By introducing heuristic moderation and an abstention\noption, which allows LLMs to withhold responses when uncertain, we reduce error\nrates, improve decision accuracy, and optimize decision rates. Using the\nBalance Rigor and Utility (BRU) dataset, developed through expert\ncollaboration, our findings demonstrate that targeted inspection of cognitive\nbiases aligns LLM decisions more closely with human reasoning, enhancing\nreliability and suggesting strategies for future improvements. This approach\noffers a novel way to leverage cognitive biases to improve the practical\nutility of LLMs across various applications.\n","authors":["Liman Wang","Hanyang Zhong"],"pdf_url":"https://arxiv.org/pdf/2406.10999v2.pdf","comment":"This article is currently under review. All data will be open on\n GitHub once the review is complete.\n https://github.com/limanwang/Balancing-Rigor-and-Utility"},{"id":"http://arxiv.org/abs/2405.15077v4","updated":"2024-09-02T20:25:36Z","published":"2024-05-23T21:56:12Z","title":"Eliciting Informative Text Evaluations with Large Language Models","summary":" Peer prediction mechanisms motivate high-quality feedback with provable\nguarantees. However, current methods only apply to rather simple reports, like\nmultiple-choice or scalar numbers. We aim to broaden these techniques to the\nlarger domain of text-based reports, drawing on the recent developments in\nlarge language models. This vastly increases the applicability of peer\nprediction mechanisms as textual feedback is the norm in a large variety of\nfeedback channels: peer reviews, e-commerce customer reviews, and comments on\nsocial media.\n We introduce two mechanisms, the Generative Peer Prediction Mechanism (GPPM)\nand the Generative Synopsis Peer Prediction Mechanism (GSPPM). These mechanisms\nutilize LLMs as predictors, mapping from one agent's report to a prediction of\nher peer's report. 
Theoretically, we show that when the LLM prediction is\nsufficiently accurate, our mechanisms can incentivize high effort and\ntruth-telling as an (approximate) Bayesian Nash equilibrium. Empirically, we\nconfirm the efficacy of our mechanisms through experiments conducted on two\nreal datasets: the Yelp review dataset and the ICLR OpenReview dataset. We\nhighlight the results that on the ICLR dataset, our mechanisms can\ndifferentiate three quality levels -- human-written reviews, GPT-4-generated\nreviews, and GPT-3.5-generated reviews in terms of expected scores.\nAdditionally, GSPPM penalizes LLM-generated reviews more effectively than GPPM.\n","authors":["Yuxuan Lu","Shengwei Xu","Yichi Zhang","Yuqing Kong","Grant Schoenebeck"],"pdf_url":"https://arxiv.org/pdf/2405.15077v4.pdf","comment":"Accepted by the Twenty-Fifth ACM Conference on Economics and\n Computation (EC'24)"},{"id":"http://arxiv.org/abs/2408.13295v2","updated":"2024-09-02T17:00:05Z","published":"2024-08-23T14:47:10Z","title":"Exploring Bias and Prediction Metrics to Characterise the Fairness of\n Machine Learning for Equity-Centered Public Health Decision-Making: A\n Narrative Review","summary":" Background: The rapid advancement of Machine Learning (ML) represents novel\nopportunities to enhance public health research, surveillance, and\ndecision-making. However, there is a lack of comprehensive understanding of\nalgorithmic bias, systematic errors in predicted population health outcomes,\nresulting from the public health application of ML. The objective of this\nnarrative review is to explore the types of bias generated by ML and\nquantitative metrics to assess these biases.\n Methods: We performed a search on PubMed, MEDLINE, IEEE (Institute of\nElectrical and Electronics Engineers), ACM (Association for Computing\nMachinery) Digital Library, Science Direct, and Springer Nature. We used\nkeywords to identify studies describing types of bias and metrics to measure\nthese in the domain of ML and public and population health published in English\nbetween 2008 and 2023, inclusive.\n Results: A total of 72 articles met the inclusion criteria. Our review\nidentified the commonly described types of bias and quantitative metrics to\nassess these biases from an equity perspective.\n Conclusion: The review will help formalize the evaluation framework for ML\non public health from an equity perspective.\n","authors":["Shaina Raza","Arash Shaban-Nejad","Elham Dolatabadi","Hiroshi Mamiya"],"pdf_url":"https://arxiv.org/pdf/2408.13295v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2404.16160v2","updated":"2024-09-02T16:33:29Z","published":"2024-04-24T19:30:18Z","title":"Domain-Specific Improvement on Psychotherapy Chatbot Using Assistant","summary":" Large language models (LLMs) have demonstrated impressive generalization\ncapabilities on specific tasks with human-written instruction data. However,\nthe limited quantity, diversity, and professional expertise of such instruction\ndata raise concerns about the performance of LLMs in psychotherapy tasks when\nprovided with domain-specific instructions. To address this, we firstly propose\nDomain-Specific Assistant Instructions based on AlexanderStreet therapy, and\nsecondly, we use an adaptation fine-tuning method and retrieval augmented\ngeneration method to improve pre-trained LLMs. 
Through quantitative evaluation\nof linguistic quality using automatic and human evaluation, we observe that\npre-trained LLMs on Psychotherapy Assistant Instructions outperform\nstate-of-the-art LLMs response baselines. Our Assistant-Instruction approach\noffers a half-annotation method to align pre-trained LLMs with instructions and\nprovide pre-trained LLMs with more psychotherapy knowledge.\n","authors":["Cheng Kang","Daniel Novak","Katerina Urbanova","Yuqing Cheng","Yong Hu"],"pdf_url":"https://arxiv.org/pdf/2404.16160v2.pdf","comment":"Accepted at ICASSP 2024 EIHRC Workshop"},{"id":"http://arxiv.org/abs/2404.14024v2","updated":"2024-09-02T16:20:49Z","published":"2024-04-22T09:40:07Z","title":"Exploring neural oscillations during speech perception via surrogate\n gradient spiking neural networks","summary":" Understanding cognitive processes in the brain demands sophisticated models\ncapable of replicating neural dynamics at large scales. We present a\nphysiologically inspired speech recognition architecture, compatible and\nscalable with deep learning frameworks, and demonstrate that end-to-end\ngradient descent training leads to the emergence of neural oscillations in the\ncentral spiking neural network. Significant cross-frequency couplings,\nindicative of these oscillations, are measured within and across network layers\nduring speech processing, whereas no such interactions are observed when\nhandling background noise inputs. Furthermore, our findings highlight the\ncrucial inhibitory role of feedback mechanisms, such as spike frequency\nadaptation and recurrent connections, in regulating and synchronising neural\nactivity to improve recognition performance. Overall, on top of developing our\nunderstanding of synchronisation phenomena notably observed in the human\nauditory pathway, our architecture exhibits dynamic and efficient information\nprocessing, with relevance to neuromorphic technology.\n","authors":["Alexandre Bittar","Philip N. Garner"],"pdf_url":"https://arxiv.org/pdf/2404.14024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13152v2","updated":"2024-09-02T15:42:03Z","published":"2024-06-19T02:00:51Z","title":"Analyzing Diversity in Healthcare LLM Research: A Scientometric\n Perspective","summary":" The deployment of large language models (LLMs) in healthcare has demonstrated\nsubstantial potential for enhancing clinical decision-making, administrative\nefficiency, and patient outcomes. However, the underrepresentation of diverse\ngroups in the development and application of these models can perpetuate\nbiases, leading to inequitable healthcare delivery. This paper presents a\ncomprehensive scientometric analysis of LLM research for healthcare, including\ndata from January 1, 2021, to July 1, 2024. By analyzing metadata from PubMed\nand Dimensions, including author affiliations, countries, and funding sources,\nwe assess the diversity of contributors to LLM research. Our findings highlight\nsignificant gender and geographic disparities, with a predominance of male\nauthors and contributions primarily from high-income countries (HICs). We\nintroduce a novel journal diversity index based on Gini diversity to measure\nthe inclusiveness of scientific publications. Our results underscore the\nnecessity for greater representation in order to ensure the equitable\napplication of LLMs in healthcare. 
We propose actionable strategies to enhance\ndiversity and inclusivity in artificial intelligence research, with the\nultimate goal of fostering a more inclusive and equitable future in healthcare\ninnovation.\n","authors":["David Restrepo","Chenwei Wu","Constanza Vásquez-Venegas","João Matos","Jack Gallifant","Leo Anthony Celi","Danielle S. Bitterman","Luis Filipe Nakayama"],"pdf_url":"https://arxiv.org/pdf/2406.13152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02887v2","updated":"2024-09-02T15:41:34Z","published":"2024-05-05T10:52:09Z","title":"Sentiment Analysis Across Languages: Evaluation Before and After Machine\n Translation to English","summary":" People communicate in more than 7,000 languages around the world, with around\n780 languages spoken in India alone. Despite this linguistic diversity,\nresearch on Sentiment Analysis has predominantly focused on English text data,\nresulting in a disproportionate availability of sentiment resources for\nEnglish. This paper examines the performance of transformer models in Sentiment\nAnalysis tasks across multilingual datasets and text that has undergone machine\ntranslation. By comparing the effectiveness of these models in different\nlinguistic contexts, we gain insights into their performance variations and\npotential implications for sentiment analysis across diverse languages. We also\ndiscuss the shortcomings and potential for future work towards the end.\n","authors":["Aekansh Kathunia","Mohammad Kaif","Nalin Arora","N Narotam"],"pdf_url":"https://arxiv.org/pdf/2405.02887v2.pdf","comment":"6 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2310.12537v3","updated":"2024-09-02T12:36:06Z","published":"2023-10-19T07:39:00Z","title":"ExtractGPT: Exploring the Potential of Large Language Models for Product\n Attribute Value Extraction","summary":" In order to facilitate features such as faceted product search and product\ncomparison, e-commerce platforms require accurately structured product data,\nincluding precise attribute/value pairs. Vendors often times provide\nunstructured product descriptions consisting only of an offer title and a\ntextual description. Consequently, extracting attribute values from titles and\ndescriptions is vital for e-commerce platforms. State-of-the-art attribute\nvalue extraction methods based on pre-trained language models, such as BERT,\nface two drawbacks (i) the methods require significant amounts of task-specific\ntraining data and (ii) the fine-tuned models have problems with generalising to\nunseen attribute values that were not part of the training data. This paper\nexplores the potential of using large language models as a more training\ndata-efficient and more robust alternative to existing AVE methods. We propose\nprompt templates for describing the target attributes of the extraction to the\nLLM, covering both zero-shot and few-shot scenarios. In the zero-shot scenario,\ntextual and JSON-based target schema representations of the attributes are\ncompared. In the few-shot scenario, we investigate (i) the provision of example\nattribute values, (ii) the selection of in-context demonstrations, (iii)\nshuffled ensembling to prevent position bias, and (iv) fine-tuning the LLM. We\nevaluate the prompt templates in combination with hosted LLMs, such as GPT-3.5\nand GPT-4, and open-source LLMs which can be run locally. We compare the\nperformance of the LLMs to the PLM-based methods SU-OpenTag, AVEQA, and MAVEQA.\nThe highest average F1-score of 86% was achieved by GPT-4. 
Llama-3-70B performs\nonly 3% worse than GPT-4, making it a competitive open-source alternative.\nGiven the same training data, this prompt/GPT-4 combination outperforms the\nbest PLM baseline by an average of 6% F1-score.\n","authors":["Alexander Brinkmann","Roee Shraga","Christian Bizer"],"pdf_url":"https://arxiv.org/pdf/2310.12537v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20352v2","updated":"2024-09-02T12:07:54Z","published":"2023-10-31T10:47:33Z","title":"AMERICANO: Argument Generation with Discourse-driven Decomposition and\n Agent Interaction","summary":" Argument generation is a challenging task in natural language processing,\nwhich requires rigorous reasoning and proper content organization. Inspired by\nrecent chain-of-thought prompting that breaks down a complex task into\nintermediate steps, we propose Americano, a novel framework with agent\ninteraction for argument generation. Our approach decomposes the generation\nprocess into sequential actions grounded on argumentation theory, which first\nexecutes actions sequentially to generate argumentative discourse components,\nand then produces a final argument conditioned on the components. To further\nmimic the human writing process and improve the left-to-right generation\nparadigm of current autoregressive language models, we introduce an argument\nrefinement module which automatically evaluates and refines argument drafts\nbased on feedback received. We evaluate our framework on the task of\ncounterargument generation using a subset of Reddit/CMV dataset. The results\nshow that our method outperforms both end-to-end and chain-of-thought prompting\nmethods and can generate more coherent and persuasive arguments with diverse\nand rich contents.\n","authors":["Zhe Hu","Hou Pong Chan","Yu Yin"],"pdf_url":"https://arxiv.org/pdf/2310.20352v2.pdf","comment":"INLG 2024"},{"id":"http://arxiv.org/abs/2408.14438v3","updated":"2024-09-02T11:59:05Z","published":"2024-08-26T17:25:16Z","title":"Evaluating Large Language Models on Spatial Tasks: A Multi-Task\n Benchmarking Study","summary":" The advent of large language models such as ChatGPT, Gemini, and others has\nunderscored the importance of evaluating their diverse capabilities, ranging\nfrom natural language understanding to code generation. However, their\nperformance on spatial tasks has not been comprehensively assessed. This study\naddresses this gap by introducing a novel multi-task spatial evaluation\ndataset, designed to systematically explore and compare the performance of\nseveral advanced models on spatial tasks. The dataset encompasses twelve\ndistinct task types, including spatial understanding and path planning, each\nwith verified, accurate answers. We evaluated multiple models, including\nOpenAI's gpt-3.5-turbo, gpt-4o, and ZhipuAI's glm-4, through a two-phase\ntesting approach. Initially, we conducted zero-shot testing, followed by\ncategorizing the dataset by difficulty and performing prompt tuning tests.\nResults indicate that gpt-4o achieved the highest overall accuracy in the first\nphase, with an average of 71.3%. Although moonshot-v1-8k slightly\nunderperformed overall, it surpassed gpt-4o in place name recognition tasks.\nThe study also highlights the impact of prompt strategies on model performance\nin specific tasks. 
For example, the Chain-of-Thought (COT) strategy increased\ngpt-4o's accuracy in path planning from 12.4% to 87.5%, while a one-shot\nstrategy enhanced moonshot-v1-8k's accuracy in mapping tasks from 10.1% to\n76.3%.\n","authors":["Liuchang Xu","Shuo Zhao","Qingming Lin","Luyao Chen","Qianqian Luo","Sensen Wu","Xinyue Ye","Hailin Feng","Zhenhong Du"],"pdf_url":"https://arxiv.org/pdf/2408.14438v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07448v2","updated":"2024-09-02T11:45:41Z","published":"2024-08-14T10:36:17Z","title":"LiveFC: A System for Live Fact-Checking of Audio Streams","summary":" The advances in the digital era have led to rapid dissemination of\ninformation. This has also aggravated the spread of misinformation and\ndisinformation. This has potentially serious consequences, such as civil\nunrest. While fact-checking aims to combat this, manual fact-checking is\ncumbersome and not scalable. While automated fact-checking approaches exist,\nthey do not operate in real-time and do not always account for spread of\nmisinformation through different modalities. This is particularly important as\nproactive fact-checking on live streams in real-time can help people be\ninformed of false narratives and prevent catastrophic consequences that may\ncause civil unrest. This is particularly relevant with the rapid dissemination\nof information through video on social media platforms or other streams like\npolitical rallies and debates. Hence, in this work we develop a platform named\nLiveFC, that can aid in fact-checking live audio streams in real-time. LiveFC\nhas a user-friendly interface that displays the claims detected along with\ntheir veracity and evidence for live streams with associated speakers for\nclaims from respective segments. The app can be accessed at\nhttp://livefc.factiverse.ai and a screen recording of the demo can be found at\nhttps://bit.ly/3WVAoIw.\n","authors":["Venktesh V","Vinay Setty"],"pdf_url":"https://arxiv.org/pdf/2408.07448v2.pdf","comment":"Under Review, 11 pages"},{"id":"http://arxiv.org/abs/2403.08564v3","updated":"2024-09-02T11:09:55Z","published":"2024-03-13T14:19:08Z","title":"Generalizing Fairness to Generative Language Models via Reformulation of\n Non-discrimination Criteria","summary":" Generative AI, such as large language models, has undergone rapid development\nwithin recent years. As these models become increasingly available to the\npublic, concerns arise about perpetuating and amplifying harmful biases in\napplications. Gender stereotypes can be harmful and limiting for the\nindividuals they target, whether they consist of misrepresentation or\ndiscrimination. Recognizing gender bias as a pervasive societal construct, this\npaper studies how to uncover and quantify the presence of gender biases in\ngenerative language models. In particular, we derive generative AI analogues of\nthree well-known non-discrimination criteria from classification, namely\nindependence, separation and sufficiency. To demonstrate these criteria in\naction, we design prompts for each of the criteria with a focus on occupational\ngender stereotype, specifically utilizing the medical test to introduce the\nground truth in the generative AI context. 
Our results address the presence of\noccupational gender bias within such conversational language models.\n","authors":["Sara Sterlie","Nina Weng","Aasa Feragen"],"pdf_url":"https://arxiv.org/pdf/2403.08564v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05141v3","updated":"2024-09-02T10:55:30Z","published":"2024-08-09T15:53:55Z","title":"A Hybrid RAG System with Comprehensive Enhancement on Complex Reasoning","summary":" Retrieval-augmented generation (RAG) is a framework enabling large language\nmodels (LLMs) to enhance their accuracy and reduce hallucinations by\nintegrating external knowledge bases. In this paper, we introduce a hybrid RAG\nsystem enhanced through a comprehensive suite of optimizations that\nsignificantly improve retrieval quality, augment reasoning capabilities, and\nrefine numerical computation ability. We refined the text chunks and tables in\nweb pages, added attribute predictors to reduce hallucinations, conducted LLM\nKnowledge Extractor and Knowledge Graph Extractor, and finally built a\nreasoning strategy with all the references. We evaluated our system on the CRAG\ndataset through the Meta CRAG KDD Cup 2024 Competition. Both the local and\nonline evaluations demonstrate that our system significantly enhances complex\nreasoning capabilities. In local evaluations, we have significantly improved\naccuracy and reduced error rates compared to the baseline model, achieving a\nnotable increase in scores. In the meanwhile, we have attained outstanding\nresults in online assessments, demonstrating the performance and generalization\ncapabilities of the proposed system. The source code for our system is released\nin \\url{https://gitlab.aicrowd.com/shizueyy/crag-new}.\n","authors":["Ye Yuan","Chengwu Liu","Jingyang Yuan","Gongbo Sun","Siqi Li","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05141v3.pdf","comment":"Technical report for 3rd prize in Task 1 of Meta CRAG KDD Cup 2024"},{"id":"http://arxiv.org/abs/2406.03816v2","updated":"2024-09-02T09:48:18Z","published":"2024-06-06T07:40:00Z","title":"ReST-MCTS*: LLM Self-Training via Process Reward Guided Tree Search","summary":" Recent methodologies in LLM self-training mostly rely on LLM generating\nresponses and filtering those with correct output answers as training data.\nThis approach often yields a low-quality fine-tuning training set (e.g.,\nincorrect plans or intermediate reasoning). In this paper, we develop a\nreinforced self-training approach, called ReST-MCTS*, based on integrating\nprocess reward guidance with tree search MCTS* for collecting higher-quality\nreasoning traces as well as per-step value to train policy and reward models.\nReST-MCTS* circumvents the per-step manual annotation typically used to train\nprocess rewards by tree-search-based reinforcement learning: Given oracle final\ncorrect answers, ReST-MCTS* is able to infer the correct process rewards by\nestimating the probability this step can help lead to the correct answer. These\ninferred rewards serve dual purposes: they act as value targets for further\nrefining the process reward model and also facilitate the selection of\nhigh-quality traces for policy model self-training. We first show that the\ntree-search policy in ReST-MCTS* achieves higher accuracy compared with prior\nLLM reasoning baselines such as Best-of-N and Tree-of-Thought, within the same\nsearch budget. 
We then show that by using traces searched by this tree-search\npolicy as training data, we can continuously enhance the three language models\nfor multiple iterations, and outperform other self-training algorithms such as\nReST$^\\text{EM}$ and Self-Rewarding LM.\n","authors":["Dan Zhang","Sining Zhoubian","Ziniu Hu","Yisong Yue","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2406.03816v2.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2406.16069v2","updated":"2024-09-02T09:13:51Z","published":"2024-06-23T10:36:35Z","title":"FastMem: Fast Memorization of Prompt Improves Context Awareness of Large\n Language Models","summary":" Large language models (LLMs) excel in generating coherent text, but they\noften struggle with context awareness, leading to inaccuracies in tasks\nrequiring faithful adherence to provided information. We introduce FastMem, a\nnovel method designed to enhance instruction fine-tuned LLMs' context awareness\nthrough fast memorization of the prompt. FastMem maximizes the likelihood of\nthe prompt before inference by fine-tuning only the last Feed-Forward Network\n(FFN) module. This targeted approach ensures efficient optimization without\noverfitting, significantly improving the model's ability to comprehend and\naccurately follow the context. Our experiments demonstrate substantial gains in\nreading comprehension, text summarization and adherence to output structures.\nFor instance, FastMem improves the accuracy of Llama 3-8B-Inst on the NQ-SWAP\ndataset from 59.1% to 71.6%, and reduces the output structure failure rate of\nQwen 1.5-4B-Chat from 34.9% to 25.5%. Extensive experimental results highlight\nFastMem's potential to offer a robust solution to enhance the reliability and\naccuracy of LLMs in various applications. Our code is available at:\nhttps://github.com/IAAR-Shanghai/FastMem\n","authors":["Junyi Zhu","Shuochen Liu","Yu Yu","Bo Tang","Yibo Yan","Zhiyu Li","Feiyu Xiong","Tong Xu","Matthew B. Blaschko"],"pdf_url":"https://arxiv.org/pdf/2406.16069v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16837v3","updated":"2024-09-02T08:33:21Z","published":"2023-06-29T10:29:23Z","title":"A Formal Perspective on Byte-Pair Encoding","summary":" Byte-Pair Encoding (BPE) is a popular algorithm used for tokenizing data in\nNLP, despite being devised initially as a compression method. BPE appears to be\na greedy algorithm at face value, but the underlying optimization problem that\nBPE seeks to solve has not yet been laid down. We formalize BPE as a\ncombinatorial optimization problem. Via submodular functions, we prove that the\niterative greedy version is a\n$\\frac{1}{{\\sigma(\\boldsymbol{\\mu}^\\star)}}(1-e^{-{\\sigma(\\boldsymbol{\\mu}^\\star)}})$-approximation\nof an optimal merge sequence, where ${\\sigma(\\boldsymbol{\\mu}^\\star)}$ is the\ntotal backward curvature with respect to the optimal merge sequence\n$\\boldsymbol{\\mu}^\\star$. 
Empirically the lower bound of the approximation is\n$\\approx 0.37$.\n We provide a faster implementation of BPE which improves the runtime\ncomplexity from $\\mathcal{O}\\left(N M\\right)$ to $\\mathcal{O}\\left(N \\log\nM\\right)$, where $N$ is the sequence length and $M$ is the merge count.\nFinally, we optimize the brute-force algorithm for optimal BPE using\nmemoization.\n","authors":["Vilém Zouhar","Clara Meister","Juan Luis Gastaldi","Li Du","Tim Vieira","Mrinmaya Sachan","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2306.16837v3.pdf","comment":"ACL 2023"},{"id":"http://arxiv.org/abs/2408.15366v2","updated":"2024-09-02T08:18:52Z","published":"2024-08-27T19:03:11Z","title":"Pitfalls and Outlooks in Using COMET","summary":" Since its introduction, the COMET metric has blazed a trail in the machine\ntranslation community, given its strong correlation with human judgements of\ntranslation quality. Its success stems from being a modified pre-trained\nmultilingual model finetuned for quality assessment. However, it being a\nmachine learning model also gives rise to a new set of pitfalls that may not be\nwidely known. We investigate these unexpected behaviours from three aspects: 1)\ntechnical: obsolete software versions and compute precision; 2) data: empty\ncontent, language mismatch, and translationese at test time as well as\ndistribution and domain biases in training; 3) usage and reporting:\nmulti-reference support and model referencing in the literature. All of these\nproblems imply that COMET scores are not comparable between papers or even\ntechnical setups and we put forward our perspective on fixing each issue.\nFurthermore, we release the SacreCOMET package that can generate a signature\nfor the software and model configuration as well as an appropriate citation.\nThe goal of this work is to help the community make more sound use of the COMET\nmetric.\n","authors":["Vilém Zouhar","Pinzhen Chen","Tsz Kin Lam","Nikita Moghe","Barry Haddow"],"pdf_url":"https://arxiv.org/pdf/2408.15366v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16020v3","updated":"2024-09-02T07:54:54Z","published":"2024-06-23T05:40:26Z","title":"AudioBench: A Universal Benchmark for Audio Large Language Models","summary":" We introduce AudioBench, a universal benchmark designed to evaluate Audio\nLarge Language Models (AudioLLMs). It encompasses 8 distinct tasks and 26\ndatasets, among which, 7 are newly proposed datasets. The evaluation targets\nthree main aspects: speech understanding, audio scene understanding, and voice\nunderstanding (paralinguistic). Despite recent advancements, there lacks a\ncomprehensive benchmark for AudioLLMs on instruction following capabilities\nconditioned on audio signals. AudioBench addresses this gap by setting up\ndatasets as well as desired evaluation metrics. Besides, we also evaluated the\ncapabilities of five popular models and found that no single model excels\nconsistently across all tasks. We outline the research outlook for AudioLLMs\nand anticipate that our open-sourced evaluation toolkit, data, and leaderboard\nwill offer a robust testbed for future model developments.\n","authors":["Bin Wang","Xunlong Zou","Geyu Lin","Shuo Sun","Zhuohan Liu","Wenyu Zhang","Zhengyuan Liu","AiTi Aw","Nancy F. 
Chen"],"pdf_url":"https://arxiv.org/pdf/2406.16020v3.pdf","comment":"v3 - Abundent update on models and evaluation details; Code:\n https://github.com/AudioLLMs/AudioBench"},{"id":"http://arxiv.org/abs/2308.09067v3","updated":"2024-09-02T07:26:46Z","published":"2023-08-17T15:54:38Z","title":"Contrasting Linguistic Patterns in Human and LLM-Generated News Text","summary":" We conduct a quantitative analysis contrasting human-written English news\ntext with comparable large language model (LLM) output from six different LLMs\nthat cover three different families and four sizes in total. Our analysis spans\nseveral measurable linguistic dimensions, including morphological, syntactic,\npsychometric, and sociolinguistic aspects. The results reveal various\nmeasurable differences between human and AI-generated texts. Human texts\nexhibit more scattered sentence length distributions, more variety of\nvocabulary, a distinct use of dependency and constituent types, shorter\nconstituents, and more optimized dependency distances. Humans tend to exhibit\nstronger negative emotions (such as fear and disgust) and less joy compared to\ntext generated by LLMs, with the toxicity of these models increasing as their\nsize grows. LLM outputs use more numbers, symbols and auxiliaries (suggesting\nobjective language) than human texts, as well as more pronouns. The sexist bias\nprevalent in human text is also expressed by LLMs, and even magnified in all of\nthem but one. Differences between LLMs and humans are larger than between LLMs.\n","authors":["Alberto Muñoz-Ortiz","Carlos Gómez-Rodríguez","David Vilares"],"pdf_url":"https://arxiv.org/pdf/2308.09067v3.pdf","comment":"Published at Artificial Intelligence Review vol. 57, 265"},{"id":"http://arxiv.org/abs/2310.05191v2","updated":"2024-09-02T06:24:32Z","published":"2023-10-08T15:00:04Z","title":"LLM-as-a-tutor in EFL Writing Education: Focusing on Evaluation of\n Student-LLM Interaction","summary":" In the context of English as a Foreign Language (EFL) writing education,\nLLM-as-a-tutor can assist students by providing real-time feedback on their\nessays. However, challenges arise in assessing LLM-as-a-tutor due to differing\nstandards between educational and general use cases. To bridge this gap, we\nintegrate pedagogical principles to assess student-LLM interaction. First, we\nexplore how LLMs can function as English tutors, providing effective essay\nfeedback tailored to students. Second, we propose three metrics to evaluate\nLLM-as-a-tutor specifically designed for EFL writing education, emphasizing\npedagogical aspects. In this process, EFL experts evaluate the feedback from\nLLM-as-a-tutor regarding quality and characteristics. 
On the other hand, EFL\nlearners assess their learning outcomes from interaction with LLM-as-a-tutor.\nThis approach lays the groundwork for developing LLMs-as-a-tutor tailored to\nthe needs of EFL learners, advancing the effectiveness of writing education in\nthis context.\n","authors":["Jieun Han","Haneul Yoo","Junho Myung","Minsun Kim","Hyunseung Lim","Yoonsu Kim","Tak Yeon Lee","Hwajung Hong","Juho Kim","So-Yeon Ahn","Alice Oh"],"pdf_url":"https://arxiv.org/pdf/2310.05191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14033v2","updated":"2024-09-02T05:55:06Z","published":"2024-08-26T05:55:48Z","title":"MLR-Copilot: Autonomous Machine Learning Research based on Large\n Language Models Agents","summary":" Machine learning research, crucial for technological advancements and\ninnovation, often faces significant challenges due to its inherent complexity,\nslow pace of experimentation, and the necessity for specialized expertise.\nMotivated by this, we present a new systematic framework, autonomous Machine\nLearning Research with large language models (MLR-Copilot), designed to enhance\nmachine learning research productivity through the automatic generation and\nimplementation of research ideas using Large Language Model (LLM) agents. The\nframework consists of three phases: research idea generation, experiment\nimplementation, and implementation execution. First, existing research papers\nare used to generate hypotheses and experimental plans via IdeaAgent powered by\nLLMs. Next, the implementation generation phase translates these plans into\nexecutables with ExperimentAgent. This phase leverages retrieved prototype code\nand optionally retrieves candidate models and data. Finally, the execution\nphase, also managed by ExperimentAgent, involves running experiments with\nmechanisms for human feedback and iterative debugging to enhance the likelihood\nof achieving executable research outcomes. We evaluate our framework on five\nmachine learning research tasks and the experimental results show the\nframework's potential to facilitate the research progress and innovations.\n","authors":["Ruochen Li","Teerth Patel","Qingyun Wang","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2408.14033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03024v3","updated":"2024-09-02T05:51:02Z","published":"2023-08-06T05:23:25Z","title":"Show Me the World in My Language: Establishing the First Baseline for\n Scene-Text to Scene-Text Translation","summary":" In this work, we study the task of ``visually'' translating scene text from a\nsource language (e.g., Hindi) to a target language (e.g., English). Visual\ntranslation involves not just the recognition and translation of scene text but\nalso the generation of the translated image that preserves visual features of\nthe source scene text, such as font, size, and background. There are several\nchallenges associated with this task, such as translation with limited context,\ndeciding between translation and transliteration, accommodating varying text\nlengths within fixed spatial boundaries, and preserving the font and background\nstyles of the source scene text in the target language. To address this\nproblem, we make the following contributions: (i) We study visual translation\nas a standalone problem for the first time in the literature. (ii) We present a\ncascaded framework for visual translation that combines state-of-the-art\nmodules for scene text recognition, machine translation, and scene text\nsynthesis as a baseline for the task. 
(iii) We propose a set of task-specific\ndesign enhancements to design a variant of the baseline to obtain performance\nimprovements. (iv) Currently, the existing related literature lacks any\ncomprehensive performance evaluation for this novel task. To fill this gap, we\nintroduce several automatic and user-assisted evaluation metrics designed\nexplicitly for evaluating visual translation. Further, we evaluate presented\nbaselines for translating scene text between Hindi and English. Our experiments\ndemonstrate that although we can effectively perform visual translation over a\nlarge collection of scene text images, the presented baseline only partially\naddresses challenges posed by visual translation tasks. We firmly believe that\nthis new task and the limitations of existing models, as reported in this\npaper, should encourage further research in visual translation.\n","authors":["Shreyas Vaidya","Arvind Kumar Sharma","Prajwal Gatti","Anand Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.03024v3.pdf","comment":"Accepted at ICPR 2024, Project Website:\n https://vl2g.github.io/projects/visTrans/"},{"id":"http://arxiv.org/abs/2403.06764v3","updated":"2024-09-02T05:48:54Z","published":"2024-03-11T14:35:32Z","title":"An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference\n Acceleration for Large Vision-Language Models","summary":" In this study, we identify the inefficient attention phenomena in Large\nVision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5,\nQwenVL-Chat and Video-LLaVA. We find out that the attention computation over\nvisual tokens is of extreme inefficiency in the deep layers of popular LVLMs,\nsuggesting a need for a sparser approach compared to textual data handling. To\nthis end, we introduce FastV, a versatile plug-and-play method designed to\noptimize computational efficiency by learning adaptive attention patterns in\nearly layers and pruning visual tokens in subsequent ones. Our evaluations\ndemonstrate FastV's ability to dramatically reduce computational costs (e.g., a\n45% reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a\nwide range of image and video understanding tasks. The computational efficiency\nand performance trade-off of FastV are highly customizable and\npareto-efficient. It can compress the FLOPs of a 13B-parameter model to achieve\na lower budget than that of a 7B-parameter model, while still maintaining\nsuperior performance. We believe FastV has practical values for deployment of\nLVLMs in edge devices and commercial models. Code is released at\nhttps://github.com/pkunlp-icler/FastV.\n","authors":["Liang Chen","Haozhe Zhao","Tianyu Liu","Shuai Bai","Junyang Lin","Chang Zhou","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2403.06764v3.pdf","comment":"Accepted to ECCV 2024 (Oral), code is released at\n https://github.com/pkunlp-icler/FastV,"},{"id":"http://arxiv.org/abs/2406.10311v2","updated":"2024-09-02T03:37:35Z","published":"2024-06-14T06:47:40Z","title":"CHiSafetyBench: A Chinese Hierarchical Safety Benchmark for Large\n Language Models","summary":" With the profound development of large language models (LLMs), their safety\nconcerns have garnered increasing attention. However, there is a scarcity of\nChinese safety benchmarks for LLMs, and the existing safety taxonomies are\ninadequate, lacking comprehensive safety detection capabilities in authentic\nChinese scenarios. 
In this work, we introduce CHiSafetyBench, a dedicated\nsafety benchmark for evaluating LLMs' capabilities in identifying risky content\nand refusing answering risky questions in Chinese contexts. CHiSafetyBench\nincorporates a dataset that covers a hierarchical Chinese safety taxonomy\nconsisting of 5 risk areas and 31 categories. This dataset comprises two types\nof tasks: multiple-choice questions and question-answering, evaluating LLMs\nfrom the perspectives of risk content identification and the ability to refuse\nanswering risky questions respectively. Utilizing this benchmark, we validate\nthe feasibility of automatic evaluation as a substitute for human evaluation\nand conduct comprehensive automatic safety assessments on mainstream Chinese\nLLMs. Our experiments reveal the varying performance of different models across\nvarious safety domains, indicating that all models possess considerable\npotential for improvement in Chinese safety capabilities. Our dataset is\npublicly available at\nhttps://github.com/UnicomAI/UnicomBenchmark/tree/main/CHiSafetyBench.\n","authors":["Wenjing Zhang","Xuejiao Lei","Zhaoxiang Liu","Meijuan An","Bikun Yang","KaiKai Zhao","Kai Wang","Shiguo Lian"],"pdf_url":"https://arxiv.org/pdf/2406.10311v2.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2405.04818v2","updated":"2024-09-02T02:44:11Z","published":"2024-05-08T05:36:52Z","title":"ACORN: Aspect-wise Commonsense Reasoning Explanation Evaluation","summary":" Evaluating the quality of free-text explanations is a multifaceted,\nsubjective, and labor-intensive task. Large language models (LLMs) present an\nappealing alternative due to their potential for consistency, scalability, and\ncost-efficiency. In this work, we present ACORN, a new dataset of 3,500\nfree-text explanations and aspect-wise quality ratings, and use it to evaluate\nhow LLMs rate explanations. We observed that larger models outputted labels\nthat maintained or increased the inter-annotator agreement, suggesting that\nthey are within the expected variance between human raters. However, their\ncorrelation with majority-voted human ratings varied across different quality\naspects, indicating that they are not a complete replacement. In turn, using\nLLMs as a supplement to a smaller group of human raters in some cases improved\nthe correlation with the original majority labels. However, the effect was\nlimited to cases where human raters were scarce, and an additional human rater\nhad a more pronounced effect in all cases. Overall, we recommend against using\nLLMs as a complete replacement for human raters but encourage using them in\nconfigurations that end with targeted human involvement. Data available here:\nhttps://github.com/a-brassard/ACORN\n","authors":["Ana Brassard","Benjamin Heinzerling","Keito Kudo","Keisuke Sakaguchi","Kentaro Inui"],"pdf_url":"https://arxiv.org/pdf/2405.04818v2.pdf","comment":"18 pages, 7 figures, accepted to COLM 2024. Data available here:\n https://github.com/a-brassard/ACORN"},{"id":"http://arxiv.org/abs/2402.14154v3","updated":"2024-09-02T02:41:26Z","published":"2024-02-21T22:27:40Z","title":"MM-Soc: Benchmarking Multimodal Large Language Models in Social Media\n Platforms","summary":" Social media platforms are hubs for multimodal information exchange,\nencompassing text, images, and videos, making it challenging for machines to\ncomprehend the information or emotions associated with interactions in online\nspaces. 
Multimodal Large Language Models (MLLMs) have emerged as a promising\nsolution to these challenges, yet they struggle to accurately interpret human\nemotions and complex content such as misinformation. This paper introduces\nMM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of\nmultimodal social media content. MM-Soc compiles prominent multimodal datasets\nand incorporates a novel large-scale YouTube tagging dataset, targeting a range\nof tasks from misinformation detection, hate speech detection, and social\ncontext generation. Through our exhaustive evaluation on ten size-variants of\nfour open-source MLLMs, we have identified significant performance disparities,\nhighlighting the need for advancements in models' social understanding\ncapabilities. Our analysis reveals that, in a zero-shot setting, various types\nof MLLMs generally exhibit difficulties in handling social media tasks.\nHowever, MLLMs demonstrate performance improvements post fine-tuning,\nsuggesting potential pathways for improvement. Our code and data are available\nat https://github.com/claws-lab/MMSoc.git.\n","authors":["Yiqiao Jin","Minje Choi","Gaurav Verma","Jindong Wang","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2402.14154v3.pdf","comment":"In Proceedings of ACL 2024"},{"id":"http://arxiv.org/abs/2408.15879v2","updated":"2024-09-02T02:30:51Z","published":"2024-08-28T15:50:41Z","title":"Persuasion Games using Large Language Models","summary":" Large Language Models (LLMs) have emerged as formidable instruments capable\nof comprehending and producing human-like text. This paper explores the\npotential of LLMs, to shape user perspectives and subsequently influence their\ndecisions on particular tasks. This capability finds applications in diverse\ndomains such as Investment, Credit cards and Insurance, wherein they assist\nusers in selecting appropriate insurance policies, investment plans, Credit\ncards, Retail, as well as in Behavioral Change Support Systems (BCSS).\n We present a sophisticated multi-agent framework wherein a consortium of\nagents operate in collaborative manner. The primary agent engages directly with\nuser agents through persuasive dialogue, while the auxiliary agents perform\ntasks such as information retrieval, response analysis, development of\npersuasion strategies, and validation of facts. Empirical evidence from our\nexperiments demonstrates that this collaborative methodology significantly\nenhances the persuasive efficacy of the LLM. We continuously analyze the\nresistance of the user agent to persuasive efforts and counteract it by\nemploying a combination of rule-based and LLM-based resistance-persuasion\nmapping techniques.\n We employ simulated personas and generate conversations in insurance,\nbanking, and retail domains to evaluate the proficiency of large language\nmodels (LLMs) in recognizing, adjusting to, and influencing various personality\ntypes. Concurrently, we examine the resistance mechanisms employed by LLM\nsimulated personas. 
Persuasion is quantified via measurable surveys before and\nafter interaction, LLM-generated scores on conversation, and user decisions\n(purchase or non-purchase).\n","authors":["Ganesh Prasath Ramani","Shirish Karande","Santhosh V","Yash Bhatia"],"pdf_url":"https://arxiv.org/pdf/2408.15879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06458v2","updated":"2024-09-02T02:26:18Z","published":"2023-10-10T09:29:38Z","title":"Cultural Compass: Predicting Transfer Learning Success in Offensive\n Language Detection with Cultural Features","summary":" The increasing ubiquity of language technology necessitates a shift towards\nconsidering cultural diversity in the machine learning realm, particularly for\nsubjective tasks that rely heavily on cultural nuances, such as Offensive\nLanguage Detection (OLD). Current understanding underscores that these tasks\nare substantially influenced by cultural values, however, a notable gap exists\nin determining if cultural features can accurately predict the success of\ncross-cultural transfer learning for such subjective tasks. Addressing this,\nour study delves into the intersection of cultural features and transfer\nlearning effectiveness. The findings reveal that cultural value surveys indeed\npossess a predictive power for cross-cultural transfer learning success in OLD\ntasks and that it can be further improved using offensive word distance. Based\non these results, we advocate for the integration of cultural information into\ndatasets. Additionally, we recommend leveraging data sources rich in cultural\ninformation, such as surveys, to enhance cultural adaptability. Our research\nsignifies a step forward in the quest for more inclusive, culturally sensitive\nlanguage technologies.\n","authors":["Li Zhou","Antonia Karamolegkou","Wenyu Chen","Daniel Hershcovich"],"pdf_url":"https://arxiv.org/pdf/2310.06458v2.pdf","comment":"Findings of EMNLP 2023 (update)"},{"id":"http://arxiv.org/abs/2303.12816v4","updated":"2024-09-02T01:48:34Z","published":"2023-03-22T07:34:33Z","title":"From Wide to Deep: Dimension Lifting Network for Parameter-efficient\n Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) that maps entities and relations into vector\nrepresentations is essential for downstream applications. Conventional KGE\nmethods require high-dimensional representations to learn the complex structure\nof knowledge graph, but lead to oversized model parameters. Recent advances\nreduce parameters by low-dimensional entity representations, while developing\ntechniques (e.g., knowledge distillation or reinvented representation forms) to\ncompensate for reduced dimension. However, such operations introduce\ncomplicated computations and model designs that may not benefit large knowledge\ngraphs. To seek a simple strategy to improve the parameter efficiency of\nconventional KGE models, we take inspiration from that deeper neural networks\nrequire exponentially fewer parameters to achieve expressiveness comparable to\nwider networks for compositional structures. We view all entity representations\nas a single-layer embedding network, and conventional KGE methods that adopt\nhigh-dimensional entity representations equal widening the embedding network to\ngain expressiveness. 
To achieve parameter efficiency, we instead propose a\ndeeper embedding network for entity representations, i.e., a narrow entity\nembedding layer plus a multi-layer dimension lifting network (LiftNet).\nExperiments on three public datasets show that by integrating LiftNet, four\nconventional KGE methods with 16-dimensional representations achieve comparable\nlink prediction accuracy as original models that adopt 512-dimensional\nrepresentations, saving 68.4% to 96.9% parameters.\n","authors":["Borui Cai","Yong Xiang","Longxiang Gao","Di Wu","He Zhang","Jiong Jin","Tom Luan"],"pdf_url":"https://arxiv.org/pdf/2303.12816v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01497v1","updated":"2024-09-02T23:37:20Z","published":"2024-09-02T23:37:20Z","title":"DiversityMedQA: Assessing Demographic Biases in Medical Diagnosis using\n Large Language Models","summary":" As large language models (LLMs) gain traction in healthcare, concerns about\ntheir susceptibility to demographic biases are growing. We introduce\n{DiversityMedQA}, a novel benchmark designed to assess LLM responses to medical\nqueries across diverse patient demographics, such as gender and ethnicity. By\nperturbing questions from the MedQA dataset, which comprises medical board exam\nquestions, we created a benchmark that captures the nuanced differences in\nmedical diagnosis across varying patient profiles. Our findings reveal notable\ndiscrepancies in model performance when tested against these demographic\nvariations. Furthermore, to ensure the perturbations were accurate, we also\npropose a filtering strategy that validates each perturbation. By releasing\nDiversityMedQA, we provide a resource for evaluating and mitigating demographic\nbias in LLM medical diagnoses.\n","authors":["Rajat Rawat","Hudson McBride","Dhiyaan Nirmal","Rajarshi Ghosh","Jong Moon","Dhruv Alamuri","Sean O'Brien","Kevin Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.01497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01495v1","updated":"2024-09-02T23:28:15Z","published":"2024-09-02T23:28:15Z","title":"The Compressor-Retriever Architecture for Language Model OS","summary":" Recent advancements in large language models (LLMs) have significantly\nenhanced their capacity to aggregate and process information across multiple\nmodalities, enabling them to perform a wide range of tasks such as multimodal\ndata querying, tool usage, web interactions, and handling long documents. These\ncapabilities pave the way for transforming LLMs from mere chatbots into\ngeneral-purpose agents capable of interacting with the real world. This paper\nexplores the concept of using a language model as the core component of an\noperating system (OS), effectively acting as a CPU that processes data stored\nin a context window, which functions as RAM. A key challenge in realizing such\nan LM OS is managing the life-long context and ensuring statefulness across\nsessions, a feature limited by the current session-based interaction paradigm\ndue to context window size limit. To address this, we introduce\ncompressor-retriever, a model-agnostic architecture designed for life-long\ncontext management. Unlike other long-context solutions such as\nretrieval-augmented generation, our approach exclusively uses the base model's\nforward function to compress and retrieve context, ensuring end-to-end\ndifferentiability. 
Preliminary experiments demonstrate the effectiveness of\nthis architecture in in-context learning tasks, marking a step towards the\ndevelopment of a fully stateful LLM OS. Project repo available at:\nhttps://github.com/gblackout/LM-OS\n","authors":["Yuan Yang","Siheng Xiong","Ehsan Shareghi","Faramarz Fekri"],"pdf_url":"https://arxiv.org/pdf/2409.01495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01483v1","updated":"2024-09-02T22:35:03Z","published":"2024-09-02T22:35:03Z","title":"Revisiting SMoE Language Models by Evaluating Inefficiencies with Task\n Specific Expert Pruning","summary":" Sparse Mixture of Expert (SMoE) models have emerged as a scalable alternative\nto dense models in language modeling. These models use conditionally activated\nfeedforward subnetworks in transformer blocks, allowing for a separation\nbetween total model parameters and per-example computation. However, large\ntoken-routed SMoE models face a significant challenge: during inference, the\nentire model must be used for a sequence or a batch, resulting in high\nlatencies in a distributed setting that offsets the advantages of per-token\nsparse activation. Our research explores task-specific model pruning to inform\ndecisions about designing SMoE architectures, mainly modulating the choice of\nexpert counts in pretraining. We investigate whether such pruned models offer\nadvantages over smaller SMoE models trained from scratch, when evaluating and\ncomparing them individually on tasks. To that end, we introduce an adaptive\ntask-aware pruning technique UNCURL to reduce the number of experts per MoE\nlayer in an offline manner post-training. Our findings reveal a threshold\npruning factor for the reduction that depends on the number of experts used in\npretraining, above which, the reduction starts to degrade model performance.\nThese insights contribute to our understanding of model design choices when\npretraining with SMoE architectures, particularly useful when considering\ntask-specific inference optimization for later stages.\n","authors":["Soumajyoti Sarkar","Leonard Lausen","Volkan Cevher","Sheng Zha","Thomas Brox","George Karypis"],"pdf_url":"https://arxiv.org/pdf/2409.01483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01482v1","updated":"2024-09-02T22:17:18Z","published":"2024-09-02T22:17:18Z","title":"Masked Mixers for Language Generation and Retrieval","summary":" Attention mechanisms that confer selective focus on a strict subset of input\nelements are nearly ubiquitous in language models today. We posit there to be a\ndownside to the use of attention: most information present in the input is\nnecessarily lost. In support of this idea we observe poor input representation\naccuracy in transformers, but find more accurate representation in what we term\nmasked mixers which replace self-attention with masked convolutions. Applied to\nTinyStories the masked mixer learns causal language tasks more efficiently than\nearly transformer implementations and somewhat less efficiently than optimized,\ncurrent implementations. The most efficient learning algorithm observed for\nthis dataset is a transformer-masked mixer hybrid, suggesting that these models\nlearn in an orthogonal manner. We hypothesized that the information loss\nexhibited by transformers would be much more detrimental to retrieval than\ngeneration, and to test this we introduce an efficient training approach for\nretrieval models based on existing generative model embeddings. 
With this\nmethod, embeddings from masked mixers are found to result in far better\nsummary-to-story retrieval compared to embeddings from transformers.\n","authors":["Benjamin L. Badger"],"pdf_url":"https://arxiv.org/pdf/2409.01482v1.pdf","comment":"23 pages, 15 figures (11 primary, 4 supplementary)"},{"id":"http://arxiv.org/abs/2409.01466v1","updated":"2024-09-02T21:05:31Z","published":"2024-09-02T21:05:31Z","title":"PoliPrompt: A High-Performance Cost-Effective LLM-Based Text\n Classification Framework for Political Science","summary":" Recent advancements in large language models (LLMs) have opened new avenues\nfor enhancing text classification efficiency in political science, surpassing\ntraditional machine learning methods that often require extensive feature\nengineering, human labeling, and task-specific training. However, their\neffectiveness in achieving high classification accuracy remains questionable.\nThis paper introduces a three-stage in-context learning approach that leverages\nLLMs to improve classification accuracy while minimizing experimental costs.\nOur method incorporates automatic enhanced prompt generation, adaptive exemplar\nselection, and a consensus mechanism that resolves discrepancies between two\nweaker LLMs, refined by an advanced LLM. We validate our approach using\ndatasets from the BBC news reports, Kavanaugh Supreme Court confirmation, and\n2018 election campaign ads. The results show significant improvements in\nclassification F1 score (+0.36 for zero-shot classification) with manageable\neconomic costs (-78% compared with human labeling), demonstrating that our\nmethod effectively addresses the limitations of traditional machine learning\nwhile offering a scalable and reliable solution for text analysis in political\nscience.\n","authors":["Menglin Liu","Ge Shi"],"pdf_url":"https://arxiv.org/pdf/2409.01466v1.pdf","comment":"23 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.02141v1","updated":"2024-09-02T19:39:24Z","published":"2024-09-02T19:39:24Z","title":"Efficient and Scalable Estimation of Tool Representations in Vector\n Space","summary":" Recent advancements in function calling and tool use have significantly\nenhanced the capabilities of large language models (LLMs) by enabling them to\ninteract with external information sources and execute complex tasks. However,\nthe limited context window of LLMs presents challenges when a large number of\ntools are available, necessitating efficient methods to manage prompt length\nand maintain accuracy. Existing approaches, such as fine-tuning LLMs or\nleveraging their reasoning capabilities, either require frequent retraining or\nincur significant latency overhead. A more efficient solution involves training\nsmaller models to retrieve the most relevant tools for a given query, although\nthis requires high quality, domain-specific data. To address those challenges,\nwe present a novel framework for generating synthetic data for tool retrieval\napplications and an efficient data-driven tool retrieval strategy using small\nencoder models. Empowered by LLMs, we create ToolBank, a new tool retrieval\ndataset that reflects real human user usages. For tool retrieval methodologies,\nwe propose novel approaches: (1) Tool2Vec: usage-driven tool embedding\ngeneration for tool retrieval, (2) ToolRefiner: a staged retrieval method that\niteratively improves the quality of retrieved tools, and (3) MLC: framing tool\nretrieval as a multi-label classification problem. 
With these new methods, we\nachieve improvements of up to 27.28 in Recall@K on the ToolBench dataset and\n30.5 in Recall@K on ToolBank. Additionally, we present further experimental\nresults to rigorously validate our methods. Our code is available at\n\\url{https://github.com/SqueezeAILab/Tool2Vec}\n","authors":["Suhong Moon","Siddharth Jha","Lutfi Eren Erdogan","Sehoon Kim","Woosang Lim","Kurt Keutzer","Amir Gholami"],"pdf_url":"https://arxiv.org/pdf/2409.02141v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2207.12554v2","updated":"2024-09-02T22:49:21Z","published":"2022-07-25T22:17:19Z","title":"Inter-Frame Compression for Dynamic Point Cloud Geometry Coding","summary":" Efficient point cloud compression is essential for applications like virtual\nand mixed reality, autonomous driving, and cultural heritage. This paper\nproposes a deep learning-based inter-frame encoding scheme for dynamic point\ncloud geometry compression. We propose a lossy geometry compression scheme that\npredicts the latent representation of the current frame using the previous\nframe by employing a novel feature space inter-prediction network. The proposed\nnetwork utilizes sparse convolutions with hierarchical multiscale 3D feature\nlearning to encode the current frame using the previous frame. The proposed\nmethod introduces a novel predictor network for motion compensation in the\nfeature domain to map the latent representation of the previous frame to the\ncoordinates of the current frame to predict the current frame's feature\nembedding. The framework transmits the residual of the predicted features and\nthe actual features by compressing them using a learned probabilistic\nfactorized entropy model. At the receiver, the decoder hierarchically\nreconstructs the current frame by progressively rescaling the feature\nembedding. The proposed framework is compared to the state-of-the-art\nVideo-based Point Cloud Compression (V-PCC) and Geometry-based Point Cloud\nCompression (G-PCC) schemes standardized by the Moving Picture Experts Group\n(MPEG). The proposed method achieves more than 88% BD-Rate (Bjontegaard Delta\nRate) reduction against G-PCCv20 Octree, more than 56% BD-Rate savings against\nG-PCCv20 Trisoup, more than 62% BD-Rate reduction against V-PCC intra-frame\nencoding mode, and more than 52% BD-Rate savings against V-PCC P-frame-based\ninter-frame encoding mode using HEVC. These significant performance gains are\ncross-checked and verified in the MPEG working group.\n","authors":["Anique Akhtar","Zhu Li","Geert Van der Auwera"],"pdf_url":"https://arxiv.org/pdf/2207.12554v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.15920v2","updated":"2024-09-02T21:35:51Z","published":"2024-06-22T19:20:35Z","title":"SEDMamba: Enhancing Selective State Space Modelling with Bottleneck\n Mechanism and Fine-to-Coarse Temporal Fusion for Efficient Error Detection in\n Robot-Assisted Surgery","summary":" Automated detection of surgical errors can improve robotic-assisted surgery.\nDespite promising progress, existing methods still face challenges in capturing\nrich temporal context to establish long-term dependencies while maintaining\ncomputational efficiency. In this paper, we propose a novel hierarchical model\nnamed SEDMamba, which incorporates the selective state space model (SSM) into\nsurgical error detection, facilitating efficient long sequence modelling with\nlinear complexity. 
SEDMamba enhances selective SSM with a bottleneck mechanism\nand fine-to-coarse temporal fusion (FCTF) to detect and temporally localize\nsurgical errors in long videos. The bottleneck mechanism compresses and\nrestores features within their spatial dimension, thereby reducing\ncomputational complexity. FCTF utilizes multiple dilated 1D convolutional\nlayers to merge temporal information across diverse scale ranges, accommodating\nerrors of varying duration. Our work also contributes the first-of-its-kind,\nframe-level, in-vivo surgical error dataset to support error detection in real\nsurgical cases. Specifically, we deploy the clinically validated observational\nclinical human reliability assessment tool (OCHRA) to annotate the errors\nduring suturing tasks in an open-source radical prostatectomy dataset\n(SAR-RARP50). Experimental results demonstrate that our SEDMamba outperforms\nstate-of-the-art methods with at least 1.82% AUC and 3.80% AP performance gains\nwith significantly reduced computational complexity. The corresponding error\nannotations, code and models will be released at\nhttps://github.com/wzjialang/SEDMamba.\n","authors":["Jialang Xu","Nazir Sirajudeen","Matthew Boal","Nader Francis","Danail Stoyanov","Evangelos Mazomenos"],"pdf_url":"https://arxiv.org/pdf/2406.15920v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2403.17937v2","updated":"2024-09-02T20:58:43Z","published":"2024-03-26T17:59:58Z","title":"Efficient Video Object Segmentation via Modulated Cross-Attention Memory","summary":" Recently, transformer-based approaches have shown promising results for\nsemi-supervised video object segmentation. However, these approaches typically\nstruggle on long videos due to increased GPU memory demands, as they frequently\nexpand the memory bank every few frames. We propose a transformer-based\napproach, named MAVOS, that introduces an optimized and dynamic long-term\nmodulated cross-attention (MCA) memory to model temporal smoothness without\nrequiring frequent memory expansion. The proposed MCA effectively encodes both\nlocal and global features at various levels of granularity while efficiently\nmaintaining consistent speed regardless of the video length. Extensive\nexperiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017,\ndemonstrate the effectiveness of our proposed contributions leading to\nreal-time inference and markedly reduced memory demands without any degradation\nin segmentation accuracy on long videos. Compared to the best existing\ntransformer-based approach, our MAVOS increases the speed by 7.6x, while\nsignificantly reducing the GPU memory by 87% with comparable segmentation\nperformance on short and long video datasets. Notably on the LVOS dataset, our\nMAVOS achieves a J&F score of 63.3% while operating at 37 frames per second\n(FPS) on a single V100 GPU. 
Our code and models will be publicly available at:\nhttps://github.com/Amshaker/MAVOS.\n","authors":["Abdelrahman Shaker","Syed Talal Wasim","Martin Danelljan","Salman Khan","Ming-Hsuan Yang","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2403.17937v2.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2408.17095v2","updated":"2024-09-02T20:33:49Z","published":"2024-08-30T08:26:55Z","title":"RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation\n and Retrieval-Guidance","summary":" Diffusion-based models demonstrate impressive generation capabilities.\nHowever, they also have a massive number of parameters, resulting in enormous\nmodel sizes, thus making them unsuitable for deployment on resource-constraint\ndevices. Block-wise generation can be a promising alternative for designing\ncompact-sized (parameter-efficient) deep generative models since the model can\ngenerate one block at a time instead of generating the whole image at once.\nHowever, block-wise generation is also considerably challenging because\nensuring coherence across generated blocks can be non-trivial. To this end, we\ndesign a retrieval-augmented generation (RAG) approach and leverage the\ncorresponding blocks of the images retrieved by the RAG module to condition the\ntraining and generation stages of a block-wise denoising diffusion model. Our\nconditioning schemes ensure coherence across the different blocks during\ntraining and, consequently, during generation. While we showcase our approach\nusing the latent diffusion model (LDM) as the base model, it can be used with\nother variants of denoising diffusion models. We validate the solution of the\ncoherence problem through the proposed approach by reporting substantive\nexperiments to demonstrate our approach's effectiveness in compact model size\nand excellent generation quality.\n","authors":["Avideep Mukherjee","Soumya Banerjee","Piyush Rai","Vinay P. Namboodiri"],"pdf_url":"https://arxiv.org/pdf/2408.17095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05303v4","updated":"2024-09-02T19:49:06Z","published":"2023-08-10T02:47:36Z","title":"Multi-Visual-Inertial System: Analysis, Calibration and Estimation","summary":" In this paper, we study state estimation of multi-visual-inertial systems\n(MVIS) and develop sensor fusion algorithms to optimally fuse an arbitrary\nnumber of asynchronous inertial measurement units (IMUs) or gyroscopes and\nglobal and(or) rolling shutter cameras. We are especially interested in the\nfull calibration of the associated visual-inertial sensors, including the IMU\nor camera intrinsics and the IMU-IMU(or camera) spatiotemporal extrinsics as\nwell as the image readout time of rolling-shutter cameras (if used). To this\nend, we develop a new analytic combined IMU integration with intrinsics-termed\nACI3-to preintegrate IMU measurements, which is leveraged to fuse auxiliary\nIMUs and(or) gyroscopes alongside a base IMU. We model the multi-inertial\nmeasurements to include all the necessary inertial intrinsic and IMU-IMU\nspatiotemporal extrinsic parameters, while leveraging IMU-IMU rigid-body\nconstraints to eliminate the necessity of auxiliary inertial poses and thus\nreducing computational complexity. By performing observability analysis of\nMVIS, we prove that the standard four unobservable directions remain - no\nmatter how many inertial sensors are used, and also identify, for the first\ntime, degenerate motions for IMU-IMU spatiotemporal extrinsics and auxiliary\ninertial intrinsics. 
In addition to the extensive simulations that validate our\nanalysis and algorithms, we have built our own MVIS sensor rig and collected\nover 25 real-world datasets to experimentally verify the proposed calibration\nagainst the state-of-the-art calibration method such as Kalibr. We show that\nthe proposed MVIS calibration is able to achieve competing accuracy with\nimproved convergence and repeatability, which is open sourced to better benefit\nthe community.\n","authors":["Yulin Yang","Patrick Geneva","Guoquan Huang"],"pdf_url":"https://arxiv.org/pdf/2308.05303v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08486v2","updated":"2024-09-02T19:04:57Z","published":"2024-06-12T17:59:42Z","title":"On Evaluating Adversarial Robustness of Volumetric Medical Segmentation\n Models","summary":" Volumetric medical segmentation models have achieved significant success on\norgan and tumor-based segmentation tasks in recent years. However, their\nvulnerability to adversarial attacks remains largely unexplored, raising\nserious concerns regarding the real-world deployment of tools employing such\nmodels in the healthcare sector. This underscores the importance of\ninvestigating the robustness of existing models. In this context, our work aims\nto empirically examine the adversarial robustness across current volumetric\nsegmentation architectures, encompassing Convolutional, Transformer, and\nMamba-based models. We extend this investigation across four volumetric\nsegmentation datasets, evaluating robustness under both white box and black box\nadversarial attacks. Overall, we observe that while both pixel and\nfrequency-based attacks perform reasonably well under \\emph{white box} setting,\nthe latter performs significantly better under transfer-based black box\nattacks. Across our experiments, we observe transformer-based models show\nhigher robustness than convolution-based models with Mamba-based models being\nthe most vulnerable. Additionally, we show that large-scale training of\nvolumetric segmentation models improves the model's robustness against\nadversarial attacks. The code and robust models are available at\nhttps://github.com/HashmatShadab/Robustness-of-Volumetric-Medical-Segmentation-Models.\n","authors":["Hashmat Shadab Malik","Numan Saeed","Asif Hanif","Muzammal Naseer","Mohammad Yaqub","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2406.08486v2.pdf","comment":"Accepted at British Machine Vision Conference 2024"},{"id":"http://arxiv.org/abs/2408.13423v3","updated":"2024-09-02T18:02:03Z","published":"2024-08-24T01:33:28Z","title":"Training-free Long Video Generation with Chain of Diffusion Model\n Experts","summary":" Video generation models hold substantial potential in areas such as\nfilmmaking. However, current video diffusion models need high computational\ncosts and produce suboptimal results due to high complexity of video generation\ntask. In this paper, we propose \\textbf{ConFiner}, an efficient high-quality\nvideo generation framework that decouples video generation into easier\nsubtasks: structure \\textbf{con}trol and spatial-temporal re\\textbf{fine}ment.\nIt can generate high-quality videos with chain of off-the-shelf diffusion model\nexperts, each expert responsible for a decoupled subtask. During the\nrefinement, we introduce coordinated denoising, which can merge multiple\ndiffusion experts' capabilities into a single sampling. 
Furthermore, we design\nConFiner-Long framework, which can generate long coherent video with three\nconstraint strategies on ConFiner. Experimental results indicate that with only\n10\\% of the inference cost, our ConFiner surpasses representative models like\nLavie and Modelscope across all objective and subjective metrics. And\nConFiner-Long can generate high-quality and coherent videos with up to 600\nframes.\n","authors":["Wenhao Li","Yichao Cao","Xiu Su","Xi Lin","Shan You","Mingkai Zheng","Yi Chen","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2408.13423v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17346v2","updated":"2024-09-02T17:30:45Z","published":"2024-03-26T03:10:45Z","title":"TRAM: Global Trajectory and Motion of 3D Humans from in-the-wild Videos","summary":" We propose TRAM, a two-stage method to reconstruct a human's global\ntrajectory and motion from in-the-wild videos. TRAM robustifies SLAM to recover\nthe camera motion in the presence of dynamic humans and uses the scene\nbackground to derive the motion scale. Using the recovered camera as a\nmetric-scale reference frame, we introduce a video transformer model (VIMO) to\nregress the kinematic body motion of a human. By composing the two motions, we\nachieve accurate recovery of 3D humans in the world space, reducing global\nmotion errors by a large margin from prior work.\nhttps://yufu-wang.github.io/tram4d/\n","authors":["Yufu Wang","Ziyun Wang","Lingjie Liu","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2403.17346v2.pdf","comment":"The project website: https://yufu-wang.github.io/tram4d/"},{"id":"http://arxiv.org/abs/2312.10108v2","updated":"2024-09-02T17:00:21Z","published":"2023-12-15T06:30:55Z","title":"Privacy-Aware Document Visual Question Answering","summary":" Document Visual Question Answering (DocVQA) has quickly grown into a central\ntask of document understanding. But despite the fact that documents contain\nsensitive or copyrighted information, none of the current DocVQA methods offers\nstrong privacy guarantees. In this work, we explore privacy in the domain of\nDocVQA for the first time, highlighting privacy issues in state of the art\nmulti-modal LLM models used for DocVQA, and explore possible solutions.\nSpecifically, we focus on invoice processing as a realistic document\nunderstanding scenario, and propose a large scale DocVQA dataset comprising\ninvoice documents and associated questions and answers. We employ a federated\nlearning scheme, that reflects the real-life distribution of documents in\ndifferent businesses, and we explore the use case where the data of the invoice\nprovider is the sensitive information to be protected. We demonstrate that\nnon-private models tend to memorise, a behaviour that can lead to exposing\nprivate information. We then evaluate baseline training schemes employing\nfederated learning and differential privacy in this multi-modal scenario, where\nthe sensitive information might be exposed through either or both of the two\ninput modalities: vision (document image) or language (OCR tokens). 
Finally, we\ndesign attacks exploiting the memorisation effect of the model, and demonstrate\ntheir effectiveness in probing a representative DocVQA models.\n","authors":["Rubèn Tito","Khanh Nguyen","Marlon Tobaben","Raouf Kerkouche","Mohamed Ali Souibgui","Kangsoo Jung","Joonas Jälkö","Vincent Poulain D'Andecy","Aurelie Joseph","Lei Kang","Ernest Valveny","Antti Honkela","Mario Fritz","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2312.10108v2.pdf","comment":"35 pages, 12 figures, accepted for publication at the 18th\n International Conference on Document Analysis and Recognition, ICDAR 2024"},{"id":"http://arxiv.org/abs/2408.16154v2","updated":"2024-09-02T16:58:16Z","published":"2024-08-28T22:14:44Z","title":"Does Data-Efficient Generalization Exacerbate Bias in Foundation Models?","summary":" Foundation models have emerged as robust models with label efficiency in\ndiverse domains. In medical imaging, these models contribute to the advancement\nof medical diagnoses due to the difficulty in obtaining labeled data. However,\nit is unclear whether using a large amount of unlabeled data, biased by the\npresence of sensitive attributes during pre-training, influences the fairness\nof the model. This research examines the bias in the Foundation model\n(RetFound) when it is applied to fine-tune the Brazilian Multilabel\nOphthalmological Dataset (BRSET), which has a different population than the\npre-training dataset. The model evaluation, in comparison with supervised\nlearning, shows that the Foundation Model has the potential to reduce the gap\nbetween the maximum AUC and minimum AUC evaluations across gender and age\ngroups. However, in a data-efficient generalization, the model increases the\nbias when the data amount decreases. These findings suggest that when deploying\na Foundation Model in real-life scenarios with limited data, the possibility of\nfairness issues should be considered.\n","authors":["Dilermando Queiroz","Anderson Carlos","Maíra Fatoretto","Luis Filipe Nakayama","André Anjos","Lilian Berton"],"pdf_url":"https://arxiv.org/pdf/2408.16154v2.pdf","comment":"Preprint of paper to be presented at Fairness and Ethics Towards\n Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during\n ECCV 2024"},{"id":"http://arxiv.org/abs/2408.17347v2","updated":"2024-09-02T16:08:32Z","published":"2024-08-30T15:22:13Z","title":"LSMS: Language-guided Scale-aware MedSegmentor for Medical Image\n Referring Segmentation","summary":" Conventional medical image segmentation methods have been found inadequate in\nfacilitating physicians with the identification of specific lesions for\ndiagnosis and treatment. Given the utility of text as an instructional format,\nwe introduce a novel task termed Medical Image Referring Segmentation (MIRS),\nwhich requires segmenting specified lesions in images based on the given\nlanguage expressions. Due to the varying object scales in medical images, MIRS\ndemands robust vision-language modeling and comprehensive multi-scale\ninteraction for precise localization and segmentation under linguistic\nguidance. However, existing medical image segmentation methods fall short in\nmeeting these demands, resulting in insufficient segmentation accuracy. 
In\nresponse, we propose an approach named Language-guided Scale-aware MedSegmentor\n(LSMS), incorporating two appealing designs: (1)~a Scale-aware Vision-Language\nAttention module that leverages diverse convolutional kernels to acquire rich\nvisual knowledge and interact closely with linguistic features, thereby\nenhancing lesion localization capability; (2)~a Full-Scale Decoder that\nglobally models multi-modal features across various scales, capturing\ncomplementary information between scales to accurately outline lesion\nboundaries. Addressing the lack of suitable datasets for MIRS, we constructed a\nvision-language medical dataset called Reference Hepatic Lesion Segmentation\n(RefHL-Seg). This dataset comprises 2,283 abdominal CT slices from 231 cases,\nwith corresponding textual annotations and segmentation masks for various liver\nlesions in images. We validated the performance of LSMS for MIRS and\nconventional medical image segmentation tasks across various datasets. Our LSMS\nconsistently outperforms on all datasets with lower computational costs. The\ncode and datasets will be released.\n","authors":["Shuyi Ouyang","Jinyang Zhang","Xiangye Lin","Xilai Wang","Qingqing Chen","Yen-Wei Chen","Lanfen Lin"],"pdf_url":"https://arxiv.org/pdf/2408.17347v2.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.05873v7","updated":"2024-09-02T15:46:13Z","published":"2023-10-09T17:13:10Z","title":"Implicit Concept Removal of Diffusion Models","summary":" Text-to-image (T2I) diffusion models often inadvertently generate unwanted\nconcepts such as watermarks and unsafe images. These concepts, termed as the\n\"implicit concepts\", could be unintentionally learned during training and then\nbe generated uncontrollably during inference. Existing removal methods still\nstruggle to eliminate implicit concepts primarily due to their dependency on\nthe model's ability to recognize concepts it actually can not discern. To\naddress this, we utilize the intrinsic geometric characteristics of implicit\nconcepts and present the Geom-Erasing, a novel concept removal method based on\nthe geometric-driven control. Specifically, once an unwanted implicit concept\nis identified, we integrate the existence and geometric information of the\nconcept into the text prompts with the help of an accessible classifier or\ndetector model. Subsequently, the model is optimized to identify and\ndisentangle this information, which is then adopted as negative prompts during\ngeneration. Moreover, we introduce the Implicit Concept Dataset (ICD), a novel\nimage-text dataset imbued with three typical implicit concepts (i.e., QR codes,\nwatermarks, and text), reflecting real-life situations where implicit concepts\nare easily injected. Geom-Erasing effectively mitigates the generation of\nimplicit concepts, achieving the state-of-the-art results on the Inappropriate\nImage Prompts (I2P) and our challenging Implicit Concept Dataset (ICD)\nbenchmarks.\n","authors":["Zhili Liu","Kai Chen","Yifan Zhang","Jianhua Han","Lanqing Hong","Hang Xu","Zhenguo Li","Dit-Yan Yeung","James Kwok"],"pdf_url":"https://arxiv.org/pdf/2310.05873v7.pdf","comment":"Accepted by ECCV2024. 
Project Page:\n https://kaichen1998.github.io/projects/geom-erasing/"},{"id":"http://arxiv.org/abs/2405.00145v2","updated":"2024-09-02T14:24:55Z","published":"2024-04-30T18:42:18Z","title":"GUing: A Mobile GUI Search Engine using a Vision-Language Model","summary":" App developers use the Graphical User Interface (GUI) of other apps as a\nsource of inspiration for designing and improving their own apps. Recent\nresearch has thus suggested retrieving relevant GUI designs that match a\ncertain text query from screenshot datasets acquired through crowdsourced or\nautomated exploration of GUIs. However, such text-to-GUI retrieval approaches\nonly leverage the textual information of the GUI elements, neglecting visual\ninformation such as icons or background images. In addition, retrieved\nscreenshots are not steered by app developers and often lack important app\nfeatures that require particular input data.\n To overcome these limitations, this paper proposes GUing, a GUI search engine\nbased on a vision-language model called GUIClip, which we trained specifically\nfor the problem of designing app GUIs. For this, we first collected from Google\nPlay app introduction images which usually display the most representative\nscreenshots and are often captioned (i.e.~labeled) by app vendors. Then, we\ndeveloped an automated pipeline to classify, crop, and extract the captions\nfrom these images. This resulted in a large dataset which we share with this\npaper: including 303k app screenshots, out of which 135k have captions. We used\nthis dataset to train a novel vision-language model, which is, to the best of\nour knowledge, the first of its kind in GUI retrieval. We evaluated our\napproach on various datasets from related work and in manual experiment. The\nresults demonstrate that our model outperforms previous approaches in\ntext-to-GUI retrieval achieving a Recall@10 of up to 0.69 and a HIT@10 of 0.91.\nWe also explored the performance of GUIClip for other GUI tasks including GUI\nclassification and sketch-to-GUI retrieval with encouraging results.\n","authors":["Jialiang Wei","Anne-Lise Courbis","Thomas Lambolais","Binbin Xu","Pierre Louis Bernard","Gérard Dray","Walid Maalej"],"pdf_url":"https://arxiv.org/pdf/2405.00145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.15365v4","updated":"2024-09-02T14:07:08Z","published":"2024-01-27T09:54:16Z","title":"An open dataset for oracle bone script recognition and decipherment","summary":" Oracle bone script, one of the earliest known forms of ancient Chinese\nwriting, presents invaluable research materials for scholars studying the\nhumanities and geography of the Shang Dynasty, dating back 3,000 years. The\nimmense historical and cultural significance of these writings cannot be\noverstated. However, the passage of time has obscured much of their meaning,\npresenting a significant challenge in deciphering these ancient texts. With the\nadvent of Artificial Intelligence (AI), employing AI to assist in deciphering\nOracle Bone Characters (OBCs) has become a feasible option. Yet, progress in\nthis area has been hindered by a lack of high-quality datasets. To address this\nissue, this paper details the creation of the HUST-OBC dataset. This dataset\nencompasses 77,064 images of 1,588 individual deciphered characters and 62,989\nimages of 9,411 undeciphered characters, with a total of 140,053 images,\ncompiled from diverse sources. The hope is that this dataset could inspire and\nassist future research in deciphering those unknown OBCs. 
All the codes and\ndatasets are available at https://github.com/Yuliang-Liu/Open-Oracle.\n","authors":["Pengjie Wang","Kaile Zhang","Xinyu Wang","Shengwei Han","Yongge Liu","Jinpeng Wan","Haisu Guan","Zhebin Kuang","Lianwen Jin","Xiang Bai","Yuliang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.15365v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05867v2","updated":"2024-09-02T13:39:30Z","published":"2024-08-11T21:59:34Z","title":"SABER-6D: Shape Representation Based Implicit Object Pose Estimation","summary":" In this paper, we propose a novel encoder-decoder architecture, named SABER,\nto learn the 6D pose of the object in the embedding space by learning shape\nrepresentation at a given pose. This model enables us to learn pose by\nperforming shape representation at a target pose from RGB image input. We\nperform shape representation as an auxiliary task which helps us in learning\nrotations space for an object based on 2D images. An image encoder predicts the\nrotation in the embedding space and the DeepSDF based decoder learns to\nrepresent the object's shape at the given pose. As our approach is shape based,\nthe pipeline is suitable for any type of object irrespective of the symmetry.\nMoreover, we need only a CAD model of the objects to train SABER. Our pipeline\nis synthetic data based and can also handle symmetric objects without symmetry\nlabels and, thus, no additional labeled training data is needed. The\nexperimental evaluation shows that our method achieves close to benchmark\nresults for both symmetric objects and asymmetric objects on Occlusion-LineMOD,\nand T-LESS datasets.\n","authors":["Shishir Reddy Vutukur","Mengkejiergeli Ba","Benjamin Busam","Matthias Kayser","Gurprit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.05867v2.pdf","comment":"ECCV 2024 R6D workshop"},{"id":"http://arxiv.org/abs/2408.16481v2","updated":"2024-09-02T13:12:23Z","published":"2024-08-29T12:16:55Z","title":"A Deep-Learning-Based Label-free No-Reference Image Quality Assessment\n Metric: Application in Sodium MRI Denoising","summary":" New multinuclear MRI techniques, such as sodium MRI, generally suffer from\nlow image quality due to an inherently low signal. Postprocessing methods, such\nas image denoising, have been developed for image enhancement. However, the\nassessment of these enhanced images is challenging especially considering when\nthere is a lack of high resolution and high signal images as reference, such as\nin sodium MRI. No-reference Image Quality Assessment (NR-IQA) metrics are\napproaches to solve this problem. Existing learning-based NR-IQA metrics rely\non labels derived from subjective human opinions or metrics like\nSignal-to-Noise Ratio (SNR), which are either time-consuming or lack accurate\nground truths, resulting in unreliable assessment. We note that deep learning\n(DL) models have a unique characteristic in that they are specialized to a\ncharacteristic training set, meaning that deviations between the input testing\ndata from the training data will reduce prediction accuracy. Therefore, we\npropose a novel DL-based NR-IQA metric, the Model Specialization Metric (MSM),\nwhich does not depend on ground-truth images or labels. MSM measures the\ndifference between the input image and the model's prediction for evaluating\nthe quality of the input image. 
Experiments conducted on both simulated\ndistorted proton T1-weighted MR images and denoised sodium MR images\ndemonstrate that MSM exhibits a superior evaluation performance on various\nsimulated noises and distortions. MSM also has a substantial agreement with the\nexpert evaluations, achieving an averaged Cohen's Kappa coefficient of 0.6528,\noutperforming the existing NR-IQA metrics.\n","authors":["Shuaiyu Yuan","Tristan Whitmarsh","Dimitri A Kessler","Otso Arponen","Mary A McLean","Gabrielle Baxter","Frank Riemer","Aneurin J Kennerley","William J Brackenbury","Fiona J Gilbert","Joshua D Kaggie"],"pdf_url":"https://arxiv.org/pdf/2408.16481v2.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2404.14966v2","updated":"2024-09-02T12:55:04Z","published":"2024-04-23T12:20:27Z","title":"Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State\n Space Model","summary":" Existing Transformer-based models for point cloud analysis suffer from\nquadratic complexity, leading to compromised point cloud resolution and\ninformation loss. In contrast, the newly proposed Mamba model, based on state\nspace models (SSM), outperforms Transformer in multiple areas with only linear\ncomplexity. However, the straightforward adoption of Mamba does not achieve\nsatisfactory performance on point cloud tasks. In this work, we present\nMamba3D, a state space model tailored for point cloud learning to enhance local\nfeature extraction, achieving superior performance, high efficiency, and\nscalability potential. Specifically, we propose a simple yet effective Local\nNorm Pooling (LNP) block to extract local geometric features. Additionally, to\nobtain better global features, we introduce a bidirectional SSM (bi-SSM) with\nboth a token forward SSM and a novel backward SSM that operates on the feature\nchannel. Extensive experimental results show that Mamba3D surpasses\nTransformer-based counterparts and concurrent works in multiple tasks, with or\nwithout pre-training. Notably, Mamba3D achieves multiple SoTA, including an\noverall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1%\n(with single-modal pre-training) on the ModelNet40 classification task, with\nonly linear complexity. Our code and weights are available at\nhttps://github.com/xhanxu/Mamba3D.\n","authors":["Xu Han","Yuan Tang","Zhaoxuan Wang","Xianzhi Li"],"pdf_url":"https://arxiv.org/pdf/2404.14966v2.pdf","comment":"ACM MM 2024. Code and weights are available at\n https://github.com/xhanxu/Mamba3D"},{"id":"http://arxiv.org/abs/2408.16845v2","updated":"2024-09-02T10:33:48Z","published":"2024-08-29T18:21:50Z","title":"Enabling Local Editing in Diffusion Models by Joint and Individual\n Component Analysis","summary":" Recent advances in Diffusion Models (DMs) have led to significant progress in\nvisual synthesis and editing tasks, establishing them as a strong competitor to\nGenerative Adversarial Networks (GANs). However, the latent space of DMs is not\nas well understood as that of GANs. Recent research has focused on unsupervised\nsemantic discovery in the latent space of DMs by leveraging the bottleneck\nlayer of the denoising network, which has been shown to exhibit properties of a\nsemantic latent space. However, these approaches are limited to discovering\nglobal attributes. In this paper we address, the challenge of local image\nmanipulation in DMs and introduce an unsupervised method to factorize the\nlatent semantics learned by the denoising network of pre-trained DMs. 
Given an\narbitrary image and defined regions of interest, we utilize the Jacobian of the\ndenoising network to establish a relation between the regions of interest and\ntheir corresponding subspaces in the latent space. Furthermore, we disentangle\nthe joint and individual components of these subspaces to identify latent\ndirections that enable local image manipulation. Once discovered, these\ndirections can be applied to different images to produce semantically\nconsistent edits, making our method suitable for practical applications.\nExperimental results on various datasets demonstrate that our method can\nproduce semantic edits that are more localized and have better fidelity\ncompared to the state-of-the-art.\n","authors":["Theodoros Kouzelis","Manos Plitsis","Mihalis A. Nicolaou","Yannis Panagakis"],"pdf_url":"https://arxiv.org/pdf/2408.16845v2.pdf","comment":"Accepted at BMVC2024"},{"id":"http://arxiv.org/abs/2302.03531v2","updated":"2024-09-02T10:24:46Z","published":"2023-02-07T15:23:52Z","title":"Structured Generative Models for Scene Understanding","summary":" This position paper argues for the use of \\emph{structured generative models}\n(SGMs) for the understanding of static scenes. This requires the reconstruction\nof a 3D scene from an input image (or a set of multi-view images), whereby the\ncontents of the image(s) are causally explained in terms of models of\ninstantiated objects, each with their own type, shape, appearance and pose,\nalong with global variables like scene lighting and camera parameters. This\napproach also requires scene models which account for the co-occurrences and\ninter-relationships of objects in a scene. The SGM approach has the merits that\nit is compositional and generative, which lead to interpretability and\neditability. \\\\\\\\ To pursue the SGM agenda, we need models for objects and\nscenes, and approaches to carry out inference. We first review models for\nobjects, which include ``things'' (object categories that have a well defined\nshape), and ``stuff'' (categories which have amorphous spatial extent). We then\nmove on to review \\emph{scene models} which describe the inter-relationships of\nobjects. Perhaps the most challenging problem for SGMs is \\emph{inference} of\nthe objects, lighting and camera parameters, and scene inter-relationships from\ninput consisting of a single or multiple images. We conclude with a discussion\nof issues that need addressing to advance the SGM agenda.\n","authors":["Christopher K. I. Williams"],"pdf_url":"https://arxiv.org/pdf/2302.03531v2.pdf","comment":"32 pages, 10 figures"},{"id":"http://arxiv.org/abs/2405.06198v2","updated":"2024-09-02T09:28:09Z","published":"2024-05-10T02:26:35Z","title":"MAPL: Memory Augmentation and Pseudo-Labeling for Semi-Supervised\n Anomaly Detection","summary":" Large unlabeled data and difficult-to-identify anomalies are urgent\nissues that need to be overcome in most industrial scenes. In order to address this\nissue, a new methodology for detecting surface defects in industrial settings\nis introduced, referred to as Memory Augmentation and Pseudo-Labeling (MAPL).\nThe methodology first introduces an anomaly simulation strategy, which\nsignificantly improves the model's ability to recognize rare or unknown\nanomaly types by generating simulated anomaly samples. 
To cope with the\nproblem of the lack of labeling of anomalous simulated samples, a\npseudo-labeler method based on a one-classifier ensemble was employed in this\nstudy, which enhances the robustness of the model in the case of limited\nlabeling data by automatically selecting key pseudo-labeling hyperparameters.\nMeanwhile, a memory-enhanced learning mechanism is introduced to effectively\npredict abnormal regions by analyzing the difference between the input samples\nand the normal samples in the memory pool. An end-to-end learning framework is\nemployed by MAPL to identify the abnormal regions directly from the input data,\nwhich optimizes the efficiency and real-time performance of detection. By\nconducting extensive trials on the recently developed BHAD dataset (including\nMVTec AD [1], Visa [2], and MDPP [3]), MAPL achieves an average image-level\nAUROC score of 86.2%, demonstrating a 5.1% enhancement compared to the\noriginal MemSeg [4] model. The source code is available at\nhttps://github.com/jzc777/MAPL.\n","authors":["Junzhuo Chen"],"pdf_url":"https://arxiv.org/pdf/2405.06198v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11593v2","updated":"2024-09-02T09:04:51Z","published":"2024-08-21T12:59:42Z","title":"MCDubber: Multimodal Context-Aware Expressive Video Dubbing","summary":" Automatic Video Dubbing (AVD) aims to take the given script and generate\nspeech that aligns with lip motion and prosody expressiveness. Current AVD\nmodels mainly utilize visual information of the current sentence to enhance the\nprosody of synthesized speech. However, it is crucial to consider whether the\nprosody of the generated dubbing aligns with the multimodal context, as the\ndubbing will be combined with the original context in the final video. This\naspect has been overlooked in previous studies. To address this issue, we\npropose a Multimodal Context-aware video Dubbing model, termed\n\\textbf{MCDubber}, to convert the modeling object from a single sentence to a\nlonger sequence with context information to ensure the consistency of the\nglobal context prosody. MCDubber comprises three main components: (1) A context\nduration aligner aims to learn the context-aware alignment between the text and\nlip frames; (2) A context prosody predictor seeks to read the global context\nvisual sequence and predict the context-aware global energy and pitch; (3) A\ncontext acoustic decoder ultimately predicts the global context mel-spectrogram\nwith the assistance of adjacent ground-truth mel-spectrograms of the target\nsentence. Through this process, MCDubber fully considers the influence of\nmultimodal context on the prosody expressiveness of the current sentence when\ndubbing. The extracted mel-spectrogram belonging to the target sentence from\nthe output context mel-spectrograms is the final required dubbing audio.\nExtensive experiments on the Chem benchmark dataset demonstrate that our\nMCDubber significantly improves dubbing expressiveness compared to all advanced\nbaselines. 
The code and demos are available at\nhttps://github.com/XiaoYuanJun-zy/MCDubber.\n","authors":["Yuan Zhao","Zhenqi Jia","Rui Liu","De Hu","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11593v2.pdf","comment":"Accepted by NCMMSC2024"},{"id":"http://arxiv.org/abs/2406.01154v3","updated":"2024-09-02T08:52:19Z","published":"2024-06-03T09:49:54Z","title":"UniUSNet: A Promptable Framework for Universal Ultrasound Disease\n Prediction and Tissue Segmentation","summary":" Ultrasound is widely used in clinical practice due to its affordability,\nportability, and safety. However, current AI research often overlooks combined\ndisease prediction and tissue segmentation. We propose UniUSNet, a universal\nframework for ultrasound image classification and segmentation. This model\nhandles various ultrasound types, anatomical positions, and input formats,\nexcelling in both segmentation and classification tasks. Trained on a\ncomprehensive dataset with over 9.7K annotations from 7 distinct anatomical\npositions, our model matches state-of-the-art performance and surpasses\nsingle-dataset and ablated models. Zero-shot and fine-tuning experiments show\nstrong generalization and adaptability with minimal fine-tuning. We plan to\nexpand our dataset and refine the prompting mechanism, with model weights and\ncode available at (https://github.com/Zehui-Lin/UniUSNet).\n","authors":["Zehui Lin","Zhuoneng Zhang","Xindi Hu","Zhifan Gao","Xin Yang","Yue Sun","Dong Ni","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2406.01154v3.pdf","comment":"Accepted to BIBM 2024"},{"id":"http://arxiv.org/abs/2408.04914v2","updated":"2024-09-02T08:45:47Z","published":"2024-08-09T07:46:01Z","title":"GuidedNet: Semi-Supervised Multi-Organ Segmentation via Labeled Data\n Guide Unlabeled Data","summary":" Semi-supervised multi-organ medical image segmentation aids physicians in\nimproving disease diagnosis and treatment planning and reduces the time and\neffort required for organ annotation. Existing state-of-the-art methods train\nthe labeled data with ground truths and train the unlabeled data with\npseudo-labels. However, the two training flows are separate, which does not\nreflect the interrelationship between labeled and unlabeled data. To address\nthis issue, we propose a semi-supervised multi-organ segmentation method called\nGuidedNet, which leverages the knowledge from labeled data to guide the\ntraining of unlabeled data. The primary goals of this study are to improve the\nquality of pseudo-labels for unlabeled data and to enhance the network's\nlearning capability for both small and complex organs. A key concept is that\nvoxel features from labeled and unlabeled data that are close to each other in\nthe feature space are more likely to belong to the same class. On this basis, a\n3D Consistent Gaussian Mixture Model (3D-CGMM) is designed to leverage the\nfeature distributions from labeled data to rectify the generated\npseudo-labels. Furthermore, we introduce a Knowledge Transfer Cross Pseudo\nSupervision (KT-CPS) strategy, which leverages the prior knowledge obtained\nfrom the labeled data to guide the training of the unlabeled data, thereby\nimproving the segmentation accuracy for both small and complex organs.\nExtensive experiments on two public datasets, FLARE22 and AMOS, demonstrated\nthat GuidedNet is capable of achieving state-of-the-art performance. 
The source\ncode with our proposed model are available at\nhttps://github.com/kimjisoo12/GuidedNet.\n","authors":["Haochen Zhao","Hui Meng","Deqian Yang","Xiaozheng Xie","Xiaoze Wu","Qingfeng Li","Jianwei Niu"],"pdf_url":"https://arxiv.org/pdf/2408.04914v2.pdf","comment":"Accepted by ACM MM2024, 10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.14491v2","updated":"2024-09-02T08:40:11Z","published":"2024-07-19T17:44:33Z","title":"PD-APE: A Parallel Decoding Framework with Adaptive Position Encoding\n for 3D Visual Grounding","summary":" 3D visual grounding aims to identify objects in 3D point cloud scenes that\nmatch specific natural language descriptions. This requires the model to not\nonly focus on the target object itself but also to consider the surrounding\nenvironment to determine whether the descriptions are met. Most previous works\nattempt to accomplish both tasks within the same module, which can easily lead\nto a distraction of attention. To this end, we propose PD-APE, a dual-branch\ndecoding framework that separately decodes target object attributes and\nsurrounding layouts. Specifically, in the target object branch, the decoder\nprocesses text tokens that describe features of the target object (e.g.,\ncategory and color), guiding the queries to pay attention to the target object\nitself. In the surrounding branch, the queries align with other text tokens\nthat carry surrounding environment information, making the attention maps\naccurately capture the layout described in the text. Benefiting from the\nproposed dual-branch design, the queries are allowed to focus on points\nrelevant to each branch's specific objective. Moreover, we design an adaptive\nposition encoding method for each branch respectively. In the target object\nbranch, the position encoding relies on the relative positions between seed\npoints and predicted 3D boxes. In the surrounding branch, the attention map is\nadditionally guided by the confidence between visual and text features,\nenabling the queries to focus on points that have valuable layout information.\nExtensive experiments demonstrate that we surpass the state-of-the-art on two\nwidely adopted 3D visual grounding datasets, ScanRefer and Nr3D.\n","authors":["Chenshu Hou","Liang Peng","Xiaopei Wu","Xiaofei He","Wenxiao Wang"],"pdf_url":"https://arxiv.org/pdf/2407.14491v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11092v3","updated":"2024-09-02T08:37:29Z","published":"2023-10-17T09:21:29Z","title":"DORec: Decomposed Object Reconstruction and Segmentation Utilizing 2D\n Self-Supervised Features","summary":" Recovering 3D geometry and textures of individual objects is crucial for many\nrobotics applications, such as manipulation, pose estimation, and autonomous\ndriving. However, decomposing a target object from a complex background is\nchallenging. Most existing approaches rely on costly manual labels to acquire\nobject instance perception. Recent advancements in 2D self-supervised learning\noffer new prospects for identifying objects of interest, yet leveraging such\nnoisy 2D features for clean decomposition remains difficult. In this paper, we\npropose a Decomposed Object Reconstruction (DORec) network based on neural\nimplicit representations. Our key idea is to use 2D self-supervised features to\ncreate two levels of masks for supervision: a binary mask for foreground\nregions and a K-cluster mask for semantically similar regions. These\ncomplementary masks result in robust decomposition. 
Experimental results on\ndifferent datasets show DORec's superiority in segmenting and reconstructing\ndiverse foreground objects from varied backgrounds enabling downstream tasks\nsuch as pose estimation.\n","authors":["Jun Wu","Sicheng Li","Sihui Ji","Yifei Yang","Yue Wang","Rong Xiong","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2310.11092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03517v3","updated":"2024-09-02T08:30:37Z","published":"2023-12-06T14:24:26Z","title":"FRDiff : Feature Reuse for Universal Training-free Acceleration of\n Diffusion Models","summary":" The substantial computational costs of diffusion models, especially due to\nthe repeated denoising steps necessary for high-quality image generation,\npresent a major obstacle to their widespread adoption. While several studies\nhave attempted to address this issue by reducing the number of score function\nevaluations (NFE) using advanced ODE solvers without fine-tuning, the decreased\nnumber of denoising iterations misses the opportunity to update fine details,\nresulting in noticeable quality degradation. In our work, we introduce an\nadvanced acceleration technique that leverages the temporal redundancy inherent\nin diffusion models. Reusing feature maps with high temporal similarity opens\nup a new opportunity to save computation resources without compromising output\nquality. To realize the practical benefits of this intuition, we conduct an\nextensive analysis and propose a novel method, FRDiff. FRDiff is designed to\nharness the advantages of both reduced NFE and feature reuse, achieving a\nPareto frontier that balances fidelity and latency trade-offs in various\ngenerative tasks.\n","authors":["Junhyuk So","Jungwon Lee","Eunhyeok Park"],"pdf_url":"https://arxiv.org/pdf/2312.03517v3.pdf","comment":"Accepted at ECCV 2024. Code :\n https://github.com/ECoLab-POSTECH/FRDiff"},{"id":"http://arxiv.org/abs/2406.05677v2","updated":"2024-09-02T07:32:55Z","published":"2024-06-09T07:22:50Z","title":"Evolution-aware VAriance (EVA) Coreset Selection for Medical Image\n Classification","summary":" In the medical field, managing high-dimensional massive medical imaging data\nand performing reliable medical analysis from it is a critical challenge,\nespecially in resource-limited environments such as remote medical facilities\nand mobile devices. This necessitates effective dataset compression techniques\nto reduce storage, transmission, and computational cost. However, existing\ncoreset selection methods are primarily designed for natural image datasets,\nand exhibit doubtful effectiveness when applied to medical image datasets due\nto challenges such as intra-class variation and inter-class similarity. In this\npaper, we propose a novel coreset selection strategy termed as Evolution-aware\nVAriance (EVA), which captures the evolutionary process of model training\nthrough a dual-window approach and reflects the fluctuation of sample\nimportance more precisely through variance measurement. Extensive experiments\non medical image datasets demonstrate the effectiveness of our strategy over\nprevious SOTA methods, especially at high compression rates. EVA achieves\n98.27% accuracy with only 10% training data, compared to 97.20% for the full\ntraining set. 
None of the compared baseline methods can exceed Random at 5%\nselection rate, while EVA outperforms Random by 5.61%, showcasing its potential\nfor efficient medical image analysis.\n","authors":["Yuxin Hong","Xiao Zhang","Xin Zhang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.05677v2.pdf","comment":"Accepted by ACM Multimedia 2024 (oral), see:\n https://openreview.net/forum?id=m1qrB9KSYD"},{"id":"http://arxiv.org/abs/2405.15434v3","updated":"2024-09-02T07:18:16Z","published":"2024-05-24T11:02:55Z","title":"Biometrics and Behavior Analysis for Detecting Distractions in\n e-Learning","summary":" In this article, we explore computer vision approaches to detect abnormal\nhead pose during e-learning sessions and we introduce a study on the effects of\nmobile phone usage during these sessions. We utilize behavioral data collected\nfrom 120 learners monitored while participating in a MOOC learning sessions.\nOur study focuses on the influence of phone-usage events on behavior and\nphysiological responses, specifically attention, heart rate, and meditation,\nbefore, during, and after phone usage. Additionally, we propose an approach for\nestimating head pose events using images taken by the webcam during the MOOC\nlearning sessions to detect phone-usage events. Our hypothesis suggests that\nhead posture undergoes significant changes when learners interact with a mobile\nphone, contrasting with the typical behavior seen when learners face a computer\nduring e-learning sessions. We propose an approach designed to detect\ndeviations in head posture from the average observed during a learner's\nsession, operating as a semi-supervised method. This system flags events\nindicating alterations in head posture for subsequent human review and\nselection of mobile phone usage occurrences with a sensitivity over 90%.\n","authors":["Álvaro Becerra","Javier Irigoyen","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez","Mutlu Cukurova"],"pdf_url":"https://arxiv.org/pdf/2405.15434v3.pdf","comment":"Published in IEEE Intl. Symposium on Computers in Education (SIIE)\n 2024"},{"id":"http://arxiv.org/abs/2405.20091v4","updated":"2024-09-02T07:15:02Z","published":"2024-05-30T14:27:40Z","title":"VAAD: Visual Attention Analysis Dashboard applied to e-Learning","summary":" In this paper, we present an approach in the Multimodal Learning Analytics\nfield. Within this approach, we have developed a tool to visualize and analyze\neye movement data collected during learning sessions in online courses. The\ntool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These\neye movement data have been gathered using an eye-tracker and subsequently\nprocessed and visualized for interpretation. The purpose of the tool is to\nconduct a descriptive analysis of the data by facilitating its visualization,\nenabling the identification of differences and learning patterns among various\nlearner populations. Additionally, it integrates a predictive module capable of\nanticipating learner activities during a learning session. Consequently, VAAD\nholds the potential to offer valuable insights into online learning behaviors\nfrom both descriptive and predictive perspectives.\n","authors":["Miriam Navarro","Álvaro Becerra","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2405.20091v4.pdf","comment":"Published in IEEE Intl. 
Symposium on Computers in Education (SIIE)\n 2024"},{"id":"http://arxiv.org/abs/2312.16039v2","updated":"2024-09-02T06:47:08Z","published":"2023-12-26T12:56:31Z","title":"Dual-scale Enhanced and Cross-generative Consistency Learning for\n Semi-supervised Medical Image Segmentation","summary":" Medical image segmentation plays a crucial role in computer-aided diagnosis.\nHowever, existing methods heavily rely on fully supervised training, which\nrequires a large amount of labeled data with time-consuming pixel-wise\nannotations. Moreover, accurately segmenting lesions poses challenges due to\nvariations in shape, size, and location. To address these issues, we propose a\nnovel Dual-scale Enhanced and Cross-generative consistency learning framework\nfor semi-supervised medical image Segmentation (DEC-Seg). First, we propose a\nCross-level Feature Aggregation (CFA) module that integrates cross-level\nadjacent layers to enhance the feature representation ability across different\nresolutions. To address scale variation, we present a scale-enhanced\nconsistency constraint, which ensures consistency in the segmentation maps\ngenerated from the same input image at different scales. This constraint helps\nhandle variations in lesion sizes and improves the robustness of the model.\nFurthermore, we propose a cross-generative consistency scheme, in which the\noriginal and perturbed images can be reconstructed using cross-segmentation\nmaps. This consistency constraint allows us to mine effective feature\nrepresentations and boost the segmentation performance. To further exploit the\nscale information, we propose a Dual-scale Complementary Fusion (DCF) module\nthat integrates features from two scale-specific decoders operating at\ndifferent scales to help produce more accurate segmentation maps. Extensive\nexperimental results on multiple medical segmentation tasks (polyp, skin\nlesion, and brain glioma) demonstrate the effectiveness of our DEC-Seg against\nother state-of-the-art semi-supervised segmentation approaches. The\nimplementation code will be released at https://github.com/taozh2017/DECSeg.\n","authors":["Yunqi Gu","Tao Zhou","Yizhe Zhang","Yi Zhou","Kelei He","Chen Gong","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2312.16039v2.pdf","comment":"12 pages 10 figures"},{"id":"http://arxiv.org/abs/2312.10692v2","updated":"2024-09-02T06:37:04Z","published":"2023-12-17T11:59:14Z","title":"Pedestrian Attribute Recognition via CLIP based Prompt Vision-Language\n Fusion","summary":" Existing pedestrian attribute recognition (PAR) algorithms adopt pre-trained\nCNN (e.g., ResNet) as their backbone network for visual feature learning, which\nmight obtain sub-optimal results due to the insufficient employment of the\nrelations between pedestrian images and attribute labels. In this paper, we\nformulate PAR as a vision-language fusion problem and fully exploit the\nrelations between pedestrian images and attribute labels. Specifically, the\nattribute phrases are first expanded into sentences, and then the pre-trained\nvision-language model CLIP is adopted as our backbone for feature embedding of\nvisual images and attribute descriptions. The contrastive learning objective\nconnects the vision and language modalities well in the CLIP-based feature\nspace, and the Transformer layers used in CLIP can capture the long-range\nrelations between pixels. Then, a multi-modal Transformer is adopted to fuse\nthe dual features effectively and feed-forward network is used to predict\nattributes. 
To optimize our network efficiently, we propose the region-aware\nprompt tuning technique to adjust very few parameters (i.e., only the prompt\nvectors and classification heads) and fix both the pre-trained VL model and\nmulti-modal Transformer. Our proposed PAR algorithm only adjusts 0.75%\nlearnable parameters compared with the fine-tuning strategy. It also achieves\nnew state-of-the-art performance on both standard and zero-shot settings for\nPAR, including RAPv1, RAPv2, WIDER, PA100K, and PETA-ZS, RAP-ZS datasets. The\nsource code and pre-trained models will be released on\nhttps://github.com/Event-AHU/OpenPAR.\n","authors":["Xiao Wang","Jiandong Jin","Chenglong Li","Jin Tang","Cheng Zhang","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2312.10692v2.pdf","comment":"Accepted by IEEE TCSVT 2024, Camera Ready Version"},{"id":"http://arxiv.org/abs/2408.17011v2","updated":"2024-09-02T06:31:48Z","published":"2024-08-30T04:51:19Z","title":"Disease Classification and Impact of Pretrained Deep Convolution Neural\n Networks on Diverse Medical Imaging Datasets across Imaging Modalities","summary":" Imaging techniques such as Chest X-rays, whole slide images, and optical\ncoherence tomography serve as the initial screening and detection for a wide\nvariety of medical pulmonary and ophthalmic conditions respectively. This paper\ninvestigates the intricacies of using pretrained deep convolutional neural\nnetworks with transfer learning across diverse medical imaging datasets with\nvarying modalities for binary and multiclass classification. We conducted a\ncomprehensive performance analysis with ten network architectures and model\nfamilies each with pretraining and random initialization. Our finding showed\nthat the use of pretrained models as fixed feature extractors yields poor\nperformance irrespective of the datasets. Contrary, histopathology microscopy\nwhole slide images have better performance. It is also found that deeper and\nmore complex architectures did not necessarily result in the best performance.\nThis observation implies that the improvements in ImageNet are not parallel to\nthe medical imaging tasks. Within a medical domain, the performance of the\nnetwork architectures varies within model families with shifts in datasets.\nThis indicates that the performance of models within a specific modality may\nnot be conclusive for another modality within the same domain. This study\nprovides a deeper understanding of the applications of deep learning techniques\nin medical imaging and highlights the impact of pretrained networks across\ndifferent medical imaging datasets under five different experimental settings.\n","authors":["Jutika Borah","Kumaresh Sarmah","Hidam Kumarjit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.17011v2.pdf","comment":"15 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2408.17064v2","updated":"2024-09-02T06:25:09Z","published":"2024-08-30T07:49:35Z","title":"Instant Adversarial Purification with Adversarial Consistency\n Distillation","summary":" Neural networks, despite their remarkable performance in widespread\napplications, including image classification, are also known to be vulnerable\nto subtle adversarial noise. Although some diffusion-based purification methods\nhave been proposed, for example, DiffPure, those methods are time-consuming. In\nthis paper, we propose One Step Control Purification (OSCP), a diffusion-based\npurification model that can purify the adversarial image in one Neural Function\nEvaluation (NFE) in diffusion models. 
We use Latent Consistency Model (LCM) and\nControlNet for our one-step purification. OSCP is computationally friendly and\ntime efficient compared to other diffusion-based purification methods; we\nachieve defense success rate of 74.19\\% on ImageNet, only requiring 0.1s for\neach purification. Moreover, there is a fundamental incongruence between\nconsistency distillation and adversarial perturbation. To address this\nontological dissonance, we propose Gaussian Adversarial Noise Distillation\n(GAND), a novel consistency distillation framework that facilitates a more\nnuanced reconciliation of the latent space dynamics, effectively bridging the\nnatural and adversarial manifolds. Our experiments show that the GAND does not\nneed a Full Fine Tune (FFT); PEFT, e.g., LoRA is sufficient.\n","authors":["Chun Tong Lei","Hon Ming Yam","Zhongliang Guo","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.17064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03024v3","updated":"2024-09-02T05:51:02Z","published":"2023-08-06T05:23:25Z","title":"Show Me the World in My Language: Establishing the First Baseline for\n Scene-Text to Scene-Text Translation","summary":" In this work, we study the task of ``visually'' translating scene text from a\nsource language (e.g., Hindi) to a target language (e.g., English). Visual\ntranslation involves not just the recognition and translation of scene text but\nalso the generation of the translated image that preserves visual features of\nthe source scene text, such as font, size, and background. There are several\nchallenges associated with this task, such as translation with limited context,\ndeciding between translation and transliteration, accommodating varying text\nlengths within fixed spatial boundaries, and preserving the font and background\nstyles of the source scene text in the target language. To address this\nproblem, we make the following contributions: (i) We study visual translation\nas a standalone problem for the first time in the literature. (ii) We present a\ncascaded framework for visual translation that combines state-of-the-art\nmodules for scene text recognition, machine translation, and scene text\nsynthesis as a baseline for the task. (iii) We propose a set of task-specific\ndesign enhancements to design a variant of the baseline to obtain performance\nimprovements. (iv) Currently, the existing related literature lacks any\ncomprehensive performance evaluation for this novel task. To fill this gap, we\nintroduce several automatic and user-assisted evaluation metrics designed\nexplicitly for evaluating visual translation. Further, we evaluate presented\nbaselines for translating scene text between Hindi and English. Our experiments\ndemonstrate that although we can effectively perform visual translation over a\nlarge collection of scene text images, the presented baseline only partially\naddresses challenges posed by visual translation tasks. 
We firmly believe that\nthis new task and the limitations of existing models, as reported in this\npaper, should encourage further research in visual translation.\n","authors":["Shreyas Vaidya","Arvind Kumar Sharma","Prajwal Gatti","Anand Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.03024v3.pdf","comment":"Accepted at ICPR 2024, Project Website:\n https://vl2g.github.io/projects/visTrans/"},{"id":"http://arxiv.org/abs/2403.06764v3","updated":"2024-09-02T05:48:54Z","published":"2024-03-11T14:35:32Z","title":"An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference\n Acceleration for Large Vision-Language Models","summary":" In this study, we identify the inefficient attention phenomena in Large\nVision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5,\nQwenVL-Chat and Video-LLaVA. We find out that the attention computation over\nvisual tokens is of extreme inefficiency in the deep layers of popular LVLMs,\nsuggesting a need for a sparser approach compared to textual data handling. To\nthis end, we introduce FastV, a versatile plug-and-play method designed to\noptimize computational efficiency by learning adaptive attention patterns in\nearly layers and pruning visual tokens in subsequent ones. Our evaluations\ndemonstrate FastV's ability to dramatically reduce computational costs (e.g., a\n45 reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a\nwide range of image and video understanding tasks. The computational efficiency\nand performance trade-off of FastV are highly customizable and\npareto-efficient. It can compress the FLOPs of a 13B-parameter model to achieve\na lower budget than that of a 7B-parameter model, while still maintaining\nsuperior performance. We believe FastV has practical values for deployment of\nLVLMs in edge devices and commercial models. Code is released at\nhttps://github.com/pkunlp-icler/FastV.\n","authors":["Liang Chen","Haozhe Zhao","Tianyu Liu","Shuai Bai","Junyang Lin","Chang Zhou","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2403.06764v3.pdf","comment":"Accepted to ECCV 2024 (Oral), code is released at\n https://github.com/pkunlp-icler/FastV,"},{"id":"http://arxiv.org/abs/2408.11402v2","updated":"2024-09-02T05:46:43Z","published":"2024-08-21T08:01:00Z","title":"Video Diffusion Models are Strong Video Inpainter","summary":" Propagation-based video inpainting using optical flow at the pixel or feature\nlevel has recently garnered significant attention. However, it has limitations\nsuch as the inaccuracy of optical flow prediction and the propagation of noise\nover time. These issues result in non-uniform noise and time consistency\nproblems throughout the video, which are particularly pronounced when the\nremoved area is large and involves substantial movement. To address these\nissues, we propose a novel First Frame Filling Video Diffusion Inpainting model\n(FFF-VDI). We design FFF-VDI inspired by the capabilities of pre-trained\nimage-to-video diffusion models that can transform the first frame image into a\nhighly natural video. To apply this to the video inpainting task, we propagate\nthe noise latent information of future frames to fill the masked areas of the\nfirst frame's noise latent code. Next, we fine-tune the pre-trained\nimage-to-video diffusion model to generate the inpainted video. The proposed\nmodel addresses the limitations of existing methods that rely on optical flow\nquality, producing much more natural and temporally consistent videos. 
This\nproposed approach is the first to effectively integrate image-to-video\ndiffusion models into video inpainting tasks. Through various comparative\nexperiments, we demonstrate that the proposed model can robustly handle diverse\ninpainting types with high quality.\n","authors":["Minhyeok Lee","Suhwan Cho","Chajin Shin","Jungho Lee","Sunghun Yang","Sangyoun Lee"],"pdf_url":"https://arxiv.org/pdf/2408.11402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10901v2","updated":"2024-09-02T05:25:06Z","published":"2024-08-20T14:43:53Z","title":"A Grey-box Attack against Latent Diffusion Model-based Image Editing by\n Posterior Collapse","summary":" Recent advancements in generative AI, particularly Latent Diffusion Models\n(LDMs), have revolutionized image synthesis and manipulation. However, these\ngenerative techniques raises concerns about data misappropriation and\nintellectual property infringement. Adversarial attacks on machine learning\nmodels have been extensively studied, and a well-established body of research\nhas extended these techniques as a benign metric to prevent the underlying\nmisuse of generative AI. Current approaches to safeguarding images from\nmanipulation by LDMs are limited by their reliance on model-specific knowledge\nand their inability to significantly degrade semantic quality of generated\nimages. In response to these shortcomings, we propose the Posterior Collapse\nAttack (PCA) based on the observation that VAEs suffer from posterior collapse\nduring training. Our method minimizes dependence on the white-box information\nof target models to get rid of the implicit reliance on model-specific\nknowledge. By accessing merely a small amount of LDM parameters, in specific\nmerely the VAE encoder of LDMs, our method causes a substantial semantic\ncollapse in generation quality, particularly in perceptual consistency, and\ndemonstrates strong transferability across various model architectures.\nExperimental results show that PCA achieves superior perturbation effects on\nimage generation of LDMs with lower runtime and VRAM. Our method outperforms\nexisting techniques, offering a more robust and generalizable solution that is\nhelpful in alleviating the socio-technical challenges posed by the rapidly\nevolving landscape of generative AI.\n","authors":["Zhongliang Guo","Lei Fang","Jingyu Lin","Yifei Qian","Shuai Zhao","Zeyu Wang","Junhao Dong","Cunjian Chen","Ognjen Arandjelović","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.10901v2.pdf","comment":"21 pages, 7 figures, 10 tables"},{"id":"http://arxiv.org/abs/2303.03856v3","updated":"2024-09-02T03:56:06Z","published":"2023-03-07T12:48:02Z","title":"Event Voxel Set Transformer for Spatiotemporal Representation Learning\n on Event Streams","summary":" Event cameras are neuromorphic vision sensors that record a scene as sparse\nand asynchronous event streams. Most event-based methods project events into\ndense frames and process them using conventional vision models, resulting in\nhigh computational complexity. A recent trend is to develop point-based\nnetworks that achieve efficient event processing by learning sparse\nrepresentations. However, existing works may lack robust local information\naggregators and effective feature interaction operations, thus limiting their\nmodeling capabilities. To this end, we propose an attention-aware model named\nEvent Voxel Set Transformer (EVSTr) for efficient spatiotemporal representation\nlearning on event streams. 
It first converts the event stream into voxel sets\nand then hierarchically aggregates voxel features to obtain robust\nrepresentations. The core of EVSTr is an event voxel transformer encoder that\nconsists of two well-designed components, including the Multi-Scale Neighbor\nEmbedding Layer (MNEL) for local information aggregation and the Voxel\nSelf-Attention Layer (VSAL) for global feature interaction. Enabling the\nnetwork to incorporate a long-range temporal structure, we introduce a segment\nmodeling strategy (S$^{2}$TM) to learn motion patterns from a sequence of\nsegmented voxel sets. The proposed model is evaluated on two recognition tasks,\nincluding object classification and action recognition. To provide a convincing\nmodel evaluation, we present a new event-based action recognition dataset\n(NeuroHAR) recorded in challenging scenarios. Comprehensive experiments show\nthat EVSTr achieves state-of-the-art performance while maintaining low model\ncomplexity.\n","authors":["Bochen Xie","Yongjian Deng","Zhanpeng Shao","Qingsong Xu","Youfu Li"],"pdf_url":"https://arxiv.org/pdf/2303.03856v3.pdf","comment":"Accepted by IEEE Transactions on Circuits and Systems for Video\n Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2402.14154v3","updated":"2024-09-02T02:41:26Z","published":"2024-02-21T22:27:40Z","title":"MM-Soc: Benchmarking Multimodal Large Language Models in Social Media\n Platforms","summary":" Social media platforms are hubs for multimodal information exchange,\nencompassing text, images, and videos, making it challenging for machines to\ncomprehend the information or emotions associated with interactions in online\nspaces. Multimodal Large Language Models (MLLMs) have emerged as a promising\nsolution to these challenges, yet they struggle to accurately interpret human\nemotions and complex content such as misinformation. This paper introduces\nMM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of\nmultimodal social media content. MM-Soc compiles prominent multimodal datasets\nand incorporates a novel large-scale YouTube tagging dataset, targeting a range\nof tasks from misinformation detection, hate speech detection, and social\ncontext generation. Through our exhaustive evaluation on ten size-variants of\nfour open-source MLLMs, we have identified significant performance disparities,\nhighlighting the need for advancements in models' social understanding\ncapabilities. Our analysis reveals that, in a zero-shot setting, various types\nof MLLMs generally exhibit difficulties in handling social media tasks.\nHowever, MLLMs demonstrate performance improvements post fine-tuning,\nsuggesting potential pathways for improvement. Our code and data are available\nat https://github.com/claws-lab/MMSoc.git.\n","authors":["Yiqiao Jin","Minje Choi","Gaurav Verma","Jindong Wang","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2402.14154v3.pdf","comment":"In Proceedings of ACL 2024"},{"id":"http://arxiv.org/abs/2408.15063v3","updated":"2024-09-02T02:32:45Z","published":"2024-08-27T13:47:31Z","title":"Adapting Segment Anything Model to Multi-modal Salient Object Detection\n with Semantic Feature Fusion Guidance","summary":" Although most existing multi-modal salient object detection (SOD) methods\ndemonstrate effectiveness through training models from scratch, the limited\nmulti-modal data hinders these methods from reaching optimality. 
In this paper,\nwe propose a novel framework to explore and exploit the powerful feature\nrepresentation and zero-shot generalization ability of the pre-trained Segment\nAnything Model (SAM) for multi-modal SOD. Despite serving as a recent vision\nfundamental model, driving the class-agnostic SAM to comprehend and detect\nsalient objects accurately is non-trivial, especially in challenging scenes. To\nthis end, we develop \\underline{SAM} with se\\underline{m}antic\nf\\underline{e}ature fu\\underline{s}ion guidanc\\underline{e} (Sammese), which\nincorporates multi-modal saliency-specific knowledge into SAM to adapt SAM to\nmulti-modal SOD tasks. However, it is difficult for SAM trained on single-modal\ndata to directly mine the complementary benefits of multi-modal inputs and\ncomprehensively utilize them to achieve accurate saliency prediction. To\naddress these issues, we first design a multi-modal complementary fusion module\nto extract robust multi-modal semantic features by integrating information from\nvisible and thermal or depth image pairs. Then, we feed the extracted\nmulti-modal semantic features into both the SAM image encoder and mask decoder\nfor fine-tuning and prompting, respectively. Specifically, in the image\nencoder, a multi-modal adapter is proposed to adapt the single-modal SAM to\nmulti-modal information. In the mask decoder, a semantic-geometric prompt\ngeneration strategy is proposed to produce corresponding embeddings with\nvarious saliency cues. Extensive experiments on both RGB-D and RGB-T SOD\nbenchmarks show the effectiveness of the proposed framework. The code will be\navailable at \\url{https://github.com/Angknpng/Sammese}.\n","authors":["Kunpeng Wang","Danying Lin","Chenglong Li","Zhengzheng Tu","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2408.15063v3.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2402.02112v2","updated":"2024-09-02T02:20:05Z","published":"2024-02-03T10:35:42Z","title":"S-NeRF++: Autonomous Driving Simulation via Neural Reconstruction and\n Generation","summary":" Autonomous driving simulation system plays a crucial role in enhancing\nself-driving data and simulating complex and rare traffic scenarios, ensuring\nnavigation safety. However, traditional simulation systems, which often heavily\nrely on manual modeling and 2D image editing, struggled with scaling to\nextensive scenes and generating realistic simulation data. In this study, we\npresent S-NeRF++, an innovative autonomous driving simulation system based on\nneural reconstruction. Trained on widely-used self-driving datasets such as\nnuScenes and Waymo, S-NeRF++ can generate a large number of realistic street\nscenes and foreground objects with high rendering quality as well as offering\nconsiderable flexibility in manipulation and simulation. Specifically, S-NeRF++\nis an enhanced neural radiance field for synthesizing large-scale scenes and\nmoving vehicles, with improved scene parameterization and camera pose learning.\nThe system effectively utilizes noisy and sparse LiDAR data to refine training\nand address depth outliers, ensuring high-quality reconstruction and novel-view\nrendering. It also provides a diverse foreground asset bank by reconstructing\nand generating different foreground vehicles to support comprehensive scenario\ncreation.Moreover, we have developed an advanced foreground-background fusion\npipeline that skillfully integrates illumination and shadow effects, further\nenhancing the realism of our simulations. 
With the high-quality simulated data\nprovided by our S-NeRF++, we found the perception methods enjoy performance\nboosts on several autonomous driving downstream tasks, further demonstrating\nour proposed simulator's effectiveness.\n","authors":["Yurui Chen","Junge Zhang","Ziyang Xie","Wenye Li","Feihu Zhang","Jiachen Lu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2402.02112v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07704v2","updated":"2024-09-02T02:08:22Z","published":"2023-09-14T13:29:41Z","title":"NutritionVerse: Empirical Study of Various Dietary Intake Estimation\n Approaches","summary":" Accurate dietary intake estimation is critical for informing policies and\nprograms to support healthy eating, as malnutrition has been directly linked to\ndecreased quality of life. However self-reporting methods such as food diaries\nsuffer from substantial bias. Other conventional dietary assessment techniques\nand emerging alternative approaches such as mobile applications incur high time\ncosts and may necessitate trained personnel. Recent work has focused on using\ncomputer vision and machine learning to automatically estimate dietary intake\nfrom food images, but the lack of comprehensive datasets with diverse\nviewpoints, modalities and food annotations hinders the accuracy and realism of\nsuch methods. To address this limitation, we introduce NutritionVerse-Synth,\nthe first large-scale dataset of 84,984 photorealistic synthetic 2D food images\nwith associated dietary information and multimodal annotations (including depth\nimages, instance masks, and semantic masks). Additionally, we collect a real\nimage dataset, NutritionVerse-Real, containing 889 images of 251 dishes to\nevaluate realism. Leveraging these novel datasets, we develop and benchmark\nNutritionVerse, an empirical study of various dietary intake estimation\napproaches, including indirect segmentation-based and direct prediction\nnetworks. We further fine-tune models pretrained on synthetic data with real\nimages to provide insights into the fusion of synthetic and real data. Finally,\nwe release both datasets (NutritionVerse-Synth, NutritionVerse-Real) on\nhttps://www.kaggle.com/nutritionverse/datasets as part of an open initiative to\naccelerate machine learning for dietary sensing.\n","authors":["Chi-en Amy Tai","Matthew Keller","Saeejith Nair","Yuhao Chen","Yifan Wu","Olivia Markham","Krish Parmar","Pengcheng Xi","Heather Keller","Sharon Kirkpatrick","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2309.07704v2.pdf","comment":"Corrections made to Tables 6, 7, and 8, and corrections made to\n Experiments Part C. Additional clarification made in Section 4"},{"id":"http://arxiv.org/abs/2403.10814v2","updated":"2024-09-02T00:54:47Z","published":"2024-03-16T05:21:42Z","title":"DarkGS: Learning Neural Illumination and 3D Gaussians Relighting for\n Robotic Exploration in the Dark","summary":" Humans have the remarkable ability to construct consistent mental models of\nan environment, even under limited or varying levels of illumination. We wish\nto endow robots with this same capability. In this paper, we tackle the\nchallenge of constructing a photorealistic scene representation under poorly\nilluminated conditions and with a moving light source. We approach the task of\nmodeling illumination as a learning problem, and utilize the developed\nillumination model to aid in scene reconstruction. 
We introduce an innovative\nframework that uses a data-driven approach, Neural Light Simulators (NeLiS), to\nmodel and calibrate the camera-light system. Furthermore, we present DarkGS, a\nmethod that applies NeLiS to create a relightable 3D Gaussian scene model\ncapable of real-time, photorealistic rendering from novel viewpoints. We show\nthe applicability and robustness of our proposed simulator and system in a\nvariety of real-world environments.\n","authors":["Tianyi Zhang","Kaining Huang","Weiming Zhi","Matthew Johnson-Roberson"],"pdf_url":"https://arxiv.org/pdf/2403.10814v2.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.12606v2","updated":"2024-09-02T00:52:01Z","published":"2024-08-08T05:04:13Z","title":"Towards Non-invasive and Personalized Management of Breast Cancer\n Patients from Multiparametric MRI via A Large Mixture-of-Modality-Experts\n Model","summary":" Breast magnetic resonance imaging (MRI) is the imaging technique with the\nhighest sensitivity for detecting breast cancer and is routinely used for women\nat high risk. Despite the comprehensive multiparametric protocol of breast MRI,\nexisting artificial intelligence-based studies predominantly rely on single\nsequences and have limited validation. Here we report a large\nmixture-of-modality-experts model (MOME) that integrates multiparametric MRI\ninformation within a unified structure, offering a noninvasive method for\npersonalized breast cancer management. We have curated the largest\nmultiparametric breast MRI dataset, involving 5,205 patients from three\nhospitals in the north, southeast, and southwest of China, for the development\nand extensive evaluation of our model. MOME demonstrated accurate and robust\nidentification of breast cancer. It achieved comparable performance for\nmalignancy recognition to that of four senior radiologists and significantly\noutperformed a junior radiologist, with 0.913 AUROC, 0.948 AUPRC, 0.905 F1\nscore, and 0.723 MCC. Our findings suggest that MOME could reduce the need for\nbiopsies in BI-RADS 4 patients with a ratio of 7.3%, classify triple-negative\nbreast cancer with an AUROC of 0.709, and predict pathological complete\nresponse to neoadjuvant chemotherapy with an AUROC of 0.694. The model further\nsupports scalable and interpretable inference, adapting to missing modalities\nand providing decision explanations by highlighting lesions and measuring\nmodality contributions. MOME exemplifies a discriminative, robust, scalable,\nand interpretable multimodal model, paving the way for noninvasive,\npersonalized management of breast cancer patients based on multiparametric\nbreast imaging data.\n","authors":["Luyang Luo","Mingxiang Wu","Mei Li","Yi Xin","Qiong Wang","Varut Vardhanabhuti","Winnie CW Chu","Zhenhui Li","Juan Zhou","Pranav Rajpurkar","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2408.12606v2.pdf","comment":"27 pages, 8 figures, 10 tables"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2404.07981v2","updated":"2024-09-02T21:29:04Z","published":"2024-04-11T17:57:32Z","title":"Manipulating Large Language Models to Increase Product Visibility","summary":" Large language models (LLMs) are increasingly being integrated into search\nengines to provide natural language responses tailored to user queries.\nCustomers and end-users are also becoming more dependent on these models for\nquick and easy purchase decisions. 
In this work, we investigate whether\nrecommendations from LLMs can be manipulated to enhance a product's visibility.\nWe demonstrate that adding a strategic text sequence (STS) -- a carefully\ncrafted message -- to a product's information page can significantly increase\nits likelihood of being listed as the LLM's top recommendation. To understand\nthe impact of STS, we use a catalog of fictitious coffee machines and analyze\nits effect on two target products: one that seldom appears in the LLM's\nrecommendations and another that usually ranks second. We observe that the\nstrategic text sequence significantly enhances the visibility of both products\nby increasing their chances of appearing as the top recommendation. This\nability to manipulate LLM-generated search responses provides vendors with a\nconsiderable competitive advantage and has the potential to disrupt fair market\ncompetition. Just as search engine optimization (SEO) revolutionized how\nwebpages are customized to rank higher in search engine results, influencing\nLLM recommendations could profoundly impact content optimization for AI-driven\nsearch services. Code for our experiments is available at\nhttps://github.com/aounon/llm-rank-optimizer.\n","authors":["Aounon Kumar","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2404.07981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02935v2","updated":"2024-09-02T20:06:21Z","published":"2022-12-06T12:45:15Z","title":"A multi-language toolkit for supporting automated checking of research\n outputs","summary":" This article presents the automatic checking of research outputs package\nacro, which assists researchers and data governance teams by automatically\napplying best-practice principles-based statistical disclosure control (SDC)\ntechniques on-the-fly as researchers conduct their analyses. acro distinguishes\nbetween: research output that is safe to publish; output that requires further\nanalysis; and output that cannot be published because it creates substantial\nrisk of disclosing private data. This is achieved through the use of a\nlightweight Python wrapper that sits over well-known analysis tools that\nproduce outputs such as tables, plots, and statistical models. This adds\nfunctionality to (i) identify potentially disclosive outputs against a range of\ncommonly used disclosure tests; (ii) apply disclosure mitigation strategies\nwhere required; (iii) report reasons for applying SDC; and (iv) produce simple\nsummary documents trusted research environment staff can use to streamline\ntheir workflow. The major analytical programming languages used by researchers\nare supported: Python, R, and Stata. The acro code and documentation are\navailable under an MIT license at https://github.com/AI-SDC/ACRO\n","authors":["Richard J. Preen","Maha Albashir","Simon Davy","Jim Smith"],"pdf_url":"https://arxiv.org/pdf/2212.02935v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01304v3","updated":"2024-09-02T18:53:22Z","published":"2023-11-02T15:18:00Z","title":"VM-Rec: A Variational Mapping Approach for Cold-start User\n Recommendation","summary":" The cold-start problem is a common challenge for most recommender systems.\nThe practical application of most cold-start methods is hindered by the\ndeficiency in auxiliary content information for users. Moreover, most methods\nnecessitate simultaneous updates to the extensive parameters of recommender\nmodels, leading to significant training costs, particularly in large-scale\nindustrial scenarios. 
We observe that the model can generate expressive\nembeddings for warm users with relatively more interactions. Initially, these\nusers were cold-start users, and after transitioning to warm users, they\nexhibit clustering patterns in their embeddings with consistent initial\ninteractions. Based on this motivation, we propose a Variational Mapping\napproach for cold-start user Recommendation (VM-Rec), mapping from few initial\ninteractions to expressive embeddings for cold-start users. Specifically, we\nencode the initial interactions into a latent representation, where each\ndimension disentangledly signifies the degree of association with each warm\nuser. Subsequently, we utilize this latent representation as the parameters for\nthe mapping function, mapping (decoding) it into an expressive embedding, which\ncan be integrated into a pre-trained recommender model directly. Our method is\nevaluated on three datasets using the same base model, demonstrating superior\nperformance compared to other popular cold-start methods.\n","authors":["Linan Zheng","Jiale Chen","Pengsheng Liu","Guangfa Zhang","Jinyun Fang"],"pdf_url":"https://arxiv.org/pdf/2311.01304v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05141v3","updated":"2024-09-02T10:55:30Z","published":"2024-08-09T15:53:55Z","title":"A Hybrid RAG System with Comprehensive Enhancement on Complex Reasoning","summary":" Retrieval-augmented generation (RAG) is a framework enabling large language\nmodels (LLMs) to enhance their accuracy and reduce hallucinations by\nintegrating external knowledge bases. In this paper, we introduce a hybrid RAG\nsystem enhanced through a comprehensive suite of optimizations that\nsignificantly improve retrieval quality, augment reasoning capabilities, and\nrefine numerical computation ability. We refined the text chunks and tables in\nweb pages, added attribute predictors to reduce hallucinations, conducted LLM\nKnowledge Extractor and Knowledge Graph Extractor, and finally built a\nreasoning strategy with all the references. We evaluated our system on the CRAG\ndataset through the Meta CRAG KDD Cup 2024 Competition. Both the local and\nonline evaluations demonstrate that our system significantly enhances complex\nreasoning capabilities. In local evaluations, we have significantly improved\naccuracy and reduced error rates compared to the baseline model, achieving a\nnotable increase in scores. In the meanwhile, we have attained outstanding\nresults in online assessments, demonstrating the performance and generalization\ncapabilities of the proposed system. The source code for our system is released\nin \\url{https://gitlab.aicrowd.com/shizueyy/crag-new}.\n","authors":["Ye Yuan","Chengwu Liu","Jingyang Yuan","Gongbo Sun","Siqi Li","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.05141v3.pdf","comment":"Technical report for 3rd prize in Task 1 of Meta CRAG KDD Cup 2024"},{"id":"http://arxiv.org/abs/2312.12162v2","updated":"2024-09-02T07:58:20Z","published":"2023-12-19T13:51:48Z","title":"PEPT: Expert Finding Meets Personalized Pre-training","summary":" Finding experts is essential in Community Question Answering (CQA) platforms\nas it enables the effective routing of questions to potential users who can\nprovide relevant answers. The key is to personalized learning expert\nrepresentations based on their historical answered questions, and accurately\nmatching them with target questions. 
There have been some preliminary works\nexploring the usability of PLMs in expert finding, such as pre-training expert\nor question representations. However, these models usually learn pure text\nrepresentations of experts from histories, disregarding personalized and\nfine-grained expert modeling. For alleviating this, we present a personalized\npre-training and fine-tuning paradigm, which could effectively learn expert\ninterest and expertise simultaneously. Specifically, in our pre-training\nframework, we integrate historical answered questions of one expert with one\ntarget question, and regard it as a candidate aware expert-level input unit.\nThen, we fuse expert IDs into the pre-training for guiding the model to model\npersonalized expert representations, which can help capture the unique\ncharacteristics and expertise of each individual expert. Additionally, in our\npre-training task, we design: 1) a question-level masked language model task to\nlearn the relatedness between histories, enabling the modeling of\nquestion-level expert interest; 2) a vote-oriented task to capture\nquestion-level expert expertise by predicting the vote score the expert would\nreceive. Through our pre-training framework and tasks, our approach could\nholistically learn expert representations including interests and expertise.\nOur method has been extensively evaluated on six real-world CQA datasets, and\nthe experimental results consistently demonstrate the superiority of our\napproach over competitive baseline methods.\n","authors":["Qiyao Peng","Hongyan Xu","Yinghui Wang","Hongtao Liu","Cuiying Huo","Wenjun Wang"],"pdf_url":"https://arxiv.org/pdf/2312.12162v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01445v1","updated":"2024-09-02T20:00:49Z","published":"2024-09-02T20:00:49Z","title":"Sync from the Sea: Retrieving Alignable Videos from Large-Scale Datasets","summary":" Temporal video alignment aims to synchronize the key events like object\ninteractions or action phase transitions in two videos. Such methods could\nbenefit various video editing, processing, and understanding tasks. However,\nexisting approaches operate under the restrictive assumption that a suitable\nvideo pair for alignment is given, significantly limiting their broader\napplicability. To address this, we re-pose temporal alignment as a search\nproblem and introduce the task of Alignable Video Retrieval (AVR). Given a\nquery video, our approach can identify well-alignable videos from a large\ncollection of clips and temporally synchronize them to the query. To achieve\nthis, we make three key contributions: 1) we introduce DRAQ, a video\nalignability indicator to identify and re-rank the best alignable video from a\nset of candidates; 2) we propose an effective and generalizable frame-level\nvideo feature design to improve the alignment performance of several\noff-the-shelf feature representations, and 3) we propose a novel benchmark and\nevaluation protocol for AVR using cycle-consistency metrics. 
Our experiments on\n3 datasets, including large-scale Kinetics700, demonstrate the effectiveness of\nour approach in identifying alignable video pairs from diverse datasets.\nProject Page: https://daveishan.github.io/avr-webpage/.\n","authors":["Ishan Rajendrakumar Dave","Fabian Caba Heilbron","Mubarak Shah","Simon Jenni"],"pdf_url":"https://arxiv.org/pdf/2409.01445v1.pdf","comment":"ECCV 2024 Oral"},{"id":"http://arxiv.org/abs/2409.01357v1","updated":"2024-09-02T16:19:13Z","published":"2024-09-02T16:19:13Z","title":"Know When to Fuse: Investigating Non-English Hybrid Retrieval in the\n Legal Domain","summary":" Hybrid search has emerged as an effective strategy to offset the limitations\nof different matching paradigms, especially in out-of-domain contexts where\nnotable improvements in retrieval quality have been observed. However, existing\nresearch predominantly focuses on a limited set of retrieval methods, evaluated\nin pairs on domain-general datasets exclusively in English. In this work, we\nstudy the efficacy of hybrid search across a variety of prominent retrieval\nmodels within the unexplored field of law in the French language, assessing\nboth zero-shot and in-domain scenarios. Our findings reveal that in a zero-shot\ncontext, fusing different domain-general models consistently enhances\nperformance compared to using a standalone model, regardless of the fusion\nmethod. Surprisingly, when models are trained in-domain, we find that fusion\ngenerally diminishes performance relative to using the best single system,\nunless fusing scores with carefully tuned weights. These novel insights, among\nothers, expand the applicability of prior findings across a new field and\nlanguage, and contribute to a deeper understanding of hybrid search in\nnon-English specialized domains.\n","authors":["Antoine Louis","Gijs van Dijck","Gerasimos Spanakis"],"pdf_url":"https://arxiv.org/pdf/2409.01357v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.01192v1","updated":"2024-09-02T11:58:56Z","published":"2024-09-02T11:58:56Z","title":"SSD4Rec: A Structured State Space Duality Model for Efficient Sequential\n Recommendation","summary":" Sequential recommendation methods are crucial in modern recommender systems\nfor their remarkable capability to understand a user's changing interests based\non past interactions. However, a significant challenge faced by current methods\n(e.g., RNN- or Transformer-based models) is to effectively and efficiently\ncapture users' preferences by modeling long behavior sequences, which impedes\ntheir various applications like short video platforms where user interactions\nare numerous. Recently, an emerging architecture named Mamba, built on state\nspace models (SSM) with efficient hardware-aware designs, has showcased the\ntremendous potential for sequence modeling, presenting a compelling avenue for\naddressing the challenge effectively. Inspired by this, we propose a novel\ngeneric and efficient sequential recommendation backbone, SSD4Rec, which\nexplores the seamless adaptation of Mamba for sequential recommendations.\nSpecifically, SSD4Rec marks the variable- and long-length item sequences with\nsequence registers and processes the item representations with bidirectional\nStructured State Space Duality (SSD) blocks. This not only allows for\nhardware-aware matrix multiplication but also empowers outstanding capabilities\nin variable-length and long-range sequence modeling. 
Extensive evaluations on\nfour benchmark datasets demonstrate that the proposed model achieves\nstate-of-the-art performance while maintaining near-linear scalability with\nuser sequence length. Our code is publicly available at\nhttps://github.com/ZhangYifeng1995/SSD4Rec.\n","authors":["Haohao Qu","Yifeng Zhang","Liangbo Ning","Wenqi Fan","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2409.01192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01152v1","updated":"2024-09-02T10:37:53Z","published":"2024-09-02T10:37:53Z","title":"Real World Conversational Entity Linking Requires More Than Zeroshots","summary":" Entity linking (EL) in conversations faces notable challenges in practical\napplications, primarily due to the scarcity of entity-annotated conversational\ndatasets and sparse knowledge bases (KB) containing domain-specific, long-tail\nentities. We designed targeted evaluation scenarios to measure the efficacy of\nEL models under resource constraints. Our evaluation employs two KBs: Fandom,\nexemplifying real-world EL complexities, and the widely used Wikipedia. First,\nwe assess EL models' ability to generalize to a new unfamiliar KB using Fandom\nand a novel zero-shot conversational entity linking dataset that we curated\nbased on Reddit discussions on Fandom entities. We then evaluate the\nadaptability of EL models to conversational settings without prior training.\nOur results indicate that current zero-shot EL models falter when introduced to\nnew, domain-specific KBs without prior training, significantly dropping in\nperformance. Our findings reveal that previous evaluation approaches fall short\nof capturing real-world complexities for zero-shot EL, highlighting the\nnecessity for new approaches to design and assess conversational EL models to\nadapt to limited resources. The evaluation setup and the dataset proposed in\nthis research are made publicly available.\n","authors":["Mohanna Hoveyda","Arjen P. de Vries","Maarten de Rijke","Faegheh Hasibi"],"pdf_url":"https://arxiv.org/pdf/2409.01152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01140v1","updated":"2024-09-02T10:20:35Z","published":"2024-09-02T10:20:35Z","title":"LLM-PQA: LLM-enhanced Prediction Query Answering","summary":" The advent of Large Language Models (LLMs) provides an opportunity to change\nthe way queries are processed, moving beyond the constraints of conventional\nSQL-based database systems. However, using an LLM to answer a prediction query\nis still challenging, since an external ML model has to be employed and\ninference has to be performed in order to provide an answer. This paper\nintroduces LLM-PQA, a novel tool that addresses prediction queries formulated\nin natural language. LLM-PQA is the first to combine the capabilities of LLMs\nand retrieval-augmented mechanism for the needs of prediction queries by\nintegrating data lakes and model zoos. This integration provides users with\naccess to a vast spectrum of heterogeneous data and diverse ML models,\nfacilitating dynamic prediction query answering. 
In addition, LLM-PQA can\ndynamically train models on demand, based on specific query requirements,\nensuring reliable and relevant results even when no pre-trained model in a\nmodel zoo, available for the task.\n","authors":["Ziyu Li","Wenjie Zhao","Asterios Katsifodimos","Rihan Hai"],"pdf_url":"https://arxiv.org/pdf/2409.01140v1.pdf","comment":"This paper is accepted as a demo at CIKM 2024"},{"id":"http://arxiv.org/abs/2409.01082v1","updated":"2024-09-02T09:10:47Z","published":"2024-09-02T09:10:47Z","title":"Evidential Transformers for Improved Image Retrieval","summary":" We introduce the Evidential Transformer, an uncertainty-driven transformer\nmodel for improved and robust image retrieval. In this paper, we make several\ncontributions to content-based image retrieval (CBIR). We incorporate\nprobabilistic methods into image retrieval, achieving robust and reliable\nresults, with evidential classification surpassing traditional training based\non multiclass classification as a baseline for deep metric learning.\nFurthermore, we improve the state-of-the-art retrieval results on several\ndatasets by leveraging the Global Context Vision Transformer (GC ViT)\narchitecture. Our experimental results consistently demonstrate the reliability\nof our approach, setting a new benchmark in CBIR in all test settings on the\nStanford Online Products (SOP) and CUB-200-2011 datasets.\n","authors":["Danilo Dordevic","Suryansh Kumar"],"pdf_url":"https://arxiv.org/pdf/2409.01082v1.pdf","comment":"6 pages, 6 figures, To be presented at the 3rd Workshop on\n Uncertainty Quantification for Computer Vision, at the ECCV 2024 conference\n in Milan, Italy"},{"id":"http://arxiv.org/abs/2409.01012v1","updated":"2024-09-02T07:44:48Z","published":"2024-09-02T07:44:48Z","title":"Improved Diversity-Promoting Collaborative Metric Learning for\n Recommendation","summary":" Collaborative Metric Learning (CML) has recently emerged as a popular method\nin recommendation systems (RS), closing the gap between metric learning and\ncollaborative filtering. Following the convention of RS, existing practices\nexploit unique user representation in their model design. This paper focuses on\na challenging scenario where a user has multiple categories of interests. Under\nthis setting, the unique user representation might induce preference bias,\nespecially when the item category distribution is imbalanced. To address this\nissue, we propose a novel method called \\textit{Diversity-Promoting\nCollaborative Metric Learning} (DPCML), with the hope of considering the\ncommonly ignored minority interest of the user. The key idea behind DPCML is to\nintroduce a set of multiple representations for each user in the system where\nusers' preference toward an item is aggregated by taking the minimum item-user\ndistance among their embedding set. Specifically, we instantiate two effective\nassignment strategies to explore a proper quantity of vectors for each user.\nMeanwhile, a \\textit{Diversity Control Regularization Scheme} (DCRS) is\ndeveloped to accommodate the multi-vector representation strategy better.\nTheoretically, we show that DPCML could induce a smaller generalization error\nthan traditional CML. Furthermore, we notice that CML-based approaches usually\nrequire \\textit{negative sampling} to reduce the heavy computational burden\ncaused by the pairwise objective therein. 
In this paper, we reveal the\nfundamental limitation of the widely adopted hard-aware sampling from the\nOne-Way Partial AUC (OPAUC) perspective and then develop an effective sampling\nalternative for the CML-based paradigm. Finally, comprehensive experiments over\na range of benchmark datasets speak to the efficacy of DPCML. Code are\navailable at \\url{https://github.com/statusrank/LibCML}.\n","authors":["Shilong Bao","Qianqian Xu","Zhiyong Yang","Yuan He","Xiaochun Cao","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2409.01012v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.15292"},{"id":"http://arxiv.org/abs/2409.00890v1","updated":"2024-09-02T01:54:33Z","published":"2024-09-02T01:54:33Z","title":"Towards Investigating Biases in Spoken Conversational Search","summary":" Voice-based systems like Amazon Alexa, Google Assistant, and Apple Siri,\nalong with the growing popularity of OpenAI's ChatGPT and Microsoft's Copilot,\nserve diverse populations, including visually impaired and low-literacy\ncommunities. This reflects a shift in user expectations from traditional search\nto more interactive question-answering models. However, presenting information\neffectively in voice-only channels remains challenging due to their linear\nnature. This limitation can impact the presentation of complex queries\ninvolving controversial topics with multiple perspectives. Failing to present\ndiverse viewpoints may perpetuate or introduce biases and affect user\nattitudes. Balancing information load and addressing biases is crucial in\ndesigning a fair and effective voice-based system. To address this, we (i)\nreview how biases and user attitude changes have been studied in screen-based\nweb search, (ii) address challenges in studying these changes in voice-based\nsettings like SCS, (iii) outline research questions, and (iv) propose an\nexperimental setup with variables, data, and instruments to explore biases in a\nvoice-based setting like Spoken Conversational Search.\n","authors":["Sachin Pathiyan Cherumanal","Falk Scholer","Johanne R. Trippas","Damiano Spina"],"pdf_url":"https://arxiv.org/pdf/2409.00890v1.pdf","comment":"Accepted Late-Breaking Results at ACM ICMI Companion 2024"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.14211v2","updated":"2024-09-02T23:41:47Z","published":"2024-07-19T11:17:42Z","title":"Advanced Predictive Modeling for Enhanced Mortality Prediction in ICU\n Stroke Patients Using Clinical Data","summary":" Background: Stroke is second-leading cause of disability and death among\nadults. Approximately 17 million people suffer from a stroke annually, with\nabout 85% being ischemic strokes. Predicting mortality of ischemic stroke\npatients in intensive care unit (ICU) is crucial for optimizing treatment\nstrategies, allocating resources, and improving survival rates. Methods: We\nacquired data on ICU ischemic stroke patients from MIMIC-IV database, including\ndiagnoses, vital signs, laboratory tests, medications, procedures, treatments,\nand clinical notes. Stroke patients were randomly divided into training (70%,\nn=2441), test (15%, n=523), and validation (15%, n=523) sets. To address data\nimbalances, we applied Synthetic Minority Over-sampling Technique (SMOTE). We\nselected 30 features for model development, significantly reducing feature\nnumber from 1095 used in the best study. We developed a deep learning model to\nassess mortality risk and implemented several baseline machine learning models\nfor comparison. 
Results: XGB-DL model, combining XGBoost for feature selection\nand deep learning, effectively minimized false positives. Model's AUROC\nimproved from 0.865 (95% CI: 0.821 - 0.905) on first day to 0.903 (95% CI:\n0.868 - 0.936) by fourth day using data from 3,646 ICU mortality patients in\nthe MIMIC-IV database with 0.945 AUROC (95% CI: 0.944 - 0.947) during training.\nAlthough other ML models also performed well in terms of AUROC, we chose Deep\nLearning for its higher specificity. Conclusions: Through enhanced feature\nselection and data cleaning, proposed model demonstrates a 13% AUROC\nimprovement compared to existing models while reducing feature number from 1095\nin previous studies to 30.\n","authors":["Armin Abdollahi","Negin Ashrafi","Maryam Pishgar"],"pdf_url":"https://arxiv.org/pdf/2407.14211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04845v2","updated":"2024-09-02T22:43:33Z","published":"2024-02-07T13:44:47Z","title":"AlphaFold Meets Flow Matching for Generating Protein Ensembles","summary":" The biological functions of proteins often depend on dynamic structural\nensembles. In this work, we develop a flow-based generative modeling approach\nfor learning and sampling the conformational landscapes of proteins. We\nrepurpose highly accurate single-state predictors such as AlphaFold and ESMFold\nand fine-tune them under a custom flow matching framework to obtain\nsequence-conditoned generative models of protein structure called AlphaFlow and\nESMFlow. When trained and evaluated on the PDB, our method provides a superior\ncombination of precision and diversity compared to AlphaFold with MSA\nsubsampling. When further trained on ensembles from all-atom MD, our method\naccurately captures conformational flexibility, positional distributions, and\nhigher-order ensemble observables for unseen proteins. Moreover, our method can\ndiversify a static PDB structure with faster wall-clock convergence to certain\nequilibrium properties than replicate MD trajectories, demonstrating its\npotential as a proxy for expensive physics-based simulations. Code is available\nat https://github.com/bjing2016/alphaflow.\n","authors":["Bowen Jing","Bonnie Berger","Tommi Jaakkola"],"pdf_url":"https://arxiv.org/pdf/2402.04845v2.pdf","comment":"ICML 2024"},{"id":"http://arxiv.org/abs/2405.14099v3","updated":"2024-09-02T21:40:41Z","published":"2024-05-23T02:01:05Z","title":"Automatic Differentiation is Essential in Training Neural Networks for\n Solving Differential Equations","summary":" Neural network-based approaches have recently shown significant promise in\nsolving partial differential equations (PDEs) in science and engineering,\nespecially in scenarios featuring complex domains or incorporation of empirical\ndata. One advantage of the neural network methods for PDEs lies in its\nautomatic differentiation (AD), which necessitates only the sample points\nthemselves, unlike traditional finite difference (FD) approximations that\nrequire nearby local points to compute derivatives. In this paper, we\nquantitatively demonstrate the advantage of AD in training neural networks. The\nconcept of truncated entropy is introduced to characterize the training\nproperty. 
Specifically, through comprehensive experimental and theoretical\nanalyses conducted on random feature models and two-layer neural networks, we\ndiscover that the defined truncated entropy serves as a reliable metric for\nquantifying the residual loss of random feature models and the training speed\nof neural networks for both AD and FD methods. Our experimental and theoretical\nanalyses demonstrate that, from a training perspective, AD outperforms FD in\nsolving PDEs.\n","authors":["Chuqi Chen","Yahong Yang","Yang Xiang","Wenrui Hao"],"pdf_url":"https://arxiv.org/pdf/2405.14099v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07820v3","updated":"2024-09-02T21:17:39Z","published":"2023-01-18T23:16:53Z","title":"On the limits of neural network explainability via descrambling","summary":" We characterize the exact solutions to neural network descrambling--a\nmathematical model for explaining the fully connected layers of trained neural\nnetworks (NNs). By reformulating the problem to the minimization of the\nBrockett function arising in graph matching and complexity theory we show that\nthe principal components of the hidden layer preactivations can be\ncharacterized as the optimal explainers or descramblers for the layer weights,\nleading to descrambled weight matrices. We show that in typical deep learning\ncontexts these descramblers take diverse and interesting forms including (1)\nmatching largest principal components with the lowest frequency modes of the\nFourier basis for isotropic hidden data, (2) discovering the semantic\ndevelopment in two-layer linear NNs for signal recovery problems, and (3)\nexplaining CNNs by optimally permuting the neurons. Our numerical experiments\nindicate that the eigendecompositions of the hidden layer data--now understood\nas the descramblers--can also reveal the layer's underlying transformation.\nThese results illustrate that the SVD is more directly related to the\nexplainability of NNs than previously thought and offers a promising avenue for\ndiscovering interpretable motifs for the hidden action of NNs, especially in\ncontexts of operator learning or physics-informed NNs, where the input/output\ndata has limited human readability.\n","authors":["Shashank Sule","Richard G. Spencer","Wojciech Czaja"],"pdf_url":"https://arxiv.org/pdf/2301.07820v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07529v3","updated":"2024-09-02T20:42:08Z","published":"2024-06-11T17:55:25Z","title":"MAP: Low-compute Model Merging with Amortized Pareto Fronts via\n Quadratic Approximation","summary":" Model merging has emerged as an effective approach to combine multiple\nsingle-task models, fine-tuned from the same pre-trained model, into a\nmultitask model. This process typically involves computing a weighted average\nof the model parameters without any additional training. Existing model-merging\nmethods focus on enhancing average task accuracy. However, interference and\nconflicts between the objectives of different tasks can lead to trade-offs\nduring model merging. In real-world applications, a set of solutions with\nvarious trade-offs can be more informative, helping practitioners make\ndecisions based on diverse preferences. In this paper, we introduce a novel\nlow-compute algorithm, Model Merging with Amortized Pareto Front (MAP). MAP\nidentifies a Pareto set of scaling coefficients for merging multiple models to\nreflect the trade-offs. 
The core component of MAP is approximating the\nevaluation metrics of the various tasks using a quadratic approximation\nsurrogate model derived from a pre-selected set of scaling coefficients,\nenabling amortized inference. Experimental results on vision and natural\nlanguage processing tasks show that MAP can accurately identify the Pareto\nfront. To further reduce the required computation of MAP, we propose (1) a\nBayesian adaptive sampling algorithm and (2) a nested merging scheme with\nmultiple stages.\n","authors":["Lu Li","Tianyu Zhang","Zhiqi Bu","Suyuchen Wang","Huan He","Jie Fu","Yonghui Wu","Jiang Bian","Yong Chen","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2406.07529v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17095v2","updated":"2024-09-02T20:33:49Z","published":"2024-08-30T08:26:55Z","title":"RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation\n and Retrieval-Guidance","summary":" Diffusion-based models demonstrate impressive generation capabilities.\nHowever, they also have a massive number of parameters, resulting in enormous\nmodel sizes, thus making them unsuitable for deployment on resource-constraint\ndevices. Block-wise generation can be a promising alternative for designing\ncompact-sized (parameter-efficient) deep generative models since the model can\ngenerate one block at a time instead of generating the whole image at once.\nHowever, block-wise generation is also considerably challenging because\nensuring coherence across generated blocks can be non-trivial. To this end, we\ndesign a retrieval-augmented generation (RAG) approach and leverage the\ncorresponding blocks of the images retrieved by the RAG module to condition the\ntraining and generation stages of a block-wise denoising diffusion model. Our\nconditioning schemes ensure coherence across the different blocks during\ntraining and, consequently, during generation. While we showcase our approach\nusing the latent diffusion model (LDM) as the base model, it can be used with\nother variants of denoising diffusion models. We validate the solution of the\ncoherence problem through the proposed approach by reporting substantive\nexperiments to demonstrate our approach's effectiveness in compact model size\nand excellent generation quality.\n","authors":["Avideep Mukherjee","Soumya Banerjee","Piyush Rai","Vinay P. Namboodiri"],"pdf_url":"https://arxiv.org/pdf/2408.17095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19289v4","updated":"2024-09-02T20:21:29Z","published":"2024-03-28T10:19:36Z","title":"Uplift Modeling Under Limited Supervision","summary":" Estimating causal effects in e-commerce tends to involve costly treatment\nassignments which can be impractical in large-scale settings. Leveraging\nmachine learning to predict such treatment effects without actual intervention\nis a standard practice to diminish the risk. 
However, existing methods for\ntreatment effect prediction tend to rely on training sets of substantial size,\nwhich are built from real experiments and are thus inherently risky to create.\nIn this work we propose a graph neural network to diminish the required\ntraining set size, relying on graphs that are common in e-commerce data.\nSpecifically, we view the problem as node regression with a restricted number\nof labeled instances, develop a two-model neural architecture akin to previous\ncausal effect estimators, and test varying message-passing layers for encoding.\nFurthermore, as an extra step, we combine the model with an acquisition\nfunction to guide the creation of the training set in settings with extremely\nlow experimental budget. The framework is flexible since each step can be used\nseparately with other models or treatment policies. The experiments on real\nlarge-scale networks indicate a clear advantage of our methodology over the\nstate of the art, which in many cases performs close to random, underlining the\nneed for models that can generalize with limited supervision to reduce\nexperimental risks.\n","authors":["George Panagopoulos","Daniele Malitesta","Fragkiskos D. Malliaros","Jun Pang"],"pdf_url":"https://arxiv.org/pdf/2403.19289v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.04118v2","updated":"2024-09-02T18:03:26Z","published":"2024-03-07T00:20:11Z","title":"Globally Stable Neural Imitation Policies","summary":" Imitation learning presents an effective approach to alleviate the\nresource-intensive and time-consuming nature of policy learning from scratch in\nthe solution space. Even though the resulting policy can mimic expert\ndemonstrations reliably, it often lacks predictability in unexplored regions of\nthe state-space, giving rise to significant safety concerns in the face of\nperturbations. To address these challenges, we introduce the Stable Neural\nDynamical System (SNDS), an imitation learning regime which produces a policy\nwith formal stability guarantees. We deploy a neural policy architecture that\nfacilitates the representation of stability based on Lyapunov theorem, and\njointly train the policy and its corresponding Lyapunov candidate to ensure\nglobal stability. We validate our approach by conducting extensive experiments\nin simulation and successfully deploying the trained policies on a real-world\nmanipulator arm. The experimental results demonstrate that our method overcomes\nthe instability, accuracy, and computational intensity problems associated with\nprevious imitation learning methods, making our method a promising solution for\nstable policy learning in complex planning scenarios.\n","authors":["Amin Abyaneh","Mariana Sosa Guzmán","Hsiu-Chin Lin"],"pdf_url":"https://arxiv.org/pdf/2403.04118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21009v2","updated":"2024-09-02T18:01:44Z","published":"2024-07-30T17:55:36Z","title":"AI-Assisted Generation of Difficult Math Questions","summary":" Current LLM training positions mathematical reasoning as a core capability.\nWith publicly available sources fully tapped, there is unmet demand for diverse\nand challenging math questions. Relying solely on human experts is both\ntime-consuming and costly, while LLM-generated questions often lack the\nrequisite diversity and difficulty. We present a design framework that combines\nthe strengths of LLMs with a human-in-the-loop approach to generate a diverse\narray of challenging math questions. 
We leverage the metacognition skills\n[Didolkar et al., 2024] of a strong LLM to extract core \"skills\" from existing\nmath datasets. These skills serve as the basis for generating novel and\ndifficult questions by prompting the LLM with random pairs of core skills. The\nuse of two different skills within each question makes finding such questions\nan \"out of distribution\" task for both LLMs and humans. Our pipeline employs\nLLMs to iteratively generate and refine questions and solutions through\nmultiturn prompting. Human annotators then verify and further refine the\nquestions, with their efficiency enhanced via further LLM interactions.\nApplying this pipeline on skills extracted from the MATH dataset [Hendrycks et\nal., 2021] resulted in MATH$^2$ - a dataset of higher-quality math questions,\nas evidenced by: (a) lower performance of all models on MATH$^2$ than on MATH,\nand (b) higher performance on MATH when using MATH$^2$ questions as in-context\nexamples. Although focused on mathematics, our methodology seems applicable to\nother domains requiring structured reasoning, and potentially as a component of\nscalable oversight. Also of interest is a striking relationship observed in\nmodels' performance on the new dataset: the success rate on MATH$^2$ is\nthe square of that on MATH, suggesting that successfully solving a question in\nMATH$^2$ requires a nontrivial combination of two distinct math skills.\n","authors":["Vedant Shah","Dingli Yu","Kaifeng Lyu","Simon Park","Nan Rosemary Ke","Michael Mozer","Yoshua Bengio","Sanjeev Arora","Anirudh Goyal"],"pdf_url":"https://arxiv.org/pdf/2407.21009v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00872v2","updated":"2024-09-02T17:41:24Z","published":"2024-08-01T18:46:05Z","title":"Online Detection of Anomalies in Temporal Knowledge Graphs with\n Interpretability","summary":" Temporal knowledge graphs (TKGs) are valuable resources for capturing\nevolving relationships among entities, yet they are often plagued by noise,\nnecessitating robust anomaly detection mechanisms. Existing dynamic graph\nanomaly detection approaches struggle to capture the rich semantics introduced\nby node and edge categories within TKGs, while TKG embedding methods lack\ninterpretability, undermining the credibility of anomaly detection. Moreover,\nthese methods falter in adapting to pattern changes and semantic drifts\nresulting from knowledge updates. To tackle these challenges, we introduce\nAnoT, an efficient TKG summarization method tailored for interpretable online\nanomaly detection in TKGs. AnoT begins by summarizing a TKG into a novel rule\ngraph, enabling flexible inference of complex patterns in TKGs. When new\nknowledge emerges, AnoT maps it onto a node in the rule graph and traverses the\nrule graph recursively to derive the anomaly score of the knowledge. The\ntraversal yields reachable nodes that furnish interpretable evidence for the\nvalidity or anomalousness of the new knowledge. Overall, AnoT embodies a\ndetector-updater-monitor architecture, encompassing a detector for offline TKG\nsummarization and online scoring, an updater for real-time rule graph updates\nbased on emerging knowledge, and a monitor for estimating the approximation\nerror of the rule graph. Experimental results on four real-world datasets\ndemonstrate that AnoT surpasses existing methods significantly in terms of\naccuracy and interpretability. 
All of the raw datasets and the implementation\nof AnoT are provided in https://github.com/zjs123/ANoT.\n","authors":["Jiasheng Zhang","Rex Ying","Jie Shao"],"pdf_url":"https://arxiv.org/pdf/2408.00872v2.pdf","comment":"26 pages, 10 figures. Accepted by SIGMOD 2025"},{"id":"http://arxiv.org/abs/2407.04268v3","updated":"2024-09-02T17:13:22Z","published":"2024-07-05T05:45:34Z","title":"NeuFair: Neural Network Fairness Repair with Dropout","summary":" This paper investigates neuron dropout as a post-processing bias mitigation\nfor deep neural networks (DNNs). Neural-driven software solutions are\nincreasingly applied in socially critical domains with significant fairness\nimplications. While neural networks are exceptionally good at finding\nstatistical patterns from data, they may encode and amplify existing biases\nfrom the historical data. Existing bias mitigation algorithms often require\nmodifying the input dataset or the learning algorithms. We posit that the\nprevalent dropout methods that prevent over-fitting during training by randomly\ndropping neurons may be an effective and less intrusive approach to improve the\nfairness of pre-trained DNNs. However, finding the ideal set of neurons to drop\nis a combinatorial problem. We propose NeuFair, a family of post-processing\nrandomized algorithms that mitigate unfairness in pre-trained DNNs via dropouts\nduring inference after training. Our randomized search is guided by an\nobjective to minimize discrimination while maintaining the model's utility. We\nshow that our design of randomized algorithms is effective and efficient in\nimproving fairness (up to 69%) with minimal or no model performance\ndegradation. We provide intuitive explanations of these phenomena and carefully\nexamine the influence of various hyperparameters of search algorithms on the\nresults. Finally, we empirically and conceptually compare NeuFair to different\nstate-of-the-art bias mitigators.\n","authors":["Vishnu Asutosh Dasu","Ashish Kumar","Saeid Tizpaz-Niari","Gang Tan"],"pdf_url":"https://arxiv.org/pdf/2407.04268v3.pdf","comment":"Paper accepted at ACM ISSTA 2024"},{"id":"http://arxiv.org/abs/2312.10108v2","updated":"2024-09-02T17:00:21Z","published":"2023-12-15T06:30:55Z","title":"Privacy-Aware Document Visual Question Answering","summary":" Document Visual Question Answering (DocVQA) has quickly grown into a central\ntask of document understanding. But despite the fact that documents contain\nsensitive or copyrighted information, none of the current DocVQA methods offers\nstrong privacy guarantees. In this work, we explore privacy in the domain of\nDocVQA for the first time, highlighting privacy issues in state of the art\nmulti-modal LLM models used for DocVQA, and explore possible solutions.\nSpecifically, we focus on invoice processing as a realistic document\nunderstanding scenario, and propose a large scale DocVQA dataset comprising\ninvoice documents and associated questions and answers. We employ a federated\nlearning scheme, that reflects the real-life distribution of documents in\ndifferent businesses, and we explore the use case where the data of the invoice\nprovider is the sensitive information to be protected. We demonstrate that\nnon-private models tend to memorise, a behaviour that can lead to exposing\nprivate information. 
We then evaluate baseline training schemes employing\nfederated learning and differential privacy in this multi-modal scenario, where\nthe sensitive information might be exposed through either or both of the two\ninput modalities: vision (document image) or language (OCR tokens). Finally, we\ndesign attacks exploiting the memorisation effect of the model, and demonstrate\ntheir effectiveness in probing a representative DocVQA model.\n","authors":["Rubèn Tito","Khanh Nguyen","Marlon Tobaben","Raouf Kerkouche","Mohamed Ali Souibgui","Kangsoo Jung","Joonas Jälkö","Vincent Poulain D'Andecy","Aurelie Joseph","Lei Kang","Ernest Valveny","Antti Honkela","Mario Fritz","Dimosthenis Karatzas"],"pdf_url":"https://arxiv.org/pdf/2312.10108v2.pdf","comment":"35 pages, 12 figures, accepted for publication at the 18th\n International Conference on Document Analysis and Recognition, ICDAR 2024"},{"id":"http://arxiv.org/abs/2408.13295v2","updated":"2024-09-02T17:00:05Z","published":"2024-08-23T14:47:10Z","title":"Exploring Bias and Prediction Metrics to Characterise the Fairness of\n Machine Learning for Equity-Centered Public Health Decision-Making: A\n Narrative Review","summary":" Background: The rapid advancement of Machine Learning (ML) represents novel\nopportunities to enhance public health research, surveillance, and\ndecision-making. However, there is a lack of comprehensive understanding of\nalgorithmic bias, i.e., systematic errors in predicted population health outcomes\nthat result from the public health application of ML. The objective of this\nnarrative review is to explore the types of bias generated by ML and\nquantitative metrics to assess these biases.\n Methods: We performed a search on PubMed, MEDLINE, IEEE (Institute of\nElectrical and Electronics Engineers), ACM (Association for Computing\nMachinery) Digital Library, Science Direct, and Springer Nature. We used\nkeywords to identify studies describing types of bias and metrics to measure\nthese in the domain of ML and public and population health published in English\nbetween 2008 and 2023, inclusive.\n Results: A total of 72 articles met the inclusion criteria. Our review\nidentified the commonly described types of bias and quantitative metrics to\nassess these biases from an equity perspective.\n Conclusion: The review will help formalize the evaluation framework for ML\nin public health from an equity perspective.\n","authors":["Shaina Raza","Arash Shaban-Nejad","Elham Dolatabadi","Hiroshi Mamiya"],"pdf_url":"https://arxiv.org/pdf/2408.13295v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2408.16154v2","updated":"2024-09-02T16:58:16Z","published":"2024-08-28T22:14:44Z","title":"Does Data-Efficient Generalization Exacerbate Bias in Foundation Models?","summary":" Foundation models have emerged as robust models with label efficiency in\ndiverse domains. In medical imaging, these models contribute to the advancement\nof medical diagnoses due to the difficulty in obtaining labeled data. However,\nit is unclear whether using a large amount of unlabeled data, biased by the\npresence of sensitive attributes during pre-training, influences the fairness\nof the model. This research examines the bias in the Foundation model\n(RetFound) when it is fine-tuned on the Brazilian Multilabel\nOphthalmological Dataset (BRSET), which has a different population than the\npre-training dataset. 
The model evaluation, in comparison with supervised\nlearning, shows that the Foundation Model has the potential to reduce the gap\nbetween the maximum AUC and minimum AUC evaluations across gender and age\ngroups. However, in a data-efficient generalization, the model increases the\nbias when the data amount decreases. These findings suggest that when deploying\na Foundation Model in real-life scenarios with limited data, the possibility of\nfairness issues should be considered.\n","authors":["Dilermando Queiroz","Anderson Carlos","Maíra Fatoretto","Luis Filipe Nakayama","André Anjos","Lilian Berton"],"pdf_url":"https://arxiv.org/pdf/2408.16154v2.pdf","comment":"Preprint of paper to be presented at Fairness and Ethics Towards\n Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during\n ECCV 2024"},{"id":"http://arxiv.org/abs/2408.09493v2","updated":"2024-09-02T16:19:25Z","published":"2024-08-18T14:16:55Z","title":"Ancestral Reinforcement Learning: Unifying Zeroth-Order Optimization and\n Genetic Algorithms for Reinforcement Learning","summary":" Reinforcement Learning (RL) offers a fundamental framework for discovering\noptimal action strategies through interactions within unknown environments.\nRecent advancements have shown that the performance and applicability of RL can\nbe significantly enhanced by exploiting a population of agents in various ways.\nZeroth-Order Optimization (ZOO) leverages an agent population to estimate the\ngradient of the objective function, enabling robust policy refinement even in\nnon-differentiable scenarios. As another application, Genetic Algorithms (GA)\nboost the exploration of policy landscapes by mutational generation of policy\ndiversity in an agent population and its refinement by selection. A natural\nquestion is whether we can have the best of both worlds that an agent\npopulation can offer. In this work, we propose Ancestral Reinforcement Learning\n(ARL), which synergistically combines the robust gradient estimation of ZOO\nwith the exploratory power of GA. The key idea in ARL is that each agent within\na population infers the gradient by exploiting the history of its ancestors, i.e.,\nthe ancestor population in the past, while maintaining the diversity of\npolicies in the current population as in GA. We also theoretically reveal that\nthe populational search in ARL implicitly induces the KL-regularization of the\nobjective function, resulting in enhanced exploration. Our results extend\nthe applicability of populational algorithms for RL.\n","authors":["So Nakashima","Tetsuya J. Kobayashi"],"pdf_url":"https://arxiv.org/pdf/2408.09493v2.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2303.17708v4","updated":"2024-09-02T15:23:52Z","published":"2023-03-30T21:00:38Z","title":"Analysis of Failures and Risks in Deep Learning Model Converters: A Case\n Study in the ONNX Ecosystem","summary":" Software engineers develop, fine-tune, and deploy deep learning (DL) models\nusing a variety of development frameworks and runtime environments. DL model\nconverters move models between frameworks and to runtime environments.\nConversion errors compromise model quality and disrupt deployment. However, the\nfailure characteristics of DL model converters are unknown, adding risk when\nusing DL interoperability technologies.\n This paper analyzes failures in DL model converters. 
We survey software\nengineers about DL interoperability tools, use cases, and pain points (N=92).\nThen, we characterize failures in model converters associated with the main\ninteroperability tool, ONNX (N=200 issues in PyTorch and TensorFlow). Finally,\nwe formulate and test two hypotheses about structural causes for the failures\nwe studied. We find that the node conversion stage of a model converter\naccounts for ~75% of the defects and 33% of reported failure are related to\nsemantically incorrect models. The cause of semantically incorrect models is\nelusive, but models with behaviour inconsistencies share operator sequences.\nOur results motivate future research on making DL interoperability software\nsimpler to maintain, extend, and validate. Research into behavioural tolerances\nand architectural coverage metrics could be fruitful.\n","authors":["Purvish Jajal","Wenxin Jiang","Arav Tewari","Erik Kocinare","Joseph Woo","Anusha Sarraf","Yung-Hsiang Lu","George K. Thiruvathukal","James C. Davis"],"pdf_url":"https://arxiv.org/pdf/2303.17708v4.pdf","comment":"[ISSTA'24] Proceedings of the 33rd ACM SIGSOFT International\n Symposium on Software Testing and Analysis (ISSTA) 2024"},{"id":"http://arxiv.org/abs/2405.15444v3","updated":"2024-09-02T15:09:05Z","published":"2024-05-24T11:20:41Z","title":"HyperInterval: Hypernetwork approach to training weight interval regions\n in continual learning","summary":" Recently, a new Continual Learning (CL) paradigm was presented to control\ncatastrophic forgetting, called Interval Continual Learning (InterContiNet),\nwhich relies on enforcing interval constraints on the neural network parameter\nspace. Unfortunately, InterContiNet training is challenging due to the high\ndimensionality of the weight space, making intervals difficult to manage. To\naddress this issue, we introduce \\our{} \\footnote{The source code is available\nat https://github.com/gmum/HyperInterval}, a technique that employs interval\narithmetic within the embedding space and utilizes a hypernetwork to map these\nintervals to the target network parameter space. We train interval embeddings\nfor consecutive tasks and train a hypernetwork to transform these embeddings\ninto weights of the target network. An embedding for a given task is trained\nalong with the hypernetwork, preserving the response of the target network for\nthe previous task embeddings. Interval arithmetic works with a more manageable,\nlower-dimensional embedding space rather than directly preparing intervals in a\nhigh-dimensional weight space. Our model allows faster and more efficient\ntraining. Furthermore, \\our{} maintains the guarantee of not forgetting. At the\nend of training, we can choose one universal embedding to produce a single\nnetwork dedicated to all tasks. In such a framework, hypernetwork is used only\nfor training and, finally, we can utilize one set of weights. 
\\our{} obtains\nsignificantly better results than InterContiNet and gives SOTA results on\nseveral benchmarks.\n","authors":["Patryk Krukowski","Anna Bielawska","Kamil Książek","Paweł Wawrzyński","Paweł Batorski","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2405.15444v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09146v5","updated":"2024-09-02T14:38:01Z","published":"2024-02-14T12:55:28Z","title":"ResQuNNs:Towards Enabling Deep Learning in Quantum Convolution Neural\n Networks","summary":" In this paper, we present a novel framework for enhancing the performance of\nQuanvolutional Neural Networks (QuNNs) by introducing trainable quanvolutional\nlayers and addressing the critical challenges associated with them. Traditional\nquanvolutional layers, although beneficial for feature extraction, have largely\nbeen static, offering limited adaptability. Unlike state-of-the-art, our\nresearch overcomes this limitation by enabling training within these layers,\nsignificantly increasing the flexibility and potential of QuNNs. However, the\nintroduction of multiple trainable quanvolutional layers induces complexities\nin gradient-based optimization, primarily due to the difficulty in accessing\ngradients across these layers. To resolve this, we propose a novel\narchitecture, Residual Quanvolutional Neural Networks (ResQuNNs), leveraging\nthe concept of residual learning, which facilitates the flow of gradients by\nadding skip connections between layers. By inserting residual blocks between\nquanvolutional layers, we ensure enhanced gradient access throughout the\nnetwork, leading to improved training performance. Moreover, we provide\nempirical evidence on the strategic placement of these residual blocks within\nQuNNs. Through extensive experimentation, we identify an efficient\nconfiguration of residual blocks, which enables gradients across all the layers\nin the network that eventually results in efficient training. Our findings\nsuggest that the precise location of residual blocks plays a crucial role in\nmaximizing the performance gains in QuNNs. Our results mark a substantial step\nforward in the evolution of quantum deep learning, offering new avenues for\nboth theoretical development and practical quantum computing applications.\n","authors":["Muhammad Kashif","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2402.09146v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04896v2","updated":"2024-09-02T13:55:25Z","published":"2024-06-07T12:43:17Z","title":"Stabilizing Extreme Q-learning by Maclaurin Expansion","summary":" In offline reinforcement learning, in-sample learning methods have been\nwidely used to prevent performance degradation caused by evaluating\nout-of-distribution actions from the dataset. Extreme Q-learning (XQL) employs\na loss function based on the assumption that Bellman error follows a Gumbel\ndistribution, enabling it to model the soft optimal value function in an\nin-sample manner. It has demonstrated strong performance in both offline and\nonline reinforcement learning settings. However, issues remain, such as the\ninstability caused by the exponential term in the loss function and the risk of\nthe error distribution deviating from the Gumbel distribution. Therefore, we\npropose Maclaurin Expanded Extreme Q-learning to enhance stability. In this\nmethod, applying Maclaurin expansion to the loss function in XQL enhances\nstability against large errors. 
This approach involves adjusting the modeled\nvalue function between the value function under the behavior policy and the\nsoft optimal value function, thus achieving a trade-off between stability and\noptimality depending on the order of expansion. It also enables adjustment of\nthe error distribution assumption from a normal distribution to a Gumbel\ndistribution. Our method significantly stabilizes learning in online RL tasks\nfrom DM Control, where XQL was previously unstable. Additionally, it improves\nperformance in several offline RL tasks from D4RL.\n","authors":["Motoki Omura","Takayuki Osa","Yusuke Mukuta","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2406.04896v2.pdf","comment":"Accepted at RLC 2024: The first Reinforcement Learning Conference"},{"id":"http://arxiv.org/abs/2408.16772v2","updated":"2024-09-02T13:19:40Z","published":"2024-08-14T17:19:56Z","title":"An Effective Information Theoretic Framework for Channel Pruning","summary":" Channel pruning is a promising method for accelerating and compressing\nconvolutional neural networks. However, current pruning algorithms leave two\nproblems unsolved: how to assign layer-wise pruning ratios properly and how to\ndiscard the least important channels with a convincing criterion. In this\npaper, we present a novel channel pruning approach via information theory and\ninterpretability of neural networks. Specifically, we regard information\nentropy as the expected amount of information for convolutional layers. In\naddition, if we view a matrix as a system of linear equations, a higher-rank\nmatrix indicates that more solutions exist for it, which implies more\nuncertainty. From the point of view of information theory, the rank can also\ndescribe the amount of information. In a neural network, considering the rank\nand entropy as two information indicators of convolutional layers, we propose a\nfusion function to reach a compromise between them, where the fusion results are\ndefined as ``information concentration''. When pre-defining layer-wise pruning\nratios, we employ the information concentration as a reference instead of\nheuristic and engineering tuning to provide a more interpretable solution.\nMoreover, we leverage Shapley values, which are a potent tool in the\ninterpretability of neural networks, to evaluate the channel contributions and\ndiscard the least important channels for model compression while maintaining\nits performance. Extensive experiments demonstrate the effectiveness and\npromising performance of our method. For example, our method improves the\naccuracy by 0.21% when reducing 45.5% FLOPs and removing 40.3% parameters for\nResNet-56 on CIFAR-10. Moreover, our method incurs a loss in Top-1/Top-5\naccuracies of 0.43%/0.11% by reducing 41.6% FLOPs and removing 35.0% parameters\nfor ResNet-50 on ImageNet.\n","authors":["Yihao Chen","Zefang Wang"],"pdf_url":"https://arxiv.org/pdf/2408.16772v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.14966v2","updated":"2024-09-02T12:55:04Z","published":"2024-04-23T12:20:27Z","title":"Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State\n Space Model","summary":" Existing Transformer-based models for point cloud analysis suffer from\nquadratic complexity, leading to compromised point cloud resolution and\ninformation loss. In contrast, the newly proposed Mamba model, based on state\nspace models (SSM), outperforms Transformer in multiple areas with only linear\ncomplexity. 
However, the straightforward adoption of Mamba does not achieve\nsatisfactory performance on point cloud tasks. In this work, we present\nMamba3D, a state space model tailored for point cloud learning to enhance local\nfeature extraction, achieving superior performance, high efficiency, and\nscalability potential. Specifically, we propose a simple yet effective Local\nNorm Pooling (LNP) block to extract local geometric features. Additionally, to\nobtain better global features, we introduce a bidirectional SSM (bi-SSM) with\nboth a token forward SSM and a novel backward SSM that operates on the feature\nchannel. Extensive experimental results show that Mamba3D surpasses\nTransformer-based counterparts and concurrent works in multiple tasks, with or\nwithout pre-training. Notably, Mamba3D achieves multiple SoTA, including an\noverall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1%\n(with single-modal pre-training) on the ModelNet40 classification task, with\nonly linear complexity. Our code and weights are available at\nhttps://github.com/xhanxu/Mamba3D.\n","authors":["Xu Han","Yuan Tang","Zhaoxuan Wang","Xianzhi Li"],"pdf_url":"https://arxiv.org/pdf/2404.14966v2.pdf","comment":"ACM MM 2024. Code and weights are available at\n https://github.com/xhanxu/Mamba3D"},{"id":"http://arxiv.org/abs/2208.14153v6","updated":"2024-09-02T12:44:58Z","published":"2022-08-30T11:12:59Z","title":"Identifying Weight-Variant Latent Causal Models","summary":" The task of causal representation learning aims to uncover latent\nhigher-level causal representations that affect lower-level observations.\nIdentifying true latent causal representations from observed data, while\nallowing instantaneous causal relations among latent variables, remains a\nchallenge, however. To this end, we start from the analysis of three intrinsic\nproperties in identifying latent space from observations: transitivity,\npermutation indeterminacy, and scaling indeterminacy. We find that transitivity\nacts as a key role in impeding the identifiability of latent causal\nrepresentations. To address the unidentifiable issue due to transitivity, we\nintroduce a novel identifiability condition where the underlying latent causal\nmodel satisfies a linear-Gaussian model, in which the causal coefficients and\nthe distribution of Gaussian noise are modulated by an additional observed\nvariable. Under some mild assumptions, we can show that the latent causal\nrepresentations can be identified up to trivial permutation and scaling.\nFurthermore, based on this theoretical result, we propose a novel method,\ntermed Structural caUsAl Variational autoEncoder, which directly learns latent\ncausal representations and causal relationships among them, together with the\nmapping from the latent causal variables to the observed ones. We show that the\nproposed method learns the true parameters asymptotically. 
Experimental results\non synthetic and real data demonstrate the identifiability and consistency\nresults and the efficacy of the proposed method in learning latent causal\nrepresentations.\n","authors":["Yuhang Liu","Zhen Zhang","Dong Gong","Mingming Gong","Biwei Huang","Anton van den Hengel","Kun Zhang","Javen Qinfeng Shi"],"pdf_url":"https://arxiv.org/pdf/2208.14153v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12801v2","updated":"2024-09-02T12:37:27Z","published":"2024-01-11T00:45:33Z","title":"Deep Learning-based Target-To-User Association in Integrated Sensing and\n Communication Systems","summary":" In Integrated Sensing and Communication (ISAC) systems, matching the radar\ntargets with communication user equipments (UEs) is functional to several\ncommunication tasks, such as proactive handover and beam prediction. In this\npaper, we consider a radar-assisted communication system where a base station\n(BS) is equipped with a multiple-input-multiple-output (MIMO) radar that has a\ndouble aim: (i) associate vehicular radar targets to vehicular equipments (VEs)\nin the communication beamspace and (ii) predict the beamforming vector for each\nVE from radar data. The proposed target-to-user (T2U) association consists of\ntwo stages. First, vehicular radar targets are detected from range-angle\nimages, and, for each, a beamforming vector is estimated. Then, the inferred\nper-target beamforming vectors are matched with the ones utilized at the BS for\ncommunication to perform target-to-user (T2U) association. Joint multi-target\ndetection and beam inference is obtained by modifying the you only look once\n(YOLO) model, which is trained over simulated range-angle radar images.\nSimulation results over different urban vehicular mobility scenarios show that\nthe proposed T2U method provides a probability of correct association that\nincreases with the size of the BS antenna array, highlighting the respective\nincrease of the separability of the VEs in the beamspace. Moreover, we show\nthat the modified YOLO architecture can effectively perform both beam\nprediction and radar target detection, with similar performance in mean average\nprecision on the latter over different antenna array sizes.\n","authors":["Lorenzo Cazzella","Marouan Mizmizi","Dario Tagliaferri","Damiano Badini","Matteo Matteucci","Umberto Spagnolini"],"pdf_url":"https://arxiv.org/pdf/2401.12801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00040v2","updated":"2024-09-02T12:00:23Z","published":"2024-07-31T14:06:18Z","title":"Barlow Twins Deep Neural Network for Advanced 1D Drug-Target Interaction\n Prediction","summary":" Accurate prediction of drug-target interactions is critical for advancing\ndrug discovery. By reducing time and cost, machine learning and deep learning\ncan accelerate this laborious discovery process. In a novel approach,\nBarlowDTI, we utilise the powerful Barlow Twins architecture for\nfeature-extraction while considering the structure of the target protein. Our\nmethod achieves state-of-the-art predictive performance against multiple\nestablished benchmarks using only one-dimensional input. The use of gradient\nboosting machine as the underlying predictor ensures fast and efficient\npredictions without the need for substantial computational resources. We also\ninvestigate how the model reaches its decision based on individual training\nsamples. 
By comparing co-crystal structures, we find that BarlowDTI effectively\nexploits catalytically active and stabilising residues, highlighting the\nmodel's ability to generalise from one-dimensional input data. In addition, we\nfurther benchmark new baselines against existing methods. Together, these\ninnovations improve the efficiency and effectiveness of drug-target interaction\npredictions, providing robust tools for accelerating drug development and\ndeepening the understanding of molecular interactions. Therefore, we provide an\neasy-to-use web interface that can be freely accessed at\nhttps://www.bio.nat.tum.de/oc2/barlowdti .\n","authors":["Maximilian G. Schuh","Davide Boldini","Annkathrin I. Bohne","Stephan A. Sieber"],"pdf_url":"https://arxiv.org/pdf/2408.00040v2.pdf","comment":"Refined model architecture, additional results added"},{"id":"http://arxiv.org/abs/2404.12979v2","updated":"2024-09-02T11:52:47Z","published":"2024-04-19T16:09:17Z","title":"TRNet: Two-level Refinement Network leveraging Speech Enhancement for\n Noise Robust Speech Emotion Recognition","summary":" One persistent challenge in Speech Emotion Recognition (SER) is the\nubiquitous environmental noise, which frequently results in deteriorating SER\nperformance in practice. In this paper, we introduce a Two-level Refinement\nNetwork, dubbed TRNet, to address this challenge. Specifically, a pre-trained\nspeech enhancement module is employed for front-end noise reduction and noise\nlevel estimation. Later, we utilize clean speech spectrograms and their\ncorresponding deep representations as reference signals to refine the\nspectrogram distortion and representation shift of enhanced speech during model\ntraining. Experimental results validate that the proposed TRNet substantially\npromotes the robustness of the proposed system in both matched and unmatched\nnoisy environments, without compromising its performance in noise-free\nenvironments.\n","authors":["Chengxin Chen","Pengyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12979v2.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2306.01001v5","updated":"2024-09-02T11:31:16Z","published":"2023-05-31T05:04:50Z","title":"DiffLoad: Uncertainty Quantification in Electrical Load Forecasting with\n the Diffusion Model","summary":" Electrical load forecasting plays a crucial role in decision-making for power\nsystems, including unit commitment and economic dispatch. The integration of\nrenewable energy sources and the occurrence of external events, such as the\nCOVID-19 pandemic, have rapidly increased uncertainties in load forecasting.\nThe uncertainties in load forecasting can be divided into two types: epistemic\nuncertainty and aleatoric uncertainty. Separating these types of uncertainties\ncan help decision-makers better understand where and to what extent the\nuncertainty is, thereby enhancing their confidence in the following\ndecision-making. This paper proposes a diffusion-based Seq2Seq structure to\nestimate epistemic uncertainty and employs the robust additive Cauchy\ndistribution to estimate aleatoric uncertainty. Our method not only ensures the\naccuracy of load forecasting but also demonstrates the ability to separate the\ntwo types of uncertainties and be applicable to different levels of loads. 
The\nrelevant code can be found at\n\\url{https://anonymous.4open.science/r/DiffLoad-4714/}.\n","authors":["Zhixian Wang","Qingsong Wen","Chaoli Zhang","Liang Sun","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2306.01001v5.pdf","comment":"Accepted by IEEE Transactions on Power Systems, 2024"},{"id":"http://arxiv.org/abs/2408.16286v2","updated":"2024-09-02T10:56:20Z","published":"2024-08-29T06:37:16Z","title":"Near-Optimal Policy Identification in Robust Constrained Markov Decision\n Processes via Epigraph Form","summary":" Designing a safe policy for uncertain environments is crucial in real-world\ncontrol applications. However, this challenge remains inadequately addressed\nwithin the Markov decision process (MDP) framework. This paper presents the\nfirst algorithm capable of identifying a near-optimal policy in a robust\nconstrained MDP (RCMDP), where an optimal policy minimizes cumulative cost\nwhile satisfying constraints in the worst-case scenario across a set of\nenvironments. We first prove that the conventional Lagrangian max-min\nformulation with policy gradient methods can become trapped in suboptimal\nsolutions by encountering a sum of conflicting gradients from the objective and\nconstraint functions during its inner minimization problem. To address this, we\nleverage the epigraph form of the RCMDP problem, which resolves the conflict by\nselecting a single gradient from either the objective or the constraints.\nBuilding on the epigraph form, we propose a binary search algorithm with a\npolicy gradient subroutine and prove that it identifies an\n$\\varepsilon$-optimal policy in an RCMDP with\n$\\tilde{\\mathcal{O}}(\\varepsilon^{-4})$ policy evaluations.\n","authors":["Toshinori Kitamura","Tadashi Kozuno","Wataru Kumagai","Kenta Hoshino","Yohei Hosoe","Kazumi Kasaura","Masashi Hamaya","Paavo Parmas","Yutaka Matsuo"],"pdf_url":"https://arxiv.org/pdf/2408.16286v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00760v2","updated":"2024-09-02T10:55:37Z","published":"2024-02-01T16:50:41Z","title":"EuroPED-NN: Uncertainty aware surrogate model","summary":" This work successfully generates an uncertainty-aware surrogate model of the\nEuroPED plasma pedestal model using the Bayesian neural network with noise\ncontrastive prior (BNN-NCP) technique. This model is trained using data from\nthe JET-ILW pedestal database and subsequent model evaluations, conforming to\nEuroPED-NN. The BNN-NCP technique has been proven to be a suitable method for\ngenerating uncertainty-aware surrogate models. It matches the output results of\na regular neural network while providing confidence estimates for predictions\nas uncertainties. Additionally, it highlights out-of-distribution (OOD) regions\nusing surrogate model uncertainties. This provides critical insights into model\nrobustness and reliability. EuroPED-NN has been physically validated, first,\nanalyzing electron density $n_e\\!\\left(\\psi_{\\text{pol}}=0.94\\right)$ with\nrespect to increasing plasma current, $I_p$, and second, validating the\n$\\Delta-\\beta_{p,ped}$ relation associated with the EuroPED model. This affirms\nthe robustness of the underlying physics learned by the surrogate model. On top\nof that, the method was used to develop a EuroPED-like model fed with\nexperimental data, i.e. an uncertainty aware experimental model, which is\nfunctional in JET database. Both models have been also tested in $\\sim 50$ AUG\nshots.\n","authors":["A. Panera Alvarez","A. Ho","A. Jarvinen","S. Saarelma","S. 
Wiesen","JET Contributors","the AUG team"],"pdf_url":"https://arxiv.org/pdf/2402.00760v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15398v4","updated":"2024-09-02T10:35:42Z","published":"2023-07-28T08:48:32Z","title":"The Initial Screening Order Problem","summary":" We investigate the role of the initial screening order (ISO) in candidate\nscreening tasks, such as employee hiring and academic admissions, in which a\nscreener is tasked with selecting $k$ candidates from a candidate pool. The ISO\nrefers to the order in which the screener searches the candidate pool. Today,\nit is common for the ISO to be the product of an information access system,\nsuch as an online platform or a database query. The ISO has been largely\noverlooked in the literature, despite its potential impact on the optimality\nand fairness of the chosen $k$ candidates, especially under a human screener.\nWe define two problem formulations describing the search behavior of the\nscreener under the ISO: the best-$k$, where the screener selects the $k$ best\ncandidates; and the good-$k$, where the screener selects the $k$ first\ngood-enough candidates. To study the impact of the ISO, we introduce a\nhuman-like screener and compare it to its algorithmic counterpart, where the\nhuman-like screener is conceived to be inconsistent over time due to fatigue.\nIn particular, our analysis shows that the ISO, under a human-like screener\nsolving for the good-$k$ problem, hinders individual fairness despite meeting\ngroup level fairness, and hampers the optimality of the selected $k$\ncandidates. This is due to position bias, where a candidate's evaluation is\naffected by its position within the ISO. We report extensive simulated\nexperiments exploring the parameters of the best-$k$ and good-$k$ problems for\nthe algorithmic and human-like screeners. The simulation framework is flexible\nenough to account for multiple screening settings, being an alternative to\nrunning real-world candidate screening procedures. This work is motivated by a\nreal-world candidate screening problem studied in collaboration with an\nEuropean company.\n","authors":["Jose M. Alvarez","Antonio Mastropietro","Salvatore Ruggieri"],"pdf_url":"https://arxiv.org/pdf/2307.15398v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16845v2","updated":"2024-09-02T10:33:48Z","published":"2024-08-29T18:21:50Z","title":"Enabling Local Editing in Diffusion Models by Joint and Individual\n Component Analysis","summary":" Recent advances in Diffusion Models (DMs) have led to significant progress in\nvisual synthesis and editing tasks, establishing them as a strong competitor to\nGenerative Adversarial Networks (GANs). However, the latent space of DMs is not\nas well understood as that of GANs. Recent research has focused on unsupervised\nsemantic discovery in the latent space of DMs by leveraging the bottleneck\nlayer of the denoising network, which has been shown to exhibit properties of a\nsemantic latent space. However, these approaches are limited to discovering\nglobal attributes. In this paper we address, the challenge of local image\nmanipulation in DMs and introduce an unsupervised method to factorize the\nlatent semantics learned by the denoising network of pre-trained DMs. Given an\narbitrary image and defined regions of interest, we utilize the Jacobian of the\ndenoising network to establish a relation between the regions of interest and\ntheir corresponding subspaces in the latent space. 
Furthermore, we disentangle\nthe joint and individual components of these subspaces to identify latent\ndirections that enable local image manipulation. Once discovered, these\ndirections can be applied to different images to produce semantically\nconsistent edits, making our method suitable for practical applications.\nExperimental results on various datasets demonstrate that our method can\nproduce semantic edits that are more localized and have better fidelity\ncompared to the state-of-the-art.\n","authors":["Theodoros Kouzelis","Manos Plitsis","Mihalis A. Nicolaou","Yannis Panagakis"],"pdf_url":"https://arxiv.org/pdf/2408.16845v2.pdf","comment":"Accepted at BMVC2024"},{"id":"http://arxiv.org/abs/2307.15438v3","updated":"2024-09-02T10:23:41Z","published":"2023-07-28T09:40:19Z","title":"Autonomous Payload Thermal Control","summary":" In small satellites there is less room for heat control equipment, scientific\ninstruments, and electronic components. Furthermore, the near proximity of\nelectronic components makes power dissipation difficult, with the risk of not\nbeing able to control the temperature appropriately, reducing component\nlifetime and mission performance. To address this challenge, taking advantage\nof the advent of increasing intelligence on board satellites, an autonomous\nthermal control tool that uses deep reinforcement learning is proposed for\nlearning the thermal control policy onboard. The tool was evaluated in a real\nspace edge processing computer that will be used in a demonstration payload\nhosted in the International Space Station (ISS). The experiment results show\nthat the proposed framework is able to learn to control the payload processing\npower to maintain the temperature under operational ranges, complementing\ntraditional thermal control systems.\n","authors":["Alejandro D. Mousist"],"pdf_url":"https://arxiv.org/pdf/2307.15438v3.pdf","comment":"To be included in the proceedings of ESA's SPAICE conference at\n ECSAT, UK, 2024"},{"id":"http://arxiv.org/abs/2306.16838v6","updated":"2024-09-02T09:54:53Z","published":"2023-06-29T10:29:29Z","title":"Fast Robust Kernel Regression through Sign Gradient Descent with Early\n Stopping","summary":" Kernel ridge regression, KRR, is a generalization of linear ridge regression\nthat is non-linear in the data, but linear in the model parameters. Here, we\nintroduce an equivalent formulation of the objective function of KRR, which\nopens up both for replacing the ridge penalty with the $\\ell_\\infty$ and\n$\\ell_1$ penalties and for studying kernel ridge regression from the\nperspective of gradient descent.\n Using the $\\ell_\\infty$ and $\\ell_1$ penalties, we obtain robust and sparse\nkernel regression, respectively. We further study the similarities between\nexplicitly regularized kernel regression and the solutions obtained by early\nstopping of iterative gradient-based methods, where we connect $\\ell_\\infty$\nregularization to sign gradient descent, $\\ell_1$ regularization to forward\nstagewise regression (also known as coordinate descent), and $\\ell_2$\nregularization to gradient descent, and, in the last case, theoretically bound\nfor the differences. 
We exploit the close relations between $\\ell_\\infty$\nregularization and sign gradient descent, and between $\\ell_1$ regularization\nand coordinate descent to propose computationally efficient methods for robust\nand sparse kernel regression.\n We finally compare robust kernel regression through sign gradient descent to\nexisting methods for robust kernel regression on five real data sets,\ndemonstrating that our method is one to two orders of magnitude faster, without\ncompromising accuracy.\n","authors":["Oskar Allerbo"],"pdf_url":"https://arxiv.org/pdf/2306.16838v6.pdf","comment":"Article arXiv:2306.16838v1 has been updated and split into two\n articles: this article and arXiv:2311.01762. Thus, some of the content in\n arXiv:2306.16838v1 is not a part of arXiv:2306.16838v2, but of\n arXiv:2311.01762"},{"id":"http://arxiv.org/abs/2407.11876v2","updated":"2024-09-02T09:49:49Z","published":"2024-07-16T16:00:42Z","title":"Simplifying the Theory on Over-Smoothing","summary":" Graph convolutions have gained popularity due to their ability to efficiently\noperate on data with an irregular geometric structure. However, graph\nconvolutions cause over-smoothing, which refers to representations becoming\nmore similar with increased depth. However, many different definitions and\nintuitions currently coexist, leading to research efforts focusing on\nincompatible directions. This paper attempts to align these directions by\nshowing that over-smoothing is merely a special case of power iteration. This\ngreatly simplifies the existing theory on over-smoothing, making it more\naccessible. Based on the theory, we provide a novel comprehensive definition of\nrank collapse as a generalized form of over-smoothing and introduce the\nrank-one distance as a corresponding metric. Our empirical evaluation of 14\ncommonly used methods shows that more models than were previously known suffer\nfrom this issue.\n","authors":["Andreas Roth"],"pdf_url":"https://arxiv.org/pdf/2407.11876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07362v2","updated":"2024-09-02T08:28:44Z","published":"2024-08-14T08:19:23Z","title":"BadMerging: Backdoor Attacks Against Model Merging","summary":" Fine-tuning pre-trained models for downstream tasks has led to a\nproliferation of open-sourced task-specific models. Recently, Model Merging\n(MM) has emerged as an effective approach to facilitate knowledge transfer\namong these independently fine-tuned models. MM directly combines multiple\nfine-tuned task-specific models into a merged model without additional\ntraining, and the resulting model shows enhanced capabilities in multiple\ntasks. Although MM provides great utility, it may come with security risks\nbecause an adversary can exploit MM to affect multiple downstream tasks.\nHowever, the security risks of MM have barely been studied. In this paper, we\nfirst find that MM, as a new learning paradigm, introduces unique challenges\nfor existing backdoor attacks due to the merging process. To address these\nchallenges, we introduce BadMerging, the first backdoor attack specifically\ndesigned for MM. Notably, BadMerging allows an adversary to compromise the\nentire merged model by contributing as few as one backdoored task-specific\nmodel. BadMerging comprises a two-stage attack mechanism and a novel\nfeature-interpolation-based loss to enhance the robustness of embedded\nbackdoors against the changes of different merging parameters. 
Considering that\na merged model may incorporate tasks from different domains, BadMerging can\njointly compromise the tasks provided by the adversary (on-task attack) and\nother contributors (off-task attack) and solve the corresponding unique\nchallenges with novel attack designs. Extensive experiments show that\nBadMerging achieves remarkable attacks against various MM algorithms. Our\nablation study demonstrates that the proposed attack designs can progressively\ncontribute to the attack performance. Finally, we show that prior defense\nmechanisms fail to defend against our attacks, highlighting the need for more\nadvanced defenses.\n","authors":["Jinghuai Zhang","Jianfeng Chi","Zheng Li","Kunlin Cai","Yang Zhang","Yuan Tian"],"pdf_url":"https://arxiv.org/pdf/2408.07362v2.pdf","comment":"To appear in ACM Conference on Computer and Communications Security\n (CCS), 2024"},{"id":"http://arxiv.org/abs/2407.16237v2","updated":"2024-09-02T07:25:21Z","published":"2024-07-23T07:22:25Z","title":"OriGen:Enhancing RTL Code Generation with Code-to-Code Augmentation and\n Self-Reflection","summary":" Recent studies have demonstrated the significant potential of Large Language\nModels (LLMs) in generating Register Transfer Level (RTL) code, with notable\nadvancements showcased by commercial models such as GPT-4 and Claude3-Opus.\nHowever, these proprietary LLMs often raise concerns regarding privacy and\nsecurity. While open-source LLMs offer solutions to these concerns, they\ntypically underperform commercial models in RTL code generation tasks,\nprimarily due to the scarcity of high-quality open-source RTL datasets. To\naddress this challenge, we introduce OriGen, a fully open-source framework\nthat incorporates self-reflection capabilities and a novel dataset augmentation\nmethodology for generating high-quality, large-scale RTL code. Our approach\nemploys a code-to-code augmentation technique to enhance the quality of\nopen-source RTL code datasets. Furthermore, OriGen can rectify syntactic errors\nthrough a self-reflection process that leverages compiler feedback.\nExperimental results demonstrate that OriGen significantly outperforms other\nopen-source alternatives in RTL code generation. It surpasses the previous\nbest-performing open-source LLM by 12.8% and even exceeds GPT-4 Turbo in the\npass@1 metric on the VerilogEval-Human benchmark. Moreover, OriGen exhibits\nsuperior capabilities in self-reflection and error correction, outperforming\nGPT-4 by 19.9% on a benchmark designed to evaluate self-reflection\ncapabilities.\n","authors":["Fan Cui","Chenyang Yin","Kexing Zhou","Youwei Xiao","Guangyu Sun","Qiang Xu","Qipeng Guo","Demin Song","Dahua Lin","Xingcheng Zhang"," Yun"," Liang"],"pdf_url":"https://arxiv.org/pdf/2407.16237v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15434v3","updated":"2024-09-02T07:18:16Z","published":"2024-05-24T11:02:55Z","title":"Biometrics and Behavior Analysis for Detecting Distractions in\n e-Learning","summary":" In this article, we explore computer vision approaches to detect abnormal\nhead pose during e-learning sessions and we introduce a study on the effects of\nmobile phone usage during these sessions. We utilize behavioral data collected\nfrom 120 learners monitored while participating in MOOC learning sessions.\nOur study focuses on the influence of phone-usage events on behavior and\nphysiological responses, specifically attention, heart rate, and meditation,\nbefore, during, and after phone usage. 
Additionally, we propose an approach for\nestimating head pose events using images taken by the webcam during the MOOC\nlearning sessions to detect phone-usage events. Our hypothesis suggests that\nhead posture undergoes significant changes when learners interact with a mobile\nphone, contrasting with the typical behavior seen when learners face a computer\nduring e-learning sessions. We propose an approach designed to detect\ndeviations in head posture from the average observed during a learner's\nsession, operating as a semi-supervised method. This system flags events\nindicating alterations in head posture for subsequent human review and\nselection of mobile phone usage occurrences with a sensitivity over 90%.\n","authors":["Álvaro Becerra","Javier Irigoyen","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez","Mutlu Cukurova"],"pdf_url":"https://arxiv.org/pdf/2405.15434v3.pdf","comment":"Published in IEEE Intl. Symposium on Computers in Education (SIIE)\n 2024"},{"id":"http://arxiv.org/abs/2405.20091v4","updated":"2024-09-02T07:15:02Z","published":"2024-05-30T14:27:40Z","title":"VAAD: Visual Attention Analysis Dashboard applied to e-Learning","summary":" In this paper, we present an approach in the Multimodal Learning Analytics\nfield. Within this approach, we have developed a tool to visualize and analyze\neye movement data collected during learning sessions in online courses. The\ntool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These\neye movement data have been gathered using an eye-tracker and subsequently\nprocessed and visualized for interpretation. The purpose of the tool is to\nconduct a descriptive analysis of the data by facilitating its visualization,\nenabling the identification of differences and learning patterns among various\nlearner populations. Additionally, it integrates a predictive module capable of\nanticipating learner activities during a learning session. Consequently, VAAD\nholds the potential to offer valuable insights into online learning behaviors\nfrom both descriptive and predictive perspectives.\n","authors":["Miriam Navarro","Álvaro Becerra","Roberto Daza","Ruth Cobos","Aythami Morales","Julian Fierrez"],"pdf_url":"https://arxiv.org/pdf/2405.20091v4.pdf","comment":"Published in IEEE Intl. Symposium on Computers in Education (SIIE)\n 2024"},{"id":"http://arxiv.org/abs/2208.10230v4","updated":"2024-09-02T07:10:37Z","published":"2022-08-19T14:55:12Z","title":"From Static to Dynamic Structures: Improving Binding Affinity Prediction\n with Graph-Based Deep Learning","summary":" Accurate prediction of protein-ligand binding affinities is an essential\nchallenge in structure-based drug design. Despite recent advances in\ndata-driven methods for affinity prediction, their accuracy is still limited,\npartially because they only take advantage of static crystal structures while\nthe actual binding affinities are generally determined by the thermodynamic\nensembles between proteins and ligands. One effective way to approximate such a\nthermodynamic ensemble is to use molecular dynamics (MD) simulation. Here, an\nMD dataset containing 3,218 different protein-ligand complexes is curated, and\nDynaformer, a graph-based deep learning model is further developed to predict\nthe binding affinities by learning the geometric characteristics of the\nprotein-ligand interactions from the MD trajectories. 
In silico experiments\ndemonstrated that the model exhibits state-of-the-art scoring and ranking power\non the CASF-2016 benchmark dataset, outperforming the methods hitherto\nreported. Moreover, in a virtual screening on heat shock protein 90 (HSP90)\nusing Dynaformer, 20 candidates are identified and their binding affinities are\nfurther experimentally validated. Dynaformer displayed promising results in\nvirtual drug screening, revealing 12 hit compounds (two are in the\nsubmicromolar range), including several novel scaffolds. Overall, these results\ndemonstrated that the approach offer a promising avenue for accelerating the\nearly drug discovery process.\n","authors":["Yaosen Min","Ye Wei","Peizhuo Wang","Xiaoting Wang","Han Li","Nian Wu","Stefan Bauer","Shuxin Zheng","Yu Shi","Yingheng Wang","Ji Wu","Dan Zhao","Jianyang Zeng"],"pdf_url":"https://arxiv.org/pdf/2208.10230v4.pdf","comment":"Update the content according to the published version on Advanced\n Science (https://doi.org/10.1002/advs.202405404)"},{"id":"http://arxiv.org/abs/2407.00710v2","updated":"2024-09-02T07:01:31Z","published":"2024-06-30T14:21:32Z","title":"Directly Handling Missing Data in Linear Discriminant Analysis for\n Enhancing Classification Accuracy and Interpretability","summary":" As the adoption of Artificial Intelligence (AI) models expands into critical\nreal-world applications, ensuring the explainability of these models becomes\nparamount, particularly in sensitive fields such as medicine and finance.\nLinear Discriminant Analysis (LDA) remains a popular choice for classification\ndue to its interpretable nature, derived from its capacity to model class\ndistributions and enhance class separation through linear combinations of\nfeatures. However, real-world datasets often suffer from incomplete data,\nposing substantial challenges for both classification accuracy and model\ninterpretability. In this paper, we introduce a novel and robust classification\nmethod, termed Weighted missing Linear Discriminant Analysis (WLDA), which\nextends LDA to handle datasets with missing values without the need for\nimputation. Our approach innovatively incorporates a weight matrix that\npenalizes missing entries, thereby refining parameter estimation directly on\nincomplete data. This methodology not only preserves the interpretability of\nLDA but also significantly enhances classification performance in scenarios\nplagued by missing data. We conduct an in-depth theoretical analysis to\nestablish the properties of WLDA and thoroughly evaluate its explainability.\nExperimental results across various datasets demonstrate that WLDA consistently\noutperforms traditional methods, especially in challenging environments where\nmissing values are prevalent in both training and test datasets. This\nadvancement provides a critical tool for improving classification accuracy and\nmaintaining model transparency in the face of incomplete data.\n","authors":["Tuan L. Vo","Uyen Dang","Thu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2407.00710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03963v3","updated":"2024-09-02T06:51:36Z","published":"2024-05-07T02:49:59Z","title":"ERATTA: Extreme RAG for Table To Answers with Large Language Models","summary":" Large language models (LLMs) with retrieval augmented-generation (RAG) have\nbeen the optimal choice for scalable generative AI solutions in the recent\npast. 
Although RAG implemented with AI agents (agentic-RAG) has been recently\npopularized, it suffers from unstable costs and unreliable performance for\nenterprise-level data practices. Most existing use-cases that incorporate RAG\nwith LLMs have been either generic or extremely domain specific, thereby\nquestioning the scalability and generalizability of RAG-LLM approaches. In this\nwork, we propose a unique LLM-based system where multiple LLMs can be invoked\nto enable data authentication, user-query routing, data-retrieval and custom\nprompting for question-answering capabilities from enterprise data tables. The\nsource tables here are highly fluctuating and large in size, and the proposed\nframework enables structured responses in under 10 seconds per query.\nAdditionally, we propose a five-metric scoring module that detects and reports\nhallucinations in the LLM responses. Our proposed system and scoring metrics\nachieve >90% confidence scores across hundreds of user queries in the\nsustainability, financial health and social media domains. Extensions to the\nproposed extreme RAG architectures can enable heterogeneous source querying\nusing LLMs.\n","authors":["Sohini Roychowdhury","Marko Krema","Anvar Mahammad","Brian Moore","Arijit Mukherjee","Punit Prakashchandra"],"pdf_url":"https://arxiv.org/pdf/2405.03963v3.pdf","comment":"5 pages, 4 tables, IEEE Big Data, 2024"},{"id":"http://arxiv.org/abs/2408.17011v2","updated":"2024-09-02T06:31:48Z","published":"2024-08-30T04:51:19Z","title":"Disease Classification and Impact of Pretrained Deep Convolution Neural\n Networks on Diverse Medical Imaging Datasets across Imaging Modalities","summary":" Imaging techniques such as chest X-rays, whole slide images, and optical\ncoherence tomography serve as the initial screening and detection for a wide\nvariety of medical pulmonary and ophthalmic conditions, respectively. This paper\ninvestigates the intricacies of using pretrained deep convolutional neural\nnetworks with transfer learning across diverse medical imaging datasets with\nvarying modalities for binary and multiclass classification. We conducted a\ncomprehensive performance analysis with ten network architectures and model\nfamilies, each with pretraining and random initialization. Our findings showed\nthat the use of pretrained models as fixed feature extractors yields poor\nperformance irrespective of the datasets. In contrast, histopathology microscopy\nwhole slide images have better performance. It is also found that deeper and\nmore complex architectures did not necessarily result in the best performance.\nThis observation implies that improvements on ImageNet do not translate directly to\nmedical imaging tasks. Within a medical domain, the performance of the\nnetwork architectures varies within model families with shifts in datasets.\nThis indicates that the performance of models within a specific modality may\nnot be conclusive for another modality within the same domain. 
This study\nprovides a deeper understanding of the applications of deep learning techniques\nin medical imaging and highlights the impact of pretrained networks across\ndifferent medical imaging datasets under five different experimental settings.\n","authors":["Jutika Borah","Kumaresh Sarmah","Hidam Kumarjit Singh"],"pdf_url":"https://arxiv.org/pdf/2408.17011v2.pdf","comment":"15 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2405.07510v5","updated":"2024-09-02T06:27:05Z","published":"2024-05-13T07:10:53Z","title":"PeRFlow: Piecewise Rectified Flow as Universal Plug-and-Play Accelerator","summary":" We present Piecewise Rectified Flow (PeRFlow), a flow-based method for\naccelerating diffusion models. PeRFlow divides the sampling process of\ngenerative flows into several time windows and straightens the trajectories in\neach interval via the reflow operation, thereby approaching piecewise linear\nflows. PeRFlow achieves superior performance in a few-step generation.\nMoreover, through dedicated parameterizations, the PeRFlow models inherit\nknowledge from the pretrained diffusion models. Thus, the training converges\nfast and the obtained models show advantageous transfer ability, serving as\nuniversal plug-and-play accelerators that are compatible with various workflows\nbased on the pre-trained diffusion models. Codes for training and inference are\npublicly released. https://github.com/magic-research/piecewise-rectified-flow\n","authors":["Hanshu Yan","Xingchao Liu","Jiachun Pan","Jun Hao Liew","Qiang Liu","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2405.07510v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17064v2","updated":"2024-09-02T06:25:09Z","published":"2024-08-30T07:49:35Z","title":"Instant Adversarial Purification with Adversarial Consistency\n Distillation","summary":" Neural networks, despite their remarkable performance in widespread\napplications, including image classification, are also known to be vulnerable\nto subtle adversarial noise. Although some diffusion-based purification methods\nhave been proposed, for example, DiffPure, those methods are time-consuming. In\nthis paper, we propose One Step Control Purification (OSCP), a diffusion-based\npurification model that can purify the adversarial image in one Neural Function\nEvaluation (NFE) in diffusion models. We use Latent Consistency Model (LCM) and\nControlNet for our one-step purification. OSCP is computationally friendly and\ntime efficient compared to other diffusion-based purification methods; we\nachieve defense success rate of 74.19\\% on ImageNet, only requiring 0.1s for\neach purification. Moreover, there is a fundamental incongruence between\nconsistency distillation and adversarial perturbation. To address this\nontological dissonance, we propose Gaussian Adversarial Noise Distillation\n(GAND), a novel consistency distillation framework that facilitates a more\nnuanced reconciliation of the latent space dynamics, effectively bridging the\nnatural and adversarial manifolds. 
Our experiments show that GAND does not\nneed a full fine-tune (FFT); PEFT, e.g., LoRA, is sufficient.\n","authors":["Chun Tong Lei","Hon Ming Yam","Zhongliang Guo","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.17064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14033v2","updated":"2024-09-02T05:55:06Z","published":"2024-08-26T05:55:48Z","title":"MLR-Copilot: Autonomous Machine Learning Research based on Large\n Language Models Agents","summary":" Machine learning research, crucial for technological advancements and\ninnovation, often faces significant challenges due to its inherent complexity,\nslow pace of experimentation, and the necessity for specialized expertise.\nMotivated by this, we present a new systematic framework, autonomous Machine\nLearning Research with large language models (MLR-Copilot), designed to enhance\nmachine learning research productivity through the automatic generation and\nimplementation of research ideas using Large Language Model (LLM) agents. The\nframework consists of three phases: research idea generation, experiment\nimplementation, and implementation execution. First, existing research papers\nare used to generate hypotheses and experimental plans via IdeaAgent powered by\nLLMs. Next, the implementation generation phase translates these plans into\nexecutables with ExperimentAgent. This phase leverages retrieved prototype code\nand optionally retrieves candidate models and data. Finally, the execution\nphase, also managed by ExperimentAgent, involves running experiments with\nmechanisms for human feedback and iterative debugging to enhance the likelihood\nof achieving executable research outcomes. We evaluate our framework on five\nmachine learning research tasks and the experimental results show the\nframework's potential to facilitate research progress and innovation.\n","authors":["Ruochen Li","Teerth Patel","Qingyun Wang","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2408.14033v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10901v2","updated":"2024-09-02T05:25:06Z","published":"2024-08-20T14:43:53Z","title":"A Grey-box Attack against Latent Diffusion Model-based Image Editing by\n Posterior Collapse","summary":" Recent advancements in generative AI, particularly Latent Diffusion Models\n(LDMs), have revolutionized image synthesis and manipulation. However, these\ngenerative techniques raise concerns about data misappropriation and\nintellectual property infringement. Adversarial attacks on machine learning\nmodels have been extensively studied, and a well-established body of research\nhas extended these techniques as a benign metric to prevent the underlying\nmisuse of generative AI. Current approaches to safeguarding images from\nmanipulation by LDMs are limited by their reliance on model-specific knowledge\nand their inability to significantly degrade the semantic quality of generated\nimages. In response to these shortcomings, we propose the Posterior Collapse\nAttack (PCA) based on the observation that VAEs suffer from posterior collapse\nduring training. Our method minimizes dependence on the white-box information\nof target models to avoid the implicit reliance on model-specific\nknowledge. 
By accessing merely a small amount of LDM parameters, in specific\nmerely the VAE encoder of LDMs, our method causes a substantial semantic\ncollapse in generation quality, particularly in perceptual consistency, and\ndemonstrates strong transferability across various model architectures.\nExperimental results show that PCA achieves superior perturbation effects on\nimage generation of LDMs with lower runtime and VRAM. Our method outperforms\nexisting techniques, offering a more robust and generalizable solution that is\nhelpful in alleviating the socio-technical challenges posed by the rapidly\nevolving landscape of generative AI.\n","authors":["Zhongliang Guo","Lei Fang","Jingyu Lin","Yifei Qian","Shuai Zhao","Zeyu Wang","Junhao Dong","Cunjian Chen","Ognjen Arandjelović","Chun Pong Lau"],"pdf_url":"https://arxiv.org/pdf/2408.10901v2.pdf","comment":"21 pages, 7 figures, 10 tables"},{"id":"http://arxiv.org/abs/2312.03814v2","updated":"2024-09-02T05:18:49Z","published":"2023-12-06T18:29:23Z","title":"Pearl: A Production-ready Reinforcement Learning Agent","summary":" Reinforcement learning (RL) is a versatile framework for optimizing long-term\ngoals. Although many real-world problems can be formalized with RL, learning\nand deploying a performant RL policy requires a system designed to address\nseveral important challenges, including the exploration-exploitation dilemma,\npartial observability, dynamic action spaces, and safety concerns. While the\nimportance of these challenges has been well recognized, existing open-source\nRL libraries do not explicitly address them. This paper introduces Pearl, a\nProduction-Ready RL software package designed to embrace these challenges in a\nmodular way. In addition to presenting benchmarking results, we also highlight\nexamples of Pearl's ongoing industry adoption to demonstrate its advantages for\nproduction use cases. Pearl is open sourced on GitHub at\ngithub.com/facebookresearch/pearl and its official website is\npearlagent.github.io.\n","authors":["Zheqing Zhu","Rodrigo de Salvo Braz","Jalaj Bhandari","Daniel Jiang","Yi Wan","Yonathan Efroni","Liyuan Wang","Ruiyang Xu","Hongbo Guo","Alex Nikulkov","Dmytro Korenkevych","Urun Dogan","Frank Cheng","Zheng Wu","Wanqiao Xu"],"pdf_url":"https://arxiv.org/pdf/2312.03814v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14759v3","updated":"2024-09-02T04:51:17Z","published":"2024-05-23T16:29:30Z","title":"Fault Tolerant ML: Efficient Meta-Aggregation and Synchronous Training","summary":" In this paper, we investigate the challenging framework of Byzantine-robust\ntraining in distributed machine learning (ML) systems, focusing on enhancing\nboth efficiency and practicality. As distributed ML systems become integral for\ncomplex ML tasks, ensuring resilience against Byzantine failures-where workers\nmay contribute incorrect updates due to malice or error-gains paramount\nimportance. Our first contribution is the introduction of the Centered Trimmed\nMeta Aggregator (CTMA), an efficient meta-aggregator that upgrades baseline\naggregators to optimal performance levels, while requiring low computational\ndemands. Additionally, we propose harnessing a recently developed gradient\nestimation technique based on a double-momentum strategy within the Byzantine\ncontext. Our paper highlights its theoretical and practical advantages for\nByzantine-robust training, especially in simplifying the tuning process and\nreducing the reliance on numerous hyperparameters. 
The effectiveness of this\ntechnique is supported by theoretical insights within the stochastic convex\noptimization (SCO) framework and corroborated by empirical evidence.\n","authors":["Tehila Dahan","Kfir Y. Levy"],"pdf_url":"https://arxiv.org/pdf/2405.14759v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14951v2","updated":"2024-09-02T04:13:50Z","published":"2024-06-21T08:03:25Z","title":"An Idiosyncrasy of Time-discretization in Reinforcement Learning","summary":" Many reinforcement learning algorithms are built on an assumption that an\nagent interacts with an environment over fixed-duration, discrete time steps.\nHowever, physical systems are continuous in time, requiring a choice of\ntime-discretization granularity when digitally controlling them. Furthermore,\nsuch systems do not wait for decisions to be made before advancing the\nenvironment state, necessitating the study of how the choice of discretization\nmay affect a reinforcement learning algorithm. In this work, we consider the\nrelationship between the definitions of the continuous-time and discrete-time\nreturns. Specifically, we acknowledge an idiosyncrasy with naively applying a\ndiscrete-time algorithm to a discretized continuous-time environment, and note\nhow a simple modification can better align the return definitions. This\nobservation is of practical consideration when dealing with environments where\ntime-discretization granularity is a choice, or situations where such\ngranularity is inherently stochastic.\n","authors":["Kris De Asis","Richard S. Sutton"],"pdf_url":"https://arxiv.org/pdf/2406.14951v2.pdf","comment":"RLC 2024"},{"id":"http://arxiv.org/abs/2312.15551v4","updated":"2024-09-02T03:26:58Z","published":"2023-12-24T21:46:14Z","title":"On the Benefits of Public Representations for Private Transfer Learning\n under Distribution Shift","summary":" Public pretraining is a promising approach to improve differentially private\nmodel training. However, recent work has noted that many positive research\nresults studying this paradigm only consider in-distribution tasks, and may not\napply to settings where there is distribution shift between the pretraining and\nfinetuning data -- a scenario that is likely when finetuning private tasks due\nto the sensitive nature of the data. In this work, we show empirically across\nthree tasks that even in settings with large distribution shift, where both\nzero-shot performance from public data and training from scratch with private\ndata give unusably weak results, public features can in fact improve private\ntraining accuracy by up to 67\\% over private training from scratch. We provide\na theoretical explanation for this phenomenon, showing that if the public and\nprivate data share a low-dimensional representation, public representations can\nimprove the sample complexity of private training even if it is impossible to\nlearn the private task from the public data alone. 
Altogether, our results\nprovide evidence that public data can indeed make private training practical in\nrealistic settings of extreme distribution shift.\n","authors":["Pratiksha Thaker","Amrith Setlur","Zhiwei Steven Wu","Virginia Smith"],"pdf_url":"https://arxiv.org/pdf/2312.15551v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12816v4","updated":"2024-09-02T01:48:34Z","published":"2023-03-22T07:34:33Z","title":"From Wide to Deep: Dimension Lifting Network for Parameter-efficient\n Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) that maps entities and relations into vector\nrepresentations is essential for downstream applications. Conventional KGE\nmethods require high-dimensional representations to learn the complex structure\nof knowledge graph, but lead to oversized model parameters. Recent advances\nreduce parameters by low-dimensional entity representations, while developing\ntechniques (e.g., knowledge distillation or reinvented representation forms) to\ncompensate for reduced dimension. However, such operations introduce\ncomplicated computations and model designs that may not benefit large knowledge\ngraphs. To seek a simple strategy to improve the parameter efficiency of\nconventional KGE models, we take inspiration from that deeper neural networks\nrequire exponentially fewer parameters to achieve expressiveness comparable to\nwider networks for compositional structures. We view all entity representations\nas a single-layer embedding network, and conventional KGE methods that adopt\nhigh-dimensional entity representations equal widening the embedding network to\ngain expressiveness. To achieve parameter efficiency, we instead propose a\ndeeper embedding network for entity representations, i.e., a narrow entity\nembedding layer plus a multi-layer dimension lifting network (LiftNet).\nExperiments on three public datasets show that by integrating LiftNet, four\nconventional KGE methods with 16-dimensional representations achieve comparable\nlink prediction accuracy as original models that adopt 512-dimensional\nrepresentations, saving 68.4% to 96.9% parameters.\n","authors":["Borui Cai","Yong Xiang","Longxiang Gao","Di Wu","He Zhang","Jiong Jin","Tom Luan"],"pdf_url":"https://arxiv.org/pdf/2303.12816v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12530v2","updated":"2024-09-02T01:39:58Z","published":"2024-04-18T22:23:24Z","title":"TrajDeleter: Enabling Trajectory Forgetting in Offline Reinforcement\n Learning Agents","summary":" Reinforcement learning (RL) trains an agent from experiences interacting with\nthe environment. In scenarios where online interactions are impractical,\noffline RL, which trains the agent using pre-collected datasets, has become\npopular. While this new paradigm presents remarkable effectiveness across\nvarious real-world domains, like healthcare and energy management, there is a\ngrowing demand to enable agents to rapidly and completely eliminate the\ninfluence of specific trajectories from both the training dataset and the\ntrained agents. To meet this problem, this paper advocates Trajdeleter, the\nfirst practical approach to trajectory unlearning for offline RL agents. The\nkey idea of Trajdeleter is to guide the agent to demonstrate deteriorating\nperformance when it encounters states associated with unlearning trajectories.\nSimultaneously, it ensures the agent maintains its original performance level\nwhen facing other remaining trajectories. 
Additionally, we introduce\nTrajauditor, a simple yet efficient method to evaluate whether Trajdeleter\nsuccessfully eliminates the specific trajectories of influence from the offline\nRL agent. Extensive experiments conducted on six offline RL algorithms and\nthree tasks demonstrate that Trajdeleter requires only about 1.5% of the time\nneeded for retraining from scratch. It effectively unlearns an average of 94.8%\nof the targeted trajectories yet still performs well in actual environment\ninteractions after unlearning. The replication package and agent parameters are\navailable online.\n","authors":["Chen Gong","Kecen Li","Jin Yao","Tianhao Wang"],"pdf_url":"https://arxiv.org/pdf/2404.12530v2.pdf","comment":"Accepted at NDSS 2025. The presented document here is the full\n version of our paper"},{"id":"http://arxiv.org/abs/2208.13273v2","updated":"2024-09-02T00:22:45Z","published":"2022-08-28T19:07:54Z","title":"Blending Neural Operators and Relaxation Methods in PDE Numerical\n Solvers","summary":" Neural networks suffer from spectral bias having difficulty in representing\nthe high frequency components of a function while relaxation methods can\nresolve high frequencies efficiently but stall at moderate to low frequencies.\nWe exploit the weaknesses of the two approaches by combining them\nsynergistically to develop a fast numerical solver of partial differential\nequations (PDEs) at scale. Specifically, we propose HINTS, a hybrid, iterative,\nnumerical, and transferable solver by integrating a Deep Operator Network\n(DeepONet) with standard relaxation methods, leading to parallel efficiency and\nalgorithmic scalability for a wide class of PDEs, not tractable with existing\nmonolithic solvers. HINTS balances the convergence behavior across the spectrum\nof eigenmodes by utilizing the spectral bias of DeepONet, resulting in a\nuniform convergence rate and hence exceptional performance of the hybrid solver\noverall. Moreover, HINTS applies to large-scale, multidimensional systems, it\nis flexible with regards to discretizations, computational domain, and boundary\nconditions.\n","authors":["Enrui Zhang","Adar Kahana","Alena Kopaničáková","Eli Turkel","Rishikesh Ranade","Jay Pathak","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2208.13273v2.pdf","comment":"Main text: 17 pages, 6 figures. Supplementary Information: 30 pages,\n 8 figures, 2 tables, 4 algorithms"}],"Multimedia":[{"id":"http://arxiv.org/abs/2207.12554v2","updated":"2024-09-02T22:49:21Z","published":"2022-07-25T22:17:19Z","title":"Inter-Frame Compression for Dynamic Point Cloud Geometry Coding","summary":" Efficient point cloud compression is essential for applications like virtual\nand mixed reality, autonomous driving, and cultural heritage. This paper\nproposes a deep learning-based inter-frame encoding scheme for dynamic point\ncloud geometry compression. We propose a lossy geometry compression scheme that\npredicts the latent representation of the current frame using the previous\nframe by employing a novel feature space inter-prediction network. The proposed\nnetwork utilizes sparse convolutions with hierarchical multiscale 3D feature\nlearning to encode the current frame using the previous frame. The proposed\nmethod introduces a novel predictor network for motion compensation in the\nfeature domain to map the latent representation of the previous frame to the\ncoordinates of the current frame to predict the current frame's feature\nembedding. 
The framework transmits the residual of the predicted features and\nthe actual features by compressing them using a learned probabilistic\nfactorized entropy model. At the receiver, the decoder hierarchically\nreconstructs the current frame by progressively rescaling the feature\nembedding. The proposed framework is compared to the state-of-the-art\nVideo-based Point Cloud Compression (V-PCC) and Geometry-based Point Cloud\nCompression (G-PCC) schemes standardized by the Moving Picture Experts Group\n(MPEG). The proposed method achieves more than 88% BD-Rate (Bjontegaard Delta\nRate) reduction against G-PCCv20 Octree, more than 56% BD-Rate savings against\nG-PCCv20 Trisoup, more than 62% BD-Rate reduction against V-PCC intra-frame\nencoding mode, and more than 52% BD-Rate savings against V-PCC P-frame-based\ninter-frame encoding mode using HEVC. These significant performance gains are\ncross-checked and verified in the MPEG working group.\n","authors":["Anique Akhtar","Zhu Li","Geert Van der Auwera"],"pdf_url":"https://arxiv.org/pdf/2207.12554v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11593v2","updated":"2024-09-02T09:04:51Z","published":"2024-08-21T12:59:42Z","title":"MCDubber: Multimodal Context-Aware Expressive Video Dubbing","summary":" Automatic Video Dubbing (AVD) aims to take the given script and generate\nspeech that aligns with lip motion and prosody expressiveness. Current AVD\nmodels mainly utilize visual information of the current sentence to enhance the\nprosody of synthesized speech. However, it is crucial to consider whether the\nprosody of the generated dubbing aligns with the multimodal context, as the\ndubbing will be combined with the original context in the final video. This\naspect has been overlooked in previous studies. To address this issue, we\npropose a Multimodal Context-aware video Dubbing model, termed\n\\textbf{MCDubber}, to convert the modeling object from a single sentence to a\nlonger sequence with context information to ensure the consistency of the\nglobal context prosody. MCDubber comprises three main components: (1) A context\nduration aligner aims to learn the context-aware alignment between the text and\nlip frames; (2) A context prosody predictor seeks to read the global context\nvisual sequence and predict the context-aware global energy and pitch; (3) A\ncontext acoustic decoder ultimately predicts the global context mel-spectrogram\nwith the assistance of adjacent ground-truth mel-spectrograms of the target\nsentence. Through this process, MCDubber fully considers the influence of\nmultimodal context on the prosody expressiveness of the current sentence when\ndubbing. The extracted mel-spectrogram belonging to the target sentence from\nthe output context mel-spectrograms is the final required dubbing audio.\nExtensive experiments on the Chem benchmark dataset demonstrate that our\nMCDubber significantly improves dubbing expressiveness compared to all advanced\nbaselines. 
The code and demos are available at\nhttps://github.com/XiaoYuanJun-zy/MCDubber.\n","authors":["Yuan Zhao","Zhenqi Jia","Rui Liu","De Hu","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11593v2.pdf","comment":"Accepted by NCMMSC2024"},{"id":"http://arxiv.org/abs/2308.03024v3","updated":"2024-09-02T05:51:02Z","published":"2023-08-06T05:23:25Z","title":"Show Me the World in My Language: Establishing the First Baseline for\n Scene-Text to Scene-Text Translation","summary":" In this work, we study the task of ``visually'' translating scene text from a\nsource language (e.g., Hindi) to a target language (e.g., English). Visual\ntranslation involves not just the recognition and translation of scene text but\nalso the generation of the translated image that preserves visual features of\nthe source scene text, such as font, size, and background. There are several\nchallenges associated with this task, such as translation with limited context,\ndeciding between translation and transliteration, accommodating varying text\nlengths within fixed spatial boundaries, and preserving the font and background\nstyles of the source scene text in the target language. To address this\nproblem, we make the following contributions: (i) We study visual translation\nas a standalone problem for the first time in the literature. (ii) We present a\ncascaded framework for visual translation that combines state-of-the-art\nmodules for scene text recognition, machine translation, and scene text\nsynthesis as a baseline for the task. (iii) We propose a set of task-specific\ndesign enhancements to design a variant of the baseline to obtain performance\nimprovements. (iv) Currently, the existing related literature lacks any\ncomprehensive performance evaluation for this novel task. To fill this gap, we\nintroduce several automatic and user-assisted evaluation metrics designed\nexplicitly for evaluating visual translation. Further, we evaluate presented\nbaselines for translating scene text between Hindi and English. Our experiments\ndemonstrate that although we can effectively perform visual translation over a\nlarge collection of scene text images, the presented baseline only partially\naddresses challenges posed by visual translation tasks. We firmly believe that\nthis new task and the limitations of existing models, as reported in this\npaper, should encourage further research in visual translation.\n","authors":["Shreyas Vaidya","Arvind Kumar Sharma","Prajwal Gatti","Anand Mishra"],"pdf_url":"https://arxiv.org/pdf/2308.03024v3.pdf","comment":"Accepted at ICPR 2024, Project Website:\n https://vl2g.github.io/projects/visTrans/"},{"id":"http://arxiv.org/abs/2409.01352v1","updated":"2024-09-02T16:11:12Z","published":"2024-09-02T16:11:12Z","title":"Spectron: Target Speaker Extraction using Conditional Transformer with\n Adversarial Refinement","summary":" Recently, attention-based transformers have become a de facto standard in\nmany deep learning applications including natural language processing, computer\nvision, signal processing, etc.. In this paper, we propose a transformer-based\nend-to-end model to extract a target speaker's speech from a monaural\nmulti-speaker mixed audio signal. 
Unlike existing speaker extraction methods,\nwe introduce two additional objectives to impose speaker embedding consistency\nand waveform encoder invertibility and jointly train both speaker encoder and\nspeech separator to better capture the speaker conditional embedding.\nFurthermore, we leverage a multi-scale discriminator to refine the perceptual\nquality of the extracted speech. Our experiments show that the use of a dual\npath transformer in the separator backbone along with proposed training\nparadigm improves the CNN baseline by $3.12$ dB points. Finally, we compare our\napproach with recent state-of-the-arts and show that our model outperforms\nexisting methods by $4.1$ dB points on an average without creating additional\ndata dependency.\n","authors":["Tathagata Bandyopadhyay"],"pdf_url":"https://arxiv.org/pdf/2409.01352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01029v1","updated":"2024-09-02T08:06:47Z","published":"2024-09-02T08:06:47Z","title":"Multi-Reference Generative Face Video Compression with Contrastive\n Learning","summary":" Generative face video coding (GFVC) has been demonstrated as a potential\napproach to low-latency, low bitrate video conferencing. GFVC frameworks\nachieve an extreme gain in coding efficiency with over 70% bitrate savings when\ncompared to conventional codecs at bitrates below 10kbps. In recent MPEG/JVET\nstandardization efforts, all the information required to reconstruct video\nsequences using GFVC frameworks are adopted as part of the supplemental\nenhancement information (SEI) in existing compression pipelines. In light of\nthis development, we aim to address a challenge that has been weakly addressed\nin prior GFVC frameworks, i.e., reconstruction drift as the distance between\nthe reference and target frames increases. This challenge creates the need to\nupdate the reference buffer more frequently by transmitting more Intra-refresh\nframes, which are the most expensive element of the GFVC bitstream. To overcome\nthis problem, we propose instead multiple reference animation as a robust\napproach to minimizing reconstruction drift, especially when used in a\nbi-directional prediction mode. Further, we propose a contrastive learning\nformulation for multi-reference animation. We observe that using a contrastive\nlearning framework enhances the representation capabilities of the animation\ngenerator. The resulting framework, MRDAC (Multi-Reference Deep Animation\nCodec) can therefore be used to compress longer sequences with fewer reference\nframes or achieve a significant gain in reconstruction accuracy at comparable\nbitrates to previous frameworks. Quantitative and qualitative results show\nsignificant coding and reconstruction quality gains compared to previous GFVC\nmethods, and more accurate animation quality in presence of large pose and\nfacial expression changes.\n","authors":["Goluck Konuko","Giuseppe Valenzise"],"pdf_url":"https://arxiv.org/pdf/2409.01029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00971v1","updated":"2024-09-02T06:26:48Z","published":"2024-09-02T06:26:48Z","title":"Interpretable Convolutional SyncNet","summary":" Because videos in the wild can be out of sync for various reasons, a sync-net\nis used to bring the video back into sync for tasks that require synchronized\nvideos. Previous state-of-the-art (SOTA) sync-nets use InfoNCE loss, rely on\nthe transformer architecture, or both. 
Unfortunately, the former makes the\nmodel's output difficult to interpret, and the latter handles large\nimages poorly, thus limiting the usefulness of sync-nets. In this work, we train a\nconvolutional sync-net using the balanced BCE loss (BBCE), a loss inspired by\nthe binary cross entropy (BCE) and the InfoNCE losses. In contrast to the\nInfoNCE loss, the BBCE loss does not require complicated sampling schemes. Our\nmodel can better handle larger images, and its output can be given a\nprobabilistic interpretation. The probabilistic interpretation allows us to\ndefine metrics such as probability at offset and offscreen ratio to evaluate\nthe sync quality of audio-visual (AV) speech datasets. Furthermore, our model\nachieves SOTA accuracy of $96.5\%$ on the LRS2 dataset and $93.8\%$ on the LRS3\ndataset.\n","authors":["Sungjoon Park","Jaesub Yun","Donggeon Lee","Minsik Park"],"pdf_url":"https://arxiv.org/pdf/2409.00971v1.pdf","comment":"8+5 pages"}]},"2024-09-04T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.02920v1","updated":"2024-09-04T17:59:52Z","published":"2024-09-04T17:59:52Z","title":"RoboTwin: Dual-Arm Robot Benchmark with Generative Digital Twins (early\n version)","summary":" Effective collaboration of dual-arm robots and their tool use capabilities\nare increasingly important areas in the advancement of robotics. These skills\nplay a significant role in expanding robots' ability to operate in diverse\nreal-world environments. However, progress is impeded by the scarcity of\nspecialized training data. This paper introduces RoboTwin, a novel benchmark\ndataset combining real-world teleoperated data with synthetic data from digital\ntwins, designed for dual-arm robotic scenarios. Using the COBOT Magic platform,\nwe have collected diverse data on tool usage and human-robot interaction. We\npresent an innovative approach to creating digital twins using AI-generated\ncontent, transforming 2D images into detailed 3D models. Furthermore, we\nutilize large language models to generate expert-level training data and\ntask-specific pose sequences oriented toward functionality. Our key\ncontributions are: 1) the RoboTwin benchmark dataset, 2) an efficient\nreal-to-simulation pipeline, and 3) the use of language models for automatic\nexpert-level data generation. These advancements are designed to address the\nshortage of robotic training data, potentially accelerating the development of\nmore capable and versatile robotic systems for a wide range of real-world\napplications. The project page is available at\nhttps://robotwin-benchmark.github.io/early-version/\n","authors":["Yao Mu","Tianxing Chen","Shijia Peng","Zanxin Chen","Zeyu Gao","Yude Zou","Lunkai Lin","Zhiqiang Xie","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2409.02920v1.pdf","comment":"Project page: https://robotwin-benchmark.github.io/early-version/"},{"id":"http://arxiv.org/abs/2409.02908v1","updated":"2024-09-04T17:48:19Z","published":"2024-09-04T17:48:19Z","title":"Masked Diffusion Models are Secretly Time-Agnostic Masked Models and\n Exploit Inaccurate Categorical Sampling","summary":" Masked diffusion models (MDMs) have emerged as a popular research topic for\ngenerative modeling of discrete data, thanks to their superior performance over\nother discrete diffusion models, and are rivaling the auto-regressive models\n(ARMs) for language modeling tasks. 
The recent effort in simplifying the masked\ndiffusion framework further leads to alignment with continuous-space diffusion\nmodels and more principled training and sampling recipes. In this paper,\nhowever, we reveal that both training and sampling of MDMs are theoretically\nfree from the time variable, arguably the key signature of diffusion models,\nand are instead equivalent to masked models. The connection on the sampling\naspect is drawn by our proposed first-hitting sampler (FHS). Specifically, we\nshow that the FHS is theoretically equivalent to MDMs' original generation\nprocess while significantly alleviating the time-consuming categorical sampling\nand achieving a 20$\\times$ speedup. In addition, our investigation challenges\nprevious claims that MDMs can surpass ARMs in generative perplexity. We\nidentify, for the first time, an underlying numerical issue, even with the\n32-bit floating-point precision, which results in inaccurate categorical\nsampling. We show that the numerical issue lowers the effective temperature\nboth theoretically and empirically, leading to unfair assessments of MDMs'\ngeneration results in the previous literature.\n","authors":["Kaiwen Zheng","Yongxin Chen","Hanzi Mao","Ming-Yu Liu","Jun Zhu","Qinsheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02908v1.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2409.02897v1","updated":"2024-09-04T17:41:19Z","published":"2024-09-04T17:41:19Z","title":"LongCite: Enabling LLMs to Generate Fine-grained Citations in\n Long-context QA","summary":" Though current long-context large language models (LLMs) have demonstrated\nimpressive capacities in answering user questions based on extensive text, the\nlack of citations in their responses makes user verification difficult, leading\nto concerns about their trustworthiness due to their potential hallucinations.\nIn this work, we aim to enable long-context LLMs to generate responses with\nfine-grained sentence-level citations, improving their faithfulness and\nverifiability. We first introduce LongBench-Cite, an automated benchmark for\nassessing current LLMs' performance in Long-Context Question Answering with\nCitations (LQAC), revealing considerable room for improvement. To this end, we\npropose CoF (Coarse to Fine), a novel pipeline that utilizes off-the-shelf LLMs\nto automatically generate long-context QA instances with precise sentence-level\ncitations, and leverage this pipeline to construct LongCite-45k, a large-scale\nSFT dataset for LQAC. Finally, we train LongCite-8B and LongCite-9B using the\nLongCite-45k dataset, successfully enabling their generation of accurate\nresponses and fine-grained sentence-level citations in a single output. The\nevaluation results on LongBench-Cite show that our trained models achieve\nstate-of-the-art citation quality, surpassing advanced proprietary models\nincluding GPT-4o.\n","authors":["jiajie Zhang","Yushi Bai","Xin Lv","Wanjun Gu","Danqing Liu","Minhao Zou","Shulin Cao","Lei Hou","Yuxiao Dong","Ling Feng","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2409.02897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07832v3","updated":"2024-09-04T17:31:00Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. 
While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \\textbf{five} image classification datasets. The code is\navailable (https://github.com/batmanlab/Ladder).\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. Poynton","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02889v1","updated":"2024-09-04T17:25:21Z","published":"2024-09-04T17:25:21Z","title":"LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via\n Hybrid Architecture","summary":" Expanding the long-context capabilities of Multi-modal Large Language\nModels~(MLLMs) is crucial for video understanding, high-resolution image\nunderstanding, and multi-modal agents. This involves a series of systematic\noptimizations, including model architecture, data construction and training\nstrategy, particularly addressing challenges such as \\textit{degraded\nperformance with more images} and \\textit{high computational costs}. In this\npaper, we adapt the model architecture to a hybrid of Mamba and Transformer\nblocks, approach data construction with both temporal and spatial dependencies\namong multiple images and employ a progressive training strategy. The released\nmodel \\textbf{LongLLaVA}~(\\textbf{Long}-Context \\textbf{L}arge\n\\textbf{L}anguage \\textbf{a}nd \\textbf{V}ision \\textbf{A}ssistant) is the first\nhybrid MLLM, which achieved a better balance between efficiency and\neffectiveness. LongLLaVA not only achieves competitive results across various\nbenchmarks, but also maintains high throughput and low memory consumption.\nEspecially, it could process nearly a thousand images on a single A100 80GB\nGPU, showing promising application prospects for a wide range of tasks.\n","authors":["Xidong Wang","Dingjie Song","Shunian Chen","Chen Zhang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02889v1.pdf","comment":"19 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2407.18322v2","updated":"2024-09-04T17:16:05Z","published":"2024-07-01T19:52:41Z","title":"The Need for Guardrails with Large Language Models in Medical\n Safety-Critical Settings: An Artificial Intelligence Application in the\n Pharmacovigilance Ecosystem","summary":" Large language models (LLMs) are useful tools with the capacity for\nperforming specific types of knowledge work at an effective scale. 
However, LLM\ndeployments in high-risk and safety-critical domains pose unique challenges,\nnotably the issue of ``hallucination,'' where LLMs can generate fabricated\ninformation. This is particularly concerning in settings such as drug safety,\nwhere inaccuracies could lead to patient harm. To mitigate these risks, we have\ndeveloped and demonstrated a proof of concept suite of guardrails specifically\ndesigned to mitigate certain types of hallucinations and errors for drug\nsafety, and potentially applicable to other medical safety-critical contexts.\nThese guardrails include mechanisms to detect anomalous documents to prevent\nthe ingestion of inappropriate data, identify incorrect drug names or adverse\nevent terms, and convey uncertainty in generated content. We integrated these\nguardrails with an LLM fine-tuned for a text-to-text task, which involves\nconverting both structured and unstructured data within adverse event reports\ninto natural language. This method was applied to translate individual case\nsafety reports, demonstrating effective application in a pharmacovigilance\nprocessing task. Our guardrail framework offers a set of tools with broad\napplicability across various domains, ensuring LLMs can be safely used in\nhigh-risk situations by eliminating the occurrence of key errors, including the\ngeneration of incorrect pharmacovigilance-related terms, thus adhering to\nstringent regulatory and quality standards in medical safety-critical\nenvironments.\n","authors":["Joe B Hakim","Jeffery L Painter","Darmendra Ramcharran","Vijay Kara","Greg Powell","Paulina Sobczak","Chiho Sato","Andrew Bate","Andrew Beam"],"pdf_url":"https://arxiv.org/pdf/2407.18322v2.pdf","comment":"27 pages, 6 figures, 4 tables and supplementary material provided"},{"id":"http://arxiv.org/abs/2409.02877v1","updated":"2024-09-04T17:01:02Z","published":"2024-09-04T17:01:02Z","title":"Configurable Foundation Models: Building LLMs from a Modular Perspective","summary":" Advancements in LLMs have recently unveiled challenges tied to computational\nefficiency and continual scalability due to their requirements of huge\nparameters, making the applications and evolution of these models on devices\nwith limited computation resources and scenarios requiring various abilities\nincreasingly cumbersome. Inspired by modularity within the human brain, there\nis a growing tendency to decompose LLMs into numerous functional modules,\nallowing for inference with part of modules and dynamic assembly of modules to\ntackle complex tasks, such as mixture-of-experts. To highlight the inherent\nefficiency and composability of the modular approach, we coin the term brick to\nrepresent each functional module, designating the modularized structure as\nconfigurable foundation models. In this paper, we offer a comprehensive\noverview and investigation of the construction, utilization, and limitation of\nconfigurable foundation models. We first formalize modules into emergent bricks\n- functional neuron partitions that emerge during the pre-training phase, and\ncustomized bricks - bricks constructed via additional post-training to improve\nthe capabilities and knowledge of LLMs. Based on diverse functional bricks, we\nfurther present four brick-oriented operations: retrieval and routing, merging,\nupdating, and growing. These operations allow for dynamic configuration of LLMs\nbased on instructions to handle complex tasks. To verify our perspective, we\nconduct an empirical analysis on widely-used LLMs. 
We find that the FFN layers\nfollow modular patterns with functional specialization of neurons and\nfunctional neuron partitions. Finally, we highlight several open issues and\ndirections for future research. Overall, this paper aims to offer a fresh\nmodular perspective on existing LLM research and inspire the future creation of\nmore efficient and scalable foundational models.\n","authors":["Chaojun Xiao","Zhengyan Zhang","Chenyang Song","Dazhi Jiang","Feng Yao","Xu Han","Xiaozhi Wang","Shuo Wang","Yufei Huang","Guanyu Lin","Yingfa Chen","Weilin Zhao","Yuge Tu","Zexuan Zhong","Ao Zhang","Chenglei Si","Khai Hao Moo","Chenyang Zhao","Huimin Chen","Yankai Lin","Zhiyuan Liu","Jingbo Shang","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2409.02877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02841v1","updated":"2024-09-04T16:14:05Z","published":"2024-09-04T16:14:05Z","title":"Historical German Text Normalization Using Type- and Token-Based\n Language Modeling","summary":" Historic variations in spelling pose a challenge for full-text search or\nnatural language processing on historical digitized texts. To minimize the gap\nbetween the historic orthography and contemporary spelling, an\nautomatic orthographic normalization of the historical source material is\nusually pursued. This report proposes a normalization system for German literary texts\nfrom c. 1700-1900, trained on a parallel corpus. The proposed system makes use\nof a machine learning approach using Transformer language models, combining an\nencoder-decoder model to normalize individual word types, and a pre-trained\ncausal language model to adjust these normalizations within their context. An\nextensive evaluation shows that the proposed system provides state-of-the-art\naccuracy, comparable to a much larger, fully end-to-end sentence-based\nnormalization system that fine-tunes a pre-trained Transformer large language\nmodel. However, the normalization of historical text remains a challenge due to\nthe difficulty models have in generalizing, and the lack of extensive high-quality\nparallel data.\n","authors":["Anton Ehrmanntraut"],"pdf_url":"https://arxiv.org/pdf/2409.02841v1.pdf","comment":"27 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.08763v4","updated":"2024-09-04T16:13:18Z","published":"2024-03-13T17:58:57Z","title":"Simple and Scalable Strategies to Continually Pre-train Large Language\n Models","summary":" Large language models (LLMs) are routinely pre-trained on billions of tokens,\nonly to start the process over again once new data becomes available. A much\nmore efficient solution is to continually pre-train these models, saving\nsignificant compute compared to re-training. However, the distribution shift\ninduced by new data typically results in degraded performance on previous data\nor poor adaptation to the new data. In this work, we show that a simple and\nscalable combination of learning rate (LR) re-warming, LR re-decaying, and\nreplay of previous data is sufficient to match the performance of fully\nre-training from scratch on all available data, as measured by the final loss\nand the average score on several language model (LM) evaluation benchmarks.\nSpecifically, we show this for a weak but realistic distribution shift between\ntwo commonly used LLM pre-training datasets (English$\\rightarrow$English) and a\nstronger distribution shift (English$\\rightarrow$German) at the $405$M\nparameter model scale with large dataset sizes (hundreds of billions of\ntokens). 
Selecting the weak but realistic shift for larger-scale experiments,\nwe also find that our continual learning strategies match the re-training\nbaseline for a 10B parameter LLM. Our results demonstrate that LLMs can be\nsuccessfully updated via simple and scalable continual learning strategies,\nmatching the re-training baseline using only a fraction of the compute.\nFinally, inspired by previous work, we propose alternatives to the cosine\nlearning rate schedule that help circumvent forgetting induced by LR re-warming\nand that are not bound to a fixed token budget.\n","authors":["Adam Ibrahim","Benjamin Thérien","Kshitij Gupta","Mats L. Richter","Quentin Anthony","Timothée Lesort","Eugene Belilovsky","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2403.08763v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02840v1","updated":"2024-09-04T16:12:30Z","published":"2024-09-04T16:12:30Z","title":"R2GQA: Retriever-Reader-Generator Question Answering System to Support\n Students Understanding Legal Regulations in Higher Education","summary":" In this article, we propose the R2GQA system, a Retriever-Reader-Generator\nQuestion Answering system, consisting of three main components: Document\nRetriever, Machine Reader, and Answer Generator. The Retriever module employs\nadvanced information retrieval techniques to extract the context of articles\nfrom a dataset of legal regulation documents. The Machine Reader module\nutilizes state-of-the-art natural language understanding algorithms to\ncomprehend the retrieved documents and extract answers. Finally, the Generator\nmodule synthesizes the extracted answers into concise and informative responses\nto questions of students regarding legal regulations. Furthermore, we built the\nViRHE4QA dataset in the domain of university training regulations, comprising\n9,758 question-answer pairs with a rigorous construction process. This is the\nfirst Vietnamese dataset in the higher regulations domain with various types of\nanswers, both extractive and abstractive. In addition, the R2GQA system is the\nfirst system to offer abstractive answers in Vietnamese. This paper discusses\nthe design and implementation of each module within the R2GQA system on the\nViRHE4QA dataset, highlighting their functionalities and interactions.\nFurthermore, we present experimental results demonstrating the effectiveness\nand utility of the proposed system in supporting the comprehension of students\nof legal regulations in higher education settings. In general, the R2GQA system\nand the ViRHE4QA dataset promise to contribute significantly to related\nresearch and help students navigate complex legal documents and regulations,\nempowering them to make informed decisions and adhere to institutional policies\neffectively. Our dataset is available for research purposes.\n","authors":["Phuc-Tinh Pham Do","Duy-Ngoc Dinh Cao","Khanh Quoc Tran","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.02840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02836v1","updated":"2024-09-04T16:02:30Z","published":"2024-09-04T16:02:30Z","title":"Exploring Sentiment Dynamics and Predictive Behaviors in Cryptocurrency\n Discussions by Few-Shot Learning with Large Language Models","summary":" This study performs analysis of Predictive statements, Hope speech, and\nRegret Detection behaviors within cryptocurrency-related discussions,\nleveraging advanced natural language processing techniques. 
We introduce a\nnovel classification scheme named \"Prediction statements,\" categorizing\ncomments into Predictive Incremental, Predictive Decremental, Predictive\nNeutral, or Non-Predictive categories. Employing GPT-4o, a cutting-edge large\nlanguage model, we explore sentiment dynamics across five prominent\ncryptocurrencies: Cardano, Binance, Matic, Fantom, and Ripple. Our analysis\nreveals distinct patterns in predictive sentiments, with Matic demonstrating a\nnotably higher propensity for optimistic predictions. Additionally, we\ninvestigate hope and regret sentiments, uncovering nuanced interplay between\nthese emotions and predictive behaviors. Despite encountering limitations\nrelated to data volume and resource availability, our study reports valuable\ndiscoveries concerning investor behavior and sentiment trends within the\ncryptocurrency market, informing strategic decision-making and future research\nendeavors.\n","authors":["Moein Shahiki Tash","Zahra Ahani","Mohim Tash","Olga Kolesnikova","Grigori Sidorov"],"pdf_url":"https://arxiv.org/pdf/2409.02836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02834v1","updated":"2024-09-04T16:00:21Z","published":"2024-09-04T16:00:21Z","title":"CMM-Math: A Chinese Multimodal Math Dataset To Evaluate and Enhance the\n Mathematics Reasoning of Large Multimodal Models","summary":" Large language models (LLMs) have obtained promising results in mathematical\nreasoning, which is a foundational skill for human intelligence. Most previous\nstudies focus on improving and measuring the performance of LLMs based on\ntextual math reasoning datasets (e.g., MATH, GSM8K). Recently, a few\nresearchers have released English multimodal math datasets (e.g., MATHVISTA and\nMATH-V) to evaluate the effectiveness of large multimodal models (LMMs). In\nthis paper, we release a Chinese multimodal math (CMM-Math) dataset, including\nbenchmark and training parts, to evaluate and enhance the mathematical\nreasoning of LMMs. CMM-Math contains over 28,000 high-quality samples,\nfeaturing a variety of problem types (e.g., multiple-choice, fill-in-the-blank,\nand so on) with detailed solutions across 12 grade levels from elementary to\nhigh school in China. Specifically, the visual context may be present in the\nquestions or opinions, which makes this dataset more challenging. Through\ncomprehensive analysis, we discover that state-of-the-art LMMs on the CMM-Math\ndataset face challenges, emphasizing the necessity for further improvements in\nLMM development. We also propose a Multimodal Mathematical LMM (Math-LMM) to\nhandle the problems with mixed input of multiple images and text segments. We\ntrain our model using three stages, including foundational pre-training,\nfoundational fine-tuning, and mathematical fine-tuning. 
The extensive\nexperiments indicate that our model effectively improves math reasoning\nperformance by comparing it with the SOTA LMMs over three multimodal\nmathematical datasets.\n","authors":["Wentao Liu","Qianjun Pan","Yi Zhang","Zhuo Liu","Ji Wu","Jie Zhou","Aimin Zhou","Qin Chen","Bo Jiang","Liang He"],"pdf_url":"https://arxiv.org/pdf/2409.02834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00509v2","updated":"2024-09-04T15:55:22Z","published":"2024-08-31T17:19:30Z","title":"LongRecipe: Recipe for Efficient Long Context Generalization in Large\n Language Models","summary":" Large language models (LLMs) face significant challenges in handling\nlong-context tasks because of their limited effective context window size\nduring pretraining, which restricts their ability to generalize over extended\nsequences. Meanwhile, extending the context window in LLMs through\npost-pretraining is highly resource-intensive. To address this, we introduce\nLongRecipe, an efficient training strategy for extending the context window of\nLLMs, including impactful token analysis, position index transformation, and\ntraining optimization strategies. It simulates long-sequence inputs while\nmaintaining training efficiency and significantly improves the model's\nunderstanding of long-range dependencies. Experiments on three types of LLMs\nshow that LongRecipe can utilize long sequences while requiring only 30% of the\ntarget context window size, and reduces computational training resource over\n85% compared to full sequence training. Furthermore, LongRecipe also preserves\nthe original LLM's capabilities in general tasks. Ultimately, we can extend the\neffective context window of open-source LLMs from 8k to 128k, achieving\nperformance close to GPT-4 with just one day of dedicated training using a\nsingle GPU with 80G memory. Our code is released at\nhttps://github.com/zhiyuanhubj/LongRecipe.\n","authors":["Zhiyuan Hu","Yuliang Liu","Jinman Zhao","Suyuchen Wang","Yan Wang","Wei Shen","Qing Gu","Anh Tuan Luu","See-Kiong Ng","Zhiwei Jiang","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2409.00509v2.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2405.04346v2","updated":"2024-09-04T15:48:40Z","published":"2024-05-07T14:23:22Z","title":"Revisiting Character-level Adversarial Attacks for Language Models","summary":" Adversarial attacks in Natural Language Processing apply perturbations in the\ncharacter or token levels. Token-level attacks, gaining prominence for their\nuse of gradient-based methods, are susceptible to altering sentence semantics,\nleading to invalid adversarial examples. While character-level attacks easily\nmaintain semantics, they have received less attention as they cannot easily\nadopt popular gradient-based methods, and are thought to be easy to defend.\nChallenging these beliefs, we introduce Charmer, an efficient query-based\nadversarial attack capable of achieving high attack success rate (ASR) while\ngenerating highly similar adversarial examples. Our method successfully targets\nboth small (BERT) and large (Llama 2) models. Specifically, on BERT with SST-2,\nCharmer improves the ASR in 4.84% points and the USE similarity in 8% points\nwith respect to the previous art. Our implementation is available in\nhttps://github.com/LIONS-EPFL/Charmer.\n","authors":["Elias Abad Rocamora","Yongtao Wu","Fanghui Liu","Grigorios G. 
Chrysos","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2405.04346v2.pdf","comment":"Accepted in ICML 2024"},{"id":"http://arxiv.org/abs/2408.15778v2","updated":"2024-09-04T15:35:15Z","published":"2024-08-28T13:16:41Z","title":"LogicGame: Benchmarking Rule-Based Reasoning Abilities of Large Language\n Models","summary":" Large Language Models (LLMs) have demonstrated notable capabilities across\nvarious tasks, showcasing complex problem-solving abilities. Understanding and\nexecuting complex rules, along with multi-step planning, are fundamental to\nlogical reasoning and critical for practical LLM agents and decision-making\nsystems. However, evaluating LLMs as effective rule-based executors and\nplanners remains underexplored. In this paper, we introduce LogicGame, a novel\nbenchmark designed to evaluate the comprehensive rule understanding, execution,\nand planning capabilities of LLMs. Unlike traditional benchmarks, LogicGame\nprovides diverse games that contain a series of rules with an initial state,\nrequiring models to comprehend and apply predefined regulations to solve\nproblems. We create simulated scenarios in which models execute or plan\noperations to achieve specific outcomes. These game scenarios are specifically\ndesigned to distinguish logical reasoning from mere knowledge by relying\nexclusively on predefined rules. This separation allows for a pure assessment\nof rule-based reasoning capabilities. The evaluation considers not only final\noutcomes but also intermediate steps, providing a comprehensive assessment of\nmodel performance. Moreover, these intermediate steps are deterministic and can\nbe automatically verified. LogicGame defines game scenarios with varying\ndifficulty levels, from simple rule applications to complex reasoning chains,\nin order to offer a precise evaluation of model performance on rule\nunderstanding and multi-step execution. Utilizing LogicGame, we test various\nLLMs and identify notable shortcomings in their rule-based logical reasoning\nabilities.\n","authors":["Jiayi Gui","Yiming Liu","Jiale Cheng","Xiaotao Gu","Xiao Liu","Hongning Wang","Yuxiao Dong","Jie Tang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2408.15778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02813v1","updated":"2024-09-04T15:31:26Z","published":"2024-09-04T15:31:26Z","title":"MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding\n Benchmark","summary":" This paper introduces MMMU-Pro, a robust version of the Massive\nMulti-discipline Multimodal Understanding and Reasoning (MMMU) benchmark.\nMMMU-Pro rigorously assesses multimodal models' true understanding and\nreasoning capabilities through a three-step process based on MMMU: (1)\nfiltering out questions answerable by text-only models, (2) augmenting\ncandidate options, and (3) introducing a vision-only input setting where\nquestions are embedded within images. This setting challenges AI to truly \"see\"\nand \"read\" simultaneously, testing a fundamental human cognitive skill of\nseamlessly integrating visual and textual information. Results show that model\nperformance is substantially lower on MMMU-Pro than on MMMU, ranging from 16.8%\nto 26.9% across models. We explore the impact of OCR prompts and Chain of\nThought (CoT) reasoning, finding that OCR prompts have minimal effect while CoT\ngenerally improves performance. 
MMMU-Pro provides a more rigorous evaluation\ntool, closely mimicking real-world scenarios and offering valuable directions\nfor future research in multimodal AI.\n","authors":["Xiang Yue","Tianyu Zheng","Yuansheng Ni","Yubo Wang","Kai Zhang","Shengbang Tong","Yuxuan Sun","Ming Yin","Botao Yu","Ge Zhang","Huan Sun","Yu Su","Wenhu Chen","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2409.02813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08349v3","updated":"2024-09-04T15:19:35Z","published":"2023-11-14T17:48:19Z","title":"AI-generated text boundary detection with RoFT","summary":" Due to the rapid development of large language models, people increasingly\noften encounter texts that may start as written by a human but continue as\nmachine-generated. Detecting the boundary between human-written and\nmachine-generated parts of such texts is a challenging problem that has not\nreceived much attention in literature. We attempt to bridge this gap and\nexamine several ways to adapt state of the art artificial text detection\nclassifiers to the boundary detection setting. We push all detectors to their\nlimits, using the Real or Fake text benchmark that contains short texts on\nseveral topics and includes generations of various language models. We use this\ndiversity to deeply examine the robustness of all detectors in cross-domain and\ncross-model settings to provide baselines and insights for future research. In\nparticular, we find that perplexity-based approaches to boundary detection tend\nto be more robust to peculiarities of domain-specific data than supervised\nfine-tuning of the RoBERTa model; we also find which features of the text\nconfuse boundary detection algorithms and negatively influence their\nperformance in cross-domain settings.\n","authors":["Laida Kushnareva","Tatiana Gaintseva","German Magai","Serguei Barannikov","Dmitry Abulkhanov","Kristian Kuznetsov","Eduard Tulchinskii","Irina Piontkovskaya","Sergey Nikolenko"],"pdf_url":"https://arxiv.org/pdf/2311.08349v3.pdf","comment":"Our official repository:\n https://github.com/SilverSolver/ai_boundary_detection"},{"id":"http://arxiv.org/abs/2409.02795v1","updated":"2024-09-04T15:11:55Z","published":"2024-09-04T15:11:55Z","title":"Towards a Unified View of Preference Learning for Large Language Models:\n A Survey","summary":" Large Language Models (LLMs) exhibit remarkably powerful capabilities. One of\nthe crucial factors to achieve success is aligning the LLM's output with human\npreferences. This alignment process often requires only a small amount of data\nto efficiently enhance the LLM's performance. While effective, research in this\narea spans multiple domains, and the methods involved are relatively complex to\nunderstand. The relationships between different methods have been\nunder-explored, limiting the development of the preference alignment. In light\nof this, we break down the existing popular alignment strategies into different\ncomponents and provide a unified framework to study the current alignment\nstrategies, thereby establishing connections among them. In this survey, we\ndecompose all the strategies in preference learning into four components:\nmodel, data, feedback, and algorithm. This unified view offers an in-depth\nunderstanding of existing alignment algorithms and also opens up possibilities\nto synergize the strengths of different strategies. Furthermore, we present\ndetailed working examples of prevalent existing algorithms to facilitate a\ncomprehensive understanding for the readers. 
Finally, based on our unified\nperspective, we explore the challenges and future research directions for\naligning large language models with human preferences.\n","authors":["Bofei Gao","Feifan Song","Yibo Miao","Zefan Cai","Zhe Yang","Liang Chen","Helan Hu","Runxin Xu","Qingxiu Dong","Ce Zheng","Wen Xiao","Ge Zhang","Daoguang Zan","Keming Lu","Bowen Yu","Dayiheng Liu","Zeyu Cui","Jian Yang","Lei Sha","Houfeng Wang","Zhifang Sui","Peiyi Wang","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2409.02795v1.pdf","comment":"Initial Commit, 21 pages"},{"id":"http://arxiv.org/abs/2409.00105v2","updated":"2024-09-04T14:40:14Z","published":"2024-08-27T14:40:16Z","title":"Negation Blindness in Large Language Models: Unveiling the NO Syndrome\n in Image Generation","summary":" Foundational Large Language Models (LLMs) have changed the way we perceive\ntechnology. They have been shown to excel in tasks ranging from poem writing\nand coding to essay generation and puzzle solving. With the incorporation of\nimage generation capability, they have become more comprehensive and versatile\nAI tools. At the same time, researchers are striving to identify the\nlimitations of these tools to improve them further. Currently identified flaws\ninclude hallucination, biases, and bypassing restricted commands to generate\nharmful content. In the present work, we have identified a fundamental\nlimitation related to the image generation ability of LLMs, and termed it The\nNO Syndrome. This negation blindness refers to LLMs inability to correctly\ncomprehend NO related natural language prompts to generate the desired images.\nInterestingly, all tested LLMs including GPT-4, Gemini, and Copilot were found\nto be suffering from this syndrome. To demonstrate the generalization of this\nlimitation, we carried out simulation experiments and conducted entropy-based\nand benchmark statistical analysis tests on various LLMs in multiple languages,\nincluding English, Hindi, and French. We conclude that the NO syndrome is a\nsignificant flaw in current LLMs that needs to be addressed. A related finding\nof this study showed a consistent discrepancy between image and textual\nresponses as a result of this NO syndrome. We posit that the introduction of a\nnegation context-aware reinforcement learning based feedback loop between the\nLLMs textual response and generated image could help ensure the generated text\nis based on both the LLMs correct contextual understanding of the negation\nquery and the generated visual output.\n","authors":["Mohammad Nadeem","Shahab Saquib Sohail","Erik Cambria","Björn W. Schuller","Amir Hussain"],"pdf_url":"https://arxiv.org/pdf/2409.00105v2.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.02751v1","updated":"2024-09-04T14:30:13Z","published":"2024-09-04T14:30:13Z","title":"A Comparative Study of Pre-training and Self-training","summary":" Pre-training and self-training are two approaches to semi-supervised\nlearning. The comparison between pre-training and self-training has been\nexplored. 
However, previous works led to confusing findings: self-training\noutperforms pre-training on some tasks in computer vision, and, contrarily,\npre-training outperforms self-training on some tasks in natural language\nprocessing, under incomparable experimental settings.\nWe propose an ensemble method to comparatively and exhaustively conduct an\nempirical study of all feasible training paradigms combining pre-training,\nself-training, and fine-tuning within consistent foundational settings\ncomparable to data augmentation. We conduct experiments on six datasets, four\ndata augmentation methods, and imbalanced data for sentiment analysis and\nnatural language inference tasks. Our findings confirm that the pre-training\nand fine-tuning paradigm yields the best overall performance. Moreover,\nself-training offers no additional benefits when combined with semi-supervised\npre-training.\n","authors":["Yiheng Wang","Jiayu Lin","Zuoquan Lin"],"pdf_url":"https://arxiv.org/pdf/2409.02751v1.pdf","comment":"19 pages, 2 figures, 9 tables"},{"id":"http://arxiv.org/abs/2407.04183v2","updated":"2024-09-04T14:07:07Z","published":"2024-07-04T23:05:58Z","title":"Seeing Like an AI: How LLMs Apply (and Misapply) Wikipedia Neutrality\n Norms","summary":" Large language models (LLMs) are trained on broad corpora and then used in\ncommunities with specialized norms. Is providing LLMs with community rules\nenough for models to follow these norms? We evaluate LLMs' capacity to detect\n(Task 1) and correct (Task 2) biased Wikipedia edits according to Wikipedia's\nNeutral Point of View (NPOV) policy. LLMs struggled with bias detection,\nachieving only 64% accuracy on a balanced dataset. Models exhibited contrasting\nbiases (some under- and others over-predicted bias), suggesting distinct priors\nabout neutrality. LLMs performed better at generation, removing 79% of words\nremoved by Wikipedia editors. However, LLMs made additional changes beyond\nWikipedia editors' simpler neutralizations, resulting in high-recall but\nlow-precision editing. Interestingly, crowdworkers rated AI rewrites as more\nneutral (70%) and fluent (61%) than Wikipedia-editor rewrites. Qualitative\nanalysis found LLMs sometimes applied NPOV more comprehensively than Wikipedia\neditors but often made extraneous non-NPOV-related changes (such as grammar).\nLLMs may apply rules in ways that resonate with the public but diverge from\ncommunity experts. While potentially effective for generation, LLMs may reduce\neditor agency and increase moderation workload (e.g., verifying additions).\nEven when rules are easy to articulate, having LLMs apply them like community\nmembers may still be difficult.\n","authors":["Joshua Ashkinaze","Ruijia Guan","Laura Kurek","Eytan Adar","Ceren Budak","Eric Gilbert"],"pdf_url":"https://arxiv.org/pdf/2407.04183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02727v1","updated":"2024-09-04T14:01:48Z","published":"2024-09-04T14:01:48Z","title":"Pooling And Attention: What Are Effective Designs For LLM-Based\n Embedding Models?","summary":" The significant advancements of Large Language Models (LLMs) in generative\ntasks have led to a growing body of work exploring LLM-based embedding models.\nWhile these models, employing different pooling and attention strategies, have\nachieved state-of-the-art performance on public embedding benchmarks, questions\nstill arise about what constitutes an effective design for LLM-based embedding\nmodels. 
However, these models are often trained on different datasets, using\ndifferent LLM base models or training settings. Moreover, evaluations on public\nembedding benchmarks often fail to report statistical significance, making it\ndifficult to determine which designs truly contribute to final performance.\nThis complicates the process for practitioners seeking optimal training recipes\nfor LLM-based embedding models. In this study, we conduct a large-scale\nexperiment by training a series of LLM-based embedding models using the same\ntraining data and base model but differing in their pooling and attention\nstrategies. The results show that there is no one-size-fits-all solution: while\nbidirectional attention and an additional trainable pooling layer outperform in\ntext similarity and information retrieval tasks, they do not significantly\nsurpass simpler designs like EOS-last token pooling and default causal\nattention in clustering and classification tasks. Furthermore, we propose a new\npooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs\nof all hidden layers, rather than just the last layer, using a cross-attention\nnetwork. This method proves to be statistically superior in text similarity and\nretrieval tasks compared to existing pooling methods. Overall, this paper sheds\nlight on effective training strategies for LLM-based embedding models.\n","authors":["Yixuan Tang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2409.02727v1.pdf","comment":"https://github.com/yixuantt/PoolingAndAttn"},{"id":"http://arxiv.org/abs/2409.02725v1","updated":"2024-09-04T13:59:48Z","published":"2024-09-04T13:59:48Z","title":"Pre-training data selection for biomedical domain adaptation using\n journal impact metrics","summary":" Domain adaptation is a widely used method in natural language processing\n(NLP) to improve the performance of a language model within a specific domain.\nThis method is particularly common in the biomedical domain, which sees regular\npublication of numerous scientific articles. PubMed, a significant corpus of\ntext, is frequently used in the biomedical domain. The primary objective of\nthis study is to explore whether refining a pre-training dataset using specific\nquality metrics for scientific papers can enhance the performance of the\nresulting model. To accomplish this, we employ two straightforward journal\nimpact metrics and conduct experiments by continually pre-training BERT on\nvarious subsets of the complete PubMed training set, we then evaluate the\nresulting models on biomedical language understanding tasks from the BLURB\nbenchmark. Our results show that pruning using journal impact metrics is not\nefficient. But we also show that pre-training using fewer abstracts (but with\nthe same number of training steps) does not necessarily decrease the resulting\nmodel's performance.\n","authors":["Mathieu Laï-king","Patrick Paroubek"],"pdf_url":"https://arxiv.org/pdf/2409.02725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02718v1","updated":"2024-09-04T13:54:38Z","published":"2024-09-04T13:54:38Z","title":"Alignment-Aware Model Extraction Attacks on Large Language Models","summary":" Model extraction attacks (MEAs) on large language models (LLMs) have received\nincreasing research attention lately. Existing attack methods on LLMs inherit\nthe extraction strategies from those designed for deep neural networks (DNNs)\nyet neglect the inconsistency of training tasks between MEA and LLMs'\nalignments. 
As such, they result in poor attack performances. To tackle this\nissue, we present Locality Reinforced Distillation (LoRD), a novel model\nextraction attack algorithm specifically for LLMs. In particular, we design a\npolicy-gradient-style training task, which utilizes victim models' responses as\na signal to guide the crafting of preference for the local model. Theoretical\nanalysis has shown that i) LoRD's convergence procedure in MEAs is consistent\nwith the alignments of LLMs, and ii) LoRD can reduce query complexity while\nmitigating watermark protection through exploration-based stealing. Extensive\nexperiments on domain-specific extractions demonstrate the superiority of our\nmethod by examining the extraction of various state-of-the-art commercial LLMs.\n","authors":["Zi Liang","Qingqing Ye","Yanyun Wang","Sen Zhang","Yaxin Xiao","Ronghua Li","Jianliang Xu","Haibo Hu"],"pdf_url":"https://arxiv.org/pdf/2409.02718v1.pdf","comment":"Source code: https://github.com/liangzid/alignmentExtraction"},{"id":"http://arxiv.org/abs/2409.02712v1","updated":"2024-09-04T13:49:45Z","published":"2024-09-04T13:49:45Z","title":"A Data Selection Approach for Enhancing Low Resource Machine Translation\n Using Cross-Lingual Sentence Representations","summary":" Machine translation in low-resource language pairs faces significant\nchallenges due to the scarcity of parallel corpora and linguistic resources.\nThis study focuses on the case of English-Marathi language pairs, where\nexisting datasets are notably noisy, impeding the performance of machine\ntranslation models. To mitigate the impact of data quality issues, we propose a\ndata filtering approach based on cross-lingual sentence representations. Our\nmethodology leverages a multilingual SBERT model to filter out problematic\ntranslations in the training data. Specifically, we employ an IndicSBERT\nsimilarity model to assess the semantic equivalence between original and\ntranslated sentences, allowing us to retain linguistically correct translations\nwhile discarding instances with substantial deviations. The results demonstrate\na significant improvement in translation quality over the baseline\npost-filtering with IndicSBERT. This illustrates how cross-lingual sentence\nrepresentations can reduce errors in machine translation scenarios with limited\nresources. By integrating multilingual sentence BERT models into the\ntranslation pipeline, this research contributes to advancing machine\ntranslation techniques in low-resource environments. The proposed method not\nonly addresses the challenges in English-Marathi language pairs but also\nprovides a valuable framework for enhancing translation quality in other\nlow-resource language translation tasks.\n","authors":["Nidhi Kowtal","Tejas Deshpande","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2409.02712v1.pdf","comment":"Accepted at I2CT 2024"},{"id":"http://arxiv.org/abs/2405.04160v2","updated":"2024-09-04T13:29:56Z","published":"2024-05-07T09:55:05Z","title":"A Causal Explainable Guardrails for Large Language Models","summary":" Large Language Models (LLMs) have shown impressive performance in natural\nlanguage tasks, but their outputs can exhibit undesirable attributes or biases.\nExisting methods for steering LLMs toward desired attributes often assume\nunbiased representations and rely solely on steering prompts. However, the\nrepresentations learned from pre-training can introduce semantic biases that\ninfluence the steering process, leading to suboptimal results. 
We propose\nLLMGuardrail, a novel framework that incorporates causal analysis and\nadversarial learning to obtain unbiased steering representations in LLMs.\nLLMGuardrail systematically identifies and blocks the confounding effects of\nbiases, enabling the extraction of unbiased steering representations.\nAdditionally, it includes an explainable component that provides insights into\nthe alignment between the generated output and the desired direction.\nExperiments demonstrate LLMGuardrail's effectiveness in steering LLMs toward\ndesired attributes while mitigating biases. Our work contributes to the\ndevelopment of safe and reliable LLMs that align with desired attributes.\n","authors":["Zhixuan Chu","Yan Wang","Longfei Li","Zhibo Wang","Zhan Qin","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2405.04160v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2409.02690v1","updated":"2024-09-04T13:23:50Z","published":"2024-09-04T13:23:50Z","title":"Detecting Calls to Action in Multimodal Content: Analysis of the 2021\n German Federal Election Campaign on Instagram","summary":" This study investigates the automated classification of Calls to Action\n(CTAs) within the 2021 German Instagram election campaign to advance the\nunderstanding of mobilization in social media contexts. We analyzed over 2,208\nInstagram stories and 712 posts using fine-tuned BERT models and OpenAI's GPT-4\nmodels. The fine-tuned BERT model incorporating synthetic training data\nachieved a macro F1 score of 0.93, demonstrating a robust classification\nperformance. Our analysis revealed that 49.58% of Instagram posts and 10.64% of\nstories contained CTAs, highlighting significant differences in mobilization\nstrategies between these content types. Additionally, we found that FDP and the\nGreens had the highest prevalence of CTAs in posts, whereas CDU and CSU led in\nstory CTAs.\n","authors":["Michael Achmann-Denkler","Jakob Fehle","Mario Haim","Christian Wolff"],"pdf_url":"https://arxiv.org/pdf/2409.02690v1.pdf","comment":"Accepted Archival Paper for the CPSS Workshop at KONVENS 2024. Camera\n Ready Submission"},{"id":"http://arxiv.org/abs/2409.02686v1","updated":"2024-09-04T13:17:09Z","published":"2024-09-04T13:17:09Z","title":"Deconfounded Causality-aware Parameter-Efficient Fine-Tuning for\n Problem-Solving Improvement of LLMs","summary":" Large Language Models (LLMs) have demonstrated remarkable efficiency in\ntackling various tasks based on human instructions, but recent studies reveal\nthat these models often fail to achieve satisfactory results on questions\ninvolving reasoning, such as mathematics or physics questions. This phenomenon\nis usually attributed to the uncertainty regarding whether these models could\ngenuinely comprehend the knowledge embedded in the text or merely learn to\nreplicate the token distribution without a true understanding of the content.\nIn this paper, we delve into this problem and aim to enhance the reasoning\ncapabilities of LLMs. First, we investigate if the model has genuine reasoning\ncapabilities by visualizing the text generation process at the attention and\nrepresentation level. Then, we formulate the reasoning process of LLMs into a\ncausal framework, which provides a formal explanation of the problems we\nobserve in the visualization. 
Finally, building upon this causal framework, we\npropose Deconfounded Causal Adaptation (DCA), a novel parameter-efficient\nfine-tuning (PEFT) method to enhance the model's reasoning capabilities by\nencouraging the model to extract the general problem-solving skills and apply\nthese skills to different questions. Experiments show that our method\noutperforms the baseline consistently across multiple benchmarks, and with only\n1.2M tunable parameters, we achieve better or comparable results to other\nfine-tuning methods. This demonstrates the effectiveness and efficiency of our\nmethod in improving the overall accuracy and reliability of LLMs.\n","authors":["Ruoyu Wang","Xiaoxuan Li","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.02686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11850v2","updated":"2024-09-04T13:14:57Z","published":"2024-08-13T08:32:06Z","title":"Parallel Speculative Decoding with Adaptive Draft Length","summary":" Speculative decoding (SD), where an extra draft model is employed to provide\nmultiple \\textit{draft} tokens first and then the original target model\nverifies these tokens in parallel, has shown great power for LLM inference\nacceleration. However, existing SD methods suffer from the mutual waiting\nproblem, i.e., the target model gets stuck when the draft model is\n\\textit{guessing} tokens, and vice versa. This problem is directly incurred by\nthe asynchronous execution of the draft model and the target model, and is\nexacerbated due to the fixed draft length in speculative decoding. To address\nthese challenges, we propose a conceptually simple, flexible, and general\nframework to boost speculative decoding, namely \\textbf{P}arallel\nsp\\textbf{E}culative decoding with \\textbf{A}daptive d\\textbf{R}aft\n\\textbf{L}ength (PEARL). Specifically, PEARL proposes \\textit{pre-verify} to\nverify the first draft token in advance during the drafting phase, and\n\\textit{post-verify} to generate more draft tokens during the verification\nphase. PEARL parallels the drafting phase and the verification phase via\napplying the two strategies, and achieves adaptive draft length for different\nscenarios, which effectively alleviates the mutual waiting problem. Moreover,\nwe theoretically demonstrate that the mean accepted tokens of PEARL is more\nthan existing \\textit{draft-then-verify} works. Experiments on various text\ngeneration benchmarks demonstrate the effectiveness of our \\name, leading to a\nsuperior speedup performance up to \\textbf{3.79$\\times$} and\n\\textbf{1.52$\\times$}, compared to auto-regressive decoding and vanilla\nspeculative decoding, respectively.\n","authors":["Tianyu Liu","Yun Li","Qitan Lv","Kai Liu","Jianchen Zhu","Winston Hu"],"pdf_url":"https://arxiv.org/pdf/2408.11850v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02667v1","updated":"2024-09-04T12:48:30Z","published":"2024-09-04T12:48:30Z","title":"Creating Domain-Specific Translation Memories for Machine Translation\n Fine-tuning: The TRENCARD Bilingual Cardiology Corpus","summary":" This article investigates how translation memories (TM) can be created by\ntranslators or other language professionals in order to compile domain-specific\nparallel corpora , which can then be used in different scenarios, such as\nmachine translation training and fine-tuning, TM leveraging, and/or large\nlanguage model fine-tuning. 
The article introduces a semi-automatic TM\npreparation methodology leveraging primarily translation tools used by\ntranslators in favor of data quality and control by the translators. This\nsemi-automatic methodology is then used to build a cardiology-based Turkish ->\nEnglish corpus from bilingual abstracts of Turkish cardiology journals. The\nresulting corpus, called the TRENCARD Corpus, has approximately 800,000 source words\nand 50,000 sentences. Using this methodology, translators can build their\ncustom TMs in a reasonable time and use them in tasks requiring bilingual\ndata.\n","authors":["Gokhan Dogru"],"pdf_url":"https://arxiv.org/pdf/2409.02667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09979v2","updated":"2024-09-04T12:33:24Z","published":"2024-06-14T12:41:07Z","title":"HIRO: Hierarchical Information Retrieval Optimization","summary":" Retrieval-Augmented Generation (RAG) has revolutionized natural language\nprocessing by dynamically integrating external knowledge into Large Language\nModels (LLMs), addressing their limitation of static training datasets. Recent\nimplementations of RAG leverage hierarchical data structures, which organize\ndocuments at various levels of summarization and information density. This\ncomplexity, however, can cause LLMs to \"choke\" on information overload,\nnecessitating more sophisticated querying mechanisms. In this context, we\nintroduce Hierarchical Information Retrieval Optimization (HIRO), a novel\nquerying approach that employs a Depth-First Search (DFS)-based recursive\nsimilarity score calculation and branch pruning. This method uniquely minimizes\nthe context delivered to the LLM without informational loss, effectively\nmanaging the challenge of excessive data. HIRO's refined approach is validated\nby a 10.85% improvement in performance on the NarrativeQA dataset.\n","authors":["Krish Goel","Mahek Chandak"],"pdf_url":"https://arxiv.org/pdf/2406.09979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02649v1","updated":"2024-09-04T12:26:26Z","published":"2024-09-04T12:26:26Z","title":"OpenFact at CheckThat! 2024: Combining Multiple Attack Methods for\n Effective Adversarial Text Generation","summary":" This paper presents the experiments and results for the CheckThat! Lab at\nCLEF 2024 Task 6: Robustness of Credibility Assessment with Adversarial\nExamples (InCrediblAE). The primary objective of this task was to generate\nadversarial examples in five problem domains in order to evaluate the\nrobustness of widely used text classification methods (fine-tuned BERT, BiLSTM,\nand RoBERTa) when applied to credibility assessment issues.\n This study explores the application of ensemble learning to enhance\nadversarial attacks on natural language processing (NLP) models. We\nsystematically tested and refined several adversarial attack methods, including\nBERT-Attack, Genetic algorithms, TextFooler, and CLARE, on five datasets across\nvarious misinformation tasks. By developing modified versions of BERT-Attack\nand hybrid methods, we achieved significant improvements in attack\neffectiveness. 
Our results demonstrate the potential of modifying and\ncombining multiple methods to create more sophisticated and effective\nadversarial attack strategies, contributing to the development of more robust\nand secure systems.\n","authors":["Włodzimierz Lewoniewski","Piotr Stolarski","Milena Stróżyna","Elzbieta Lewańska","Aleksandra Wojewoda","Ewelina Księżniak","Marcin Sawiński"],"pdf_url":"https://arxiv.org/pdf/2409.02649v1.pdf","comment":"CLEF 2024 - Conference and Labs of the Evaluation Forum"},{"id":"http://arxiv.org/abs/2409.02645v1","updated":"2024-09-04T12:22:05Z","published":"2024-09-04T12:22:05Z","title":"A Survey on Emergent Language","summary":" The field of emergent language represents a novel area of research within the\ndomain of artificial intelligence, particularly within the context of\nmulti-agent reinforcement learning. Although the concept of studying language\nemergence is not new, early approaches were primarily concerned with explaining\nhuman language formation, with little consideration given to its potential\nutility for artificial agents. In contrast, studies based on reinforcement\nlearning aim to develop communicative capabilities in agents that are\ncomparable to or even superior to human language. Thus, they extend beyond the\nlearned statistical representations that are common in natural language\nprocessing research. This gives rise to a number of fundamental questions, from\nthe prerequisites for language emergence to the criteria for measuring its\nsuccess. This paper addresses these questions by providing a comprehensive\nreview of 181 scientific publications on emergent language in artificial\nintelligence. Its objective is to serve as a reference for researchers\ninterested in or proficient in the field. Consequently, the main contributions\nare the definition and overview of the prevailing terminology, the analysis of\nexisting evaluation methods and metrics, and the description of the identified\nresearch gaps.\n","authors":["Jannik Peters","Constantin Waubert de Puiseau","Hasan Tercan","Arya Gopikrishnan","Gustavo Adolpho Lucas De Carvalho","Christian Bitter","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.02645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00208v3","updated":"2024-09-04T11:48:04Z","published":"2023-11-01T00:38:26Z","title":"What Formal Languages Can Transformers Express? A Survey","summary":" As transformers have gained prominence in natural language processing, some\nresearchers have investigated theoretically what problems they can and cannot\nsolve, by treating problems as formal languages. Exploring such questions can\nhelp clarify the power of transformers relative to other models of computation,\ntheir fundamental capabilities and limits, and the impact of architectural\nchoices. 
Work in this subarea has made considerable progress in recent years.\nHere, we undertake a comprehensive survey of this work, documenting the diverse\nassumptions that underlie different results and providing a unified framework\nfor harmonizing seemingly contradictory findings.\n","authors":["Lena Strobl","William Merrill","Gail Weiss","David Chiang","Dana Angluin"],"pdf_url":"https://arxiv.org/pdf/2311.00208v3.pdf","comment":"One minor correction in {\\S}5.1"},{"id":"http://arxiv.org/abs/2308.07107v4","updated":"2024-09-04T11:39:56Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. Additionally, we explore promising directions, such as\nsearch agents, within this expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Haonan Chen","Zheng Liu","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v4.pdf","comment":"updated to version 3"},{"id":"http://arxiv.org/abs/2409.02617v1","updated":"2024-09-04T11:19:17Z","published":"2024-09-04T11:19:17Z","title":"PUB: Plot Understanding Benchmark and Dataset for Evaluating Large\n Language Models on Synthetic Visual Data Interpretation","summary":" The ability of large language models (LLMs) to interpret visual\nrepresentations of data is crucial for advancing their application in data\nanalysis and decision-making processes. This paper presents a novel synthetic\ndataset designed to evaluate the proficiency of LLMs in interpreting various\nforms of data visualizations, including plots like time series, histograms,\nviolins, boxplots, and clusters. 
Our dataset is generated using controlled\nparameters to ensure comprehensive coverage of potential real-world scenarios.\nWe employ multimodal text prompts with questions related to visual data in\nimages to benchmark several state-of-the-art models like ChatGPT or Gemini,\nassessing their understanding and interpretative accuracy.\n To ensure data integrity, our benchmark dataset is generated automatically,\nmaking it entirely new and free from prior exposure to the models being tested.\nThis strategy allows us to evaluate the models' ability to truly interpret and\nunderstand the data, eliminating the possibility of pre-learned responses, and\nallowing for an unbiased evaluation of the models' capabilities. We also\nintroduce quantitative metrics to assess the performance of the models,\nproviding a robust and comprehensive evaluation tool.\n Benchmarking several state-of-the-art LLMs with this dataset reveals varying\ndegrees of success, highlighting specific strengths and weaknesses in\ninterpreting diverse types of visual data. The results provide valuable\ninsights into the current capabilities of LLMs and identify key areas for\nimprovement. This work establishes a foundational benchmark for future research\nand development aimed at enhancing the visual interpretative abilities of\nlanguage models. In the future, improved LLMs with robust visual interpretation\nskills can significantly aid in automated data analysis, scientific research,\neducational tools, and business intelligence applications.\n","authors":["Aneta Pawelec","Victoria Sara Wesołowska","Zuzanna Bączek","Piotr Sankowski"],"pdf_url":"https://arxiv.org/pdf/2409.02617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00016v2","updated":"2024-09-04T10:42:35Z","published":"2024-07-28T15:45:08Z","title":"Towards a Universal Method for Meaningful Signal Detection","summary":" It is known that human speech and certain animal vocalizations can convey\nmeaningful content because we can decipher the content that a given utterance\ndoes convey. This paper explores an alternative approach to determining whether\na signal is meaningful, one that analyzes only the signal itself and is\nindependent of what the conveyed meaning might be. We devise a method that\ntakes a waveform as input and outputs a score indicating its degree of\n`meaningfulness`. We cluster contiguous portions of the input to minimize the\ntotal description length, and then take the length of the code of the assigned\ncluster labels as the meaningfulness score. We evaluate our method empirically,\nagainst several baselines, and show that it is the only one to give a high\nscore to human speech in various languages and with various speakers, a\nmoderate score to animal vocalizations from birds and orcas, and a low score to\nambient noise from various sources.\n","authors":["Louis Mahon"],"pdf_url":"https://arxiv.org/pdf/2408.00016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02596v1","updated":"2024-09-04T10:27:07Z","published":"2024-09-04T10:27:07Z","title":"An Analysis of Linear Complexity Attention Substitutes with BEST-RQ","summary":" Self-Supervised Learning (SSL) has proven to be effective in various domains,\nincluding speech processing. However, SSL is computationally and memory\nexpensive. This is in part due to the quadratic complexity of multi-head\nself-attention (MHSA). Alternatives for MHSA have been proposed and used in the\nspeech domain, but have yet to be investigated properly in an SSL setting. 
In\nthis work, we study the effects of replacing MHSA with recent state-of-the-art\nalternatives that have linear complexity, namely, HyperMixing, Fastformer,\nSummaryMixing, and Mamba. We evaluate these methods by looking at the speed,\nthe amount of VRAM consumed, and the performance on the SSL MP3S benchmark.\nResults show that these linear alternatives maintain competitive performance\ncompared to MHSA while, on average, decreasing VRAM consumption by around 20%\nto 60% and increasing speed from 7% to 65% for input sequences ranging from 20\nto 80 seconds.\n","authors":["Ryan Whetten","Titouan Parcollet","Adel Moumen","Marco Dinarelli","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2409.02596v1.pdf","comment":"Accepted in the IEEE Spoken Language Technology Workshop 2024"},{"id":"http://arxiv.org/abs/2405.04296v2","updated":"2024-09-04T10:23:04Z","published":"2024-05-07T13:11:37Z","title":"Open Implementation and Study of BEST-RQ for Speech Processing","summary":" Self-Supervised Learning (SSL) has proven to be useful in various speech\ntasks. However, these methods are generally very demanding in terms of data,\nmemory, and computational resources. BERT-based Speech pre-Training with\nRandom-projection Quantizer (BEST-RQ) is an SSL method that has shown great\nperformance on Automatic Speech Recognition (ASR) while being simpler than\nother SSL methods, such as wav2vec 2.0. Despite BEST-RQ's great performance,\ndetails are lacking in the original paper, such as the amount of GPU/TPU hours\nused in pre-training, and there is no official easy-to-use open-source\nimplementation. Furthermore, BEST-RQ has not been evaluated on other downstream\ntasks aside from ASR and speech translation. In this work, we describe a\nre-implementation of a Random-projection quantizer and perform a preliminary\nstudy with a comparison to wav2vec 2.0 on four downstream tasks. We discuss the\ndetails and differences of our implementation. We show that a random projection\nquantizer can achieve similar downstream performance as wav2vec 2.0 while\ndecreasing training time by over a factor of two.\n","authors":["Ryan Whetten","Titouan Parcollet","Marco Dinarelli","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2405.04296v2.pdf","comment":"Accepted in IEEE ICASSP 2024 workshop on Self-supervision in Audio,\n Speech and Beyond (SASB 2024)"},{"id":"http://arxiv.org/abs/2409.01227v2","updated":"2024-09-04T10:20:59Z","published":"2024-09-02T13:02:51Z","title":"Prompt Compression with Context-Aware Sentence Encoding for Fast and\n Improved LLM Inference","summary":" Large language models (LLMs) have triggered a new stream of research focusing\non compressing the context length to reduce the computational cost while\nensuring the retention of helpful information for LLMs to answer the given\nquestion. Token-based removal methods are one of the most prominent approaches\nin this direction, but risk losing the semantics of the context caused by\nintermediate token removal, especially under high compression ratios, while\nalso facing challenges in computational efficiency. In this work, we propose\ncontext-aware prompt compression (CPC), a sentence-level prompt compression\ntechnique whose key innovation is a novel context-aware sentence encoder\nthat provides a relevance score for each sentence for a given question. 
To\ntrain this encoder, we generate a new dataset consisting of questions,\npositives, and negative pairs where positives are sentences relevant to the\nquestion, while negatives are irrelevant context sentences. We train the\nencoder in a contrastive setup to learn context-aware sentence representations.\nOur method considerably outperforms prior works on prompt compression on\nbenchmark datasets and is up to 10.93x faster at inference compared to the best\ntoken-level compression method. We also find better improvement for shorter\nlength constraints in most benchmarks, showing the effectiveness of our\nproposed solution in the compression of relevant information in a shorter\ncontext. Finally, we release the code and the dataset for quick reproducibility\nand further development: https://github.com/Workday/cpc.\n","authors":["Barys Liskavets","Maxim Ushakov","Shuvendu Roy","Mark Klibanov","Ali Etemad","Shane Luke"],"pdf_url":"https://arxiv.org/pdf/2409.01227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06294v3","updated":"2024-09-04T10:16:57Z","published":"2023-05-10T16:31:35Z","title":"CADGE: Context-Aware Dialogue Generation Enhanced with Graph-Structured\n Knowledge Aggregation","summary":" Commonsense knowledge is crucial to many natural language processing tasks.\nExisting works usually incorporate graph knowledge with conventional graph\nneural networks (GNNs), leading to the text and graph knowledge encoding\nprocesses being separated in a serial pipeline. We argue that these separate\nrepresentation learning stages may be suboptimal for neural networks to learn\nthe overall context contained in both types of input knowledge. In this paper,\nwe propose a novel context-aware graph-attention model (Context-aware GAT),\nwhich can effectively incorporate global features of relevant knowledge graphs\nbased on a context-enhanced knowledge aggregation process. Specifically, our\nframework leverages a novel representation learning approach to process\nheterogeneous features - combining flattened graph knowledge with text. To the\nbest of our knowledge, this is the first attempt at hierarchically applying\ngraph knowledge aggregation on a connected subgraph in addition to contextual\ninformation to support commonsense dialogue generation. This framework shows\nsuperior performance compared to conventional GNN-based language frameworks.\nBoth automatic and human evaluation demonstrates that our proposed model has\nsignificant performance uplifts over state-of-the-art baselines.\n","authors":["Hongbo Zhang","Chen Tang","Tyler Loakman","Chenghua Lin","Stefan Goetze"],"pdf_url":"https://arxiv.org/pdf/2305.06294v3.pdf","comment":"Accepted by INLG 2024"},{"id":"http://arxiv.org/abs/2012.15079v2","updated":"2024-09-04T09:44:38Z","published":"2020-12-30T08:31:31Z","title":"Enhancing Sindhi Word Segmentation using Subword Representation Learning\n and Position-aware Self-attention","summary":" Sindhi word segmentation is a challenging task due to space omission and\ninsertion issues. The Sindhi language itself adds to this complexity. It's\ncursive and consists of characters with inherent joining and non-joining\nproperties, independent of word boundaries. Existing Sindhi word segmentation\nmethods rely on designing and combining hand-crafted features. However, these\nmethods have limitations, such as difficulty handling out-of-vocabulary words,\nlimited robustness for other languages, and inefficiency with large amounts of\nnoisy or raw text. 
Neural network-based models, in contrast, can automatically\ncapture word boundary information without requiring prior knowledge. In this\npaper, we propose a Subword-Guided Neural Word Segmenter (SGNWS) that addresses\nword segmentation as a sequence labeling task. The SGNWS model incorporates\nsubword representation learning through a bidirectional long short-term memory\nencoder, position-aware self-attention, and a conditional random field. Our\nempirical results demonstrate that the SGNWS model achieves state-of-the-art\nperformance in Sindhi word segmentation on six datasets.\n","authors":["Wazir Ali","Jay Kumar","Saifullah Tumrani","Redhwan Nour","Adeeb Noor","Zenglin Xu"],"pdf_url":"https://arxiv.org/pdf/2012.15079v2.pdf","comment":"Journal Paper, 14 pages"},{"id":"http://arxiv.org/abs/2409.02569v1","updated":"2024-09-04T09:39:07Z","published":"2024-09-04T09:39:07Z","title":"More is More: Addition Bias in Large Language Models","summary":" In this paper, we investigate the presence of additive bias in Large Language\nModels (LLMs), drawing a parallel to the cognitive bias observed in humans\nwhere individuals tend to favor additive over subtractive changes. Using a\nseries of controlled experiments, we tested various LLMs, including GPT-3.5\nTurbo, Claude 3.5 Sonnet, Mistral, Math$\\Sigma$tral, and Llama 3.1, on tasks\ndesigned to measure their propensity for additive versus subtractive\nmodifications. Our findings demonstrate a significant preference for additive\nchanges across all tested models. For example, in a palindrome creation task,\nLlama 3.1 favored adding letters 97.85% of the time over removing them.\nSimilarly, in a Lego tower balancing task, GPT-3.5 Turbo chose to add a brick\n76.38% of the time rather than remove one. In a text summarization task,\nMistral 7B produced longer summaries in 59.40% to 75.10% of cases when asked to\nimprove its own or others' writing. These results indicate that, similar to\nhumans, LLMs exhibit a marked additive bias, which might have implications when\nLLMs are used on a large scale. Additive bias might increase resource use and\nenvironmental impact, leading to higher economic costs due to overconsumption\nand waste. This bias should be considered in the development and application of\nLLMs to ensure balanced and efficient problem-solving approaches.\n","authors":["Luca Santagata","Cristiano De Nobili"],"pdf_url":"https://arxiv.org/pdf/2409.02569v1.pdf","comment":"25 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.00109v2","updated":"2024-09-04T09:27:05Z","published":"2023-07-26T18:58:53Z","title":"A Sentence is Worth a Thousand Pictures: Can Large Language Models\n Understand Hum4n L4ngu4ge and the W0rld behind W0rds?","summary":" Modern Artificial Intelligence applications show great potential for\nlanguage-related tasks that rely on next-word prediction. The current\ngeneration of Large Language Models (LLMs) has been linked to claims about\nhuman-like linguistic performance and their applications are hailed both as a\nstep towards artificial general intelligence and as a major advance in\nunderstanding the cognitive, and even neural basis of human language. To assess\nthese claims, first we analyze the contribution of LLMs as theoretically\ninformative representations of a target cognitive system vs. atheoretical\nmechanistic tools. 
Second, we evaluate the models' ability to see the bigger\npicture, through top-down feedback from higher levels of processing, which\nrequires grounding in previous expectations and past world experience. We\nhypothesize that since models lack grounded cognition, they cannot take\nadvantage of these features and instead solely rely on fixed associations\nbetween represented words and word vectors. To assess this, we designed and ran\na novel 'leet task' (l33t t4sk), which requires decoding sentences in which\nletters are systematically replaced by numbers. The results suggest that humans\nexcel in this task whereas models struggle, confirming our hypothesis. We\ninterpret the results by identifying the key abilities that are still missing\nfrom the current state of development of these models, which require solutions\nthat go beyond increased system scaling.\n","authors":["Evelina Leivada","Gary Marcus","Fritz Günther","Elliot Murphy"],"pdf_url":"https://arxiv.org/pdf/2308.00109v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09580v2","updated":"2024-09-04T09:20:04Z","published":"2022-12-19T16:13:52Z","title":"Exploring Interpretability of Independent Components of Word Embeddings\n with Automated Word Intruder Test","summary":" Independent Component Analysis (ICA) is an algorithm originally developed for\nfinding separate sources in a mixed signal, such as a recording of multiple\npeople in the same room speaking at the same time. Unlike Principal Component\nAnalysis (PCA), ICA permits the representation of a word as an unstructured set\nof features, without any particular feature being deemed more significant than\nthe others. In this paper, we used ICA to analyze word embeddings. We have\nfound that ICA can be used to find semantic features of the words, and these\nfeatures can easily be combined to search for words that satisfy the\ncombination. We show that most of the independent components represent such\nfeatures. To quantify the interpretability of the components, we use the word\nintruder test, performed both by humans and by large language models. We\npropose to use the automated version of the word intruder test as a fast and\ninexpensive way of quantifying vector interpretability without the need for\nhuman effort.\n","authors":["Tomáš Musil","David Mareček"],"pdf_url":"https://arxiv.org/pdf/2212.09580v2.pdf","comment":"Presented at LREC-COLING 2024, cite this version please:\n https://aclanthology.org/2024.lrec-main.605/"},{"id":"http://arxiv.org/abs/2409.02519v1","updated":"2024-09-04T08:27:43Z","published":"2024-09-04T08:27:43Z","title":"Language is Scary when Over-Analyzed: Unpacking Implied Misogynistic\n Reasoning with Argumentation Theory-Driven Prompts","summary":" We propose misogyny detection as an Argumentative Reasoning task and we\ninvestigate the capacity of large language models (LLMs) to understand the\nimplicit reasoning used to convey misogyny in both Italian and English. The\ncentral aim is to generate the missing reasoning link between a message and the\nimplied meanings encoding the misogyny. Our study uses argumentation theory as\na foundation to form a collection of prompts in both zero-shot and few-shot\nsettings. These prompts integrate different techniques, including\nchain-of-thought reasoning and augmented knowledge. 
Our findings show that LLMs\nfall short on reasoning capabilities about misogynistic comments and that they\nmostly rely on their implicit knowledge derived from internalized common\nstereotypes about women to generate implied assumptions, rather than on\ninductive reasoning.\n","authors":["Arianna Muti","Federico Ruggeri","Khalid Al-Khatib","Alberto Barrón-Cedeño","Tommaso Caselli"],"pdf_url":"https://arxiv.org/pdf/2409.02519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00084v2","updated":"2024-09-04T08:22:28Z","published":"2024-08-25T14:50:47Z","title":"Vision-Language and Large Language Model Performance in\n Gastroenterology: GPT, Claude, Llama, Phi, Mistral, Gemma, and Quantized\n Models","summary":" Background and Aims: This study evaluates the medical reasoning performance\nof large language models (LLMs) and vision language models (VLMs) in\ngastroenterology.\n Methods: We used 300 gastroenterology board exam-style multiple-choice\nquestions, 138 of which contain images to systematically assess the impact of\nmodel configurations and parameters and prompt engineering strategies utilizing\nGPT-3.5. Next, we assessed the performance of proprietary and open-source LLMs\n(versions), including GPT (3.5, 4, 4o, 4omini), Claude (3, 3.5), Gemini (1.0),\nMistral, Llama (2, 3, 3.1), Mixtral, and Phi (3), across different interfaces\n(web and API), computing environments (cloud and local), and model precisions\n(with and without quantization). Finally, we assessed accuracy using a\nsemiautomated pipeline.\n Results: Among the proprietary models, GPT-4o (73.7%) and Claude3.5-Sonnet\n(74.0%) achieved the highest accuracy, outperforming the top open-source\nmodels: Llama3.1-405b (64%), Llama3.1-70b (58.3%), and Mixtral-8x7b (54.3%).\nAmong the quantized open-source models, the 6-bit quantized Phi3-14b (48.7%)\nperformed best. The scores of the quantized models were comparable to those of\nthe full-precision models Llama2-7b, Llama2--13b, and Gemma2-9b. Notably, VLM\nperformance on image-containing questions did not improve when the images were\nprovided and worsened when LLM-generated captions were provided. In contrast, a\n10% increase in accuracy was observed when images were accompanied by\nhuman-crafted image descriptions.\n Conclusion: In conclusion, while LLMs exhibit robust zero-shot performance in\nmedical reasoning, the integration of visual data remains a challenge for VLMs.\nEffective deployment involves carefully determining optimal model\nconfigurations, encouraging users to consider either the high performance of\nproprietary models or the flexible adaptability of open-source models.\n","authors":["Seyed Amir Ahmad Safavi-Naini","Shuhaib Ali","Omer Shahab","Zahra Shahhoseini","Thomas Savage","Sara Rafiee","Jamil S Samaan","Reem Al Shabeeb","Farah Ladak","Jamie O Yang","Juan Echavarria","Sumbal Babar","Aasma Shaukat","Samuel Margolis","Nicholas P Tatonetti","Girish Nadkarni","Bara El Kurdi","Ali Soroush"],"pdf_url":"https://arxiv.org/pdf/2409.00084v2.pdf","comment":"Manuscript Pages: 34, Figures: 7, Tables: 2, Supplementary File\n Pages: 35, Data Transparency Statement: Code is available at:\n https://github.com/Sdamirsa/LLM-VLM-in-Gastroenterology . Study data from\n American College of Gastroenterology (ACG) are restricted and available upon\n request with ACG permission. 
Correction: updated abstract considering\n Llama3.1 results"},{"id":"http://arxiv.org/abs/2409.02481v1","updated":"2024-09-04T07:13:30Z","published":"2024-09-04T07:13:30Z","title":"Word and Phrase Features in Graph Convolutional Network for Automatic\n Question Classification","summary":" Effective question classification is crucial for AI-driven educational tools,\nenabling adaptive learning systems to categorize questions by skill area,\ndifficulty level, and competence. This classification not only supports\neducational diagnostics and analytics but also enhances complex tasks like\ninformation retrieval and question answering by associating questions with\nrelevant categories. Traditional methods, often based on word embeddings and\nconventional classifiers, struggle to capture the nuanced relationships in\nnatural language, leading to suboptimal performance. To address this, we\npropose a novel approach leveraging graph convolutional networks (GCNs), named\nPhrase Question-Graph Convolutional Network (PQ-GCN) to better model the\ninherent structure of questions. By representing questions as graphs -- where\nnodes signify words or phrases and edges denote syntactic or semantic\nrelationships -- our method allows GCNs to learn from the interconnected nature\nof language more effectively. Additionally, we explore the incorporation of\nphrase-based features to enhance classification accuracy, especially in\nlow-resource settings. Our findings demonstrate that GCNs, augmented with these\nfeatures, offer a promising solution for more accurate and context-aware\nquestion classification, bridging the gap between graph neural network research\nand practical educational applications.\n","authors":["Junyoung Lee","Ninad Dixit","Kaustav Chakrabarti","S. Supraja"],"pdf_url":"https://arxiv.org/pdf/2409.02481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02474v1","updated":"2024-09-04T06:46:31Z","published":"2024-09-04T06:46:31Z","title":"A Comparative Study on Large Language Models for Log Parsing","summary":" Background: Log messages provide valuable information about the status of\nsoftware systems. This information is provided in an unstructured fashion and\nautomated approaches are applied to extract relevant parameters. To ease this\nprocess, log parsing can be applied, which transforms log messages into\nstructured log templates. Recent advances in language models have led to\nseveral studies that apply ChatGPT to the task of log parsing with promising\nresults. However, the performance of other state-of-the-art large language\nmodels (LLMs) on the log parsing task remains unclear.\n Aims: In this study, we investigate the current capability of\nstate-of-the-art LLMs to perform log parsing.\n Method: We select six recent LLMs, including both paid proprietary (GPT-3.5,\nClaude 2.1) and four free-to-use open models, and compare their performance on\nsystem logs obtained from a selection of mature open-source projects. We design\ntwo different prompting approaches and apply the LLMs on 1, 354 log templates\nacross 16 different projects. We evaluate their effectiveness, in the number of\ncorrectly identified templates, and the syntactic similarity between the\ngenerated templates and the ground truth.\n Results: We found that free-to-use models are able to compete with paid\nmodels, with CodeLlama extracting 10% more log templates correctly than\nGPT-3.5. 
Moreover, we provide qualitative insights into the usability of\nlanguage models (e.g., how easy it is to use their responses).\n Conclusions: Our results reveal that some of the smaller, free-to-use LLMs\ncan considerably assist log parsing compared to their paid proprietary\ncompetitors, especially code-specialized models.\n","authors":["Merve Astekin","Max Hort","Leon Moonen"],"pdf_url":"https://arxiv.org/pdf/2409.02474v1.pdf","comment":"Accepted for publication in the 18th ACM/IEEE International Symposium\n on Empirical Software Engineering and Measurement (ESEM '24)"},{"id":"http://arxiv.org/abs/2409.02465v1","updated":"2024-09-04T06:28:22Z","published":"2024-09-04T06:28:22Z","title":"DetectiveQA: Evaluating Long-Context Reasoning on Detective Novels","summary":" With the rapid advancement of Large Language Models (LLMs), long-context\ninformation understanding and processing have become a hot topic in academia\nand industry. However, benchmarks for evaluating the ability of LLMs to handle\nlong-context information do not seem to have kept pace with the development of\nLLMs. Despite the emergence of various long-context evaluation benchmarks, the\ntypes of capability assessed are still limited, without new capability\ndimensions. In this paper, we introduce DetectiveQA, a narrative reasoning\nbenchmark featured with an average context length of over 100K tokens.\nDetectiveQA focuses on evaluating the long-context reasoning ability of LLMs,\nwhich not only requires a full understanding of context but also requires\nextracting important evidences from the context and reasoning according to\nextracted evidences to answer the given questions. This is a new dimension of\ncapability evaluation, which is more in line with the current intelligence\nlevel of LLMs. We use detective novels as data sources, which naturally have\nvarious reasoning elements. Finally, we manually annotated 600 questions in\nChinese and then also provided an English edition of the context information\nand questions. We evaluate many long-context LLMs on DetectiveQA, including\ncommercial and open-sourced models, and the results indicate that existing\nlong-context LLMs still require significant advancements to effectively process\ntrue long-context dependency questions.\n","authors":["Zhe Xu","Jiasheng Ye","Xiangyang Liu","Tianxiang Sun","Xiaoran Liu","Qipeng Guo","Linlin Li","Qun Liu","Xuanjing Huang","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2409.02465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15412v5","updated":"2024-09-04T05:12:54Z","published":"2024-03-05T08:29:36Z","title":"Towards Measuring and Modeling \"Culture\" in LLMs: A Survey","summary":" We present a survey of more than 90 recent papers that aim to study cultural\nrepresentation and inclusion in large language models (LLMs). We observe that\nnone of the studies explicitly define \"culture, which is a complex,\nmultifaceted concept; instead, they probe the models on some specially designed\ndatasets which represent certain aspects of \"culture\". We call these aspects\nthe proxies of culture, and organize them across two dimensions of demographic\nand semantic proxies. We also categorize the probing methods employed. Our\nanalysis indicates that only certain aspects of ``culture,'' such as values and\nobjectives, have been studied, leaving several other interesting and important\nfacets, especially the multitude of semantic domains (Thompson et al., 2020)\nand aboutness (Hershcovich et al., 2022), unexplored. 
Two other crucial gaps\nare the lack of robustness of probing techniques and situated studies on the\nimpact of cultural mis- and under-representation in LLM-based applications.\n","authors":["Muhammad Farid Adilazuarda","Sagnik Mukherjee","Pradhyumna Lavania","Siddhant Singh","Alham Fikri Aji","Jacki O'Neill","Ashutosh Modi","Monojit Choudhury"],"pdf_url":"https://arxiv.org/pdf/2403.15412v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16672v3","updated":"2024-09-04T05:09:00Z","published":"2024-08-29T16:21:00Z","title":"Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction\n Retriever","summary":" Multi-vector dense models, such as ColBERT, have proven highly effective in\ninformation retrieval. ColBERT's late interaction scoring approximates the\njoint query-document attention seen in cross-encoders while maintaining\ninference efficiency closer to traditional dense retrieval models, thanks to\nits bi-encoder architecture and recent optimizations in indexing and search. In\nthis paper, we introduce a novel architecture and a training framework to\nsupport long context window and multilingual retrieval. Our new model,\nJina-ColBERT-v2, demonstrates strong performance across a range of English and\nmultilingual retrieval tasks,\n","authors":["Rohan Jha","Bo Wang","Michael Günther","Georgios Mastrapas","Saba Sturua","Isabelle Mohr","Andreas Koukounas","Mohammad Kalim Akram","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.16672v3.pdf","comment":"8 pages, references at pp7,8; EMNLP workshop submission"},{"id":"http://arxiv.org/abs/2409.02449v1","updated":"2024-09-04T05:08:23Z","published":"2024-09-04T05:08:23Z","title":"What is lost in Normalization? Exploring Pitfalls in Multilingual ASR\n Model Evaluations","summary":" This paper explores the pitfalls in evaluating multilingual automatic speech\nrecognition (ASR) models, with a particular focus on Indic language scripts. We\ninvestigate the text normalization routine employed by leading ASR models,\nincluding OpenAI Whisper, Meta's MMS, Seamless, and Assembly AI's Conformer,\nand their unintended consequences on performance metrics. Our research reveals\nthat current text normalization practices, while aiming to standardize ASR\noutputs for fair comparison, by removing inconsistencies such as variations in\nspelling, punctuation, and special characters, are fundamentally flawed when\napplied to Indic scripts. Through empirical analysis using text similarity\nscores and in-depth linguistic examination, we demonstrate that these flaws\nlead to artificially inflated performance metrics for Indic languages. We\nconclude by proposing a shift towards developing normalization routines that\nleverage native linguistic expertise, ensuring more robust and accurate\nevaluations of multilingual ASR models.\n","authors":["Kavya Manohar","Leena G Pillai"],"pdf_url":"https://arxiv.org/pdf/2409.02449v1.pdf","comment":"Sumbitted to EMNLP 2024"},{"id":"http://arxiv.org/abs/2409.02428v1","updated":"2024-09-04T04:15:14Z","published":"2024-09-04T04:15:14Z","title":"Large Language Models as Efficient Reward Function Searchers for\n Custom-Environment Multi-Objective Reinforcement Learning","summary":" Leveraging large language models (LLMs) for designing reward functions\ndemonstrates significant potential. However, achieving effective design and\nimprovement of reward functions in reinforcement learning (RL) tasks with\ncomplex custom environments and multiple requirements presents considerable\nchallenges. 
In this paper, we enable LLMs to be effective white-box searchers,\nhighlighting their advanced semantic understanding capabilities. Specifically,\nwe generate reward components for each explicit user requirement and employ the\nreward critic to identify the correct code form. Then, LLMs assign weights to\nthe reward components to balance their values and iteratively search and\noptimize these weights based on the context provided by the training log\nanalyzer, while adaptively determining the search step size. We applied the\nframework to an underwater information collection RL task without direct human\nfeedback or reward examples (zero-shot). The reward critic successfully correct\nthe reward code with only one feedback for each requirement, effectively\npreventing irreparable errors that can occur when reward function feedback is\nprovided in aggregate. The effective initialization of weights enables the\nacquisition of different reward functions within the Pareto solution set\nwithout weight search. Even in the case where a weight is 100 times off, fewer\nthan four iterations are needed to obtain solutions that meet user\nrequirements. The framework also works well with most prompts utilizing GPT-3.5\nTurbo, since it does not require advanced numerical understanding or\ncalculation.\n","authors":["Guanwen Xie","Jingzehua Xu","Yiyuan Yang","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00369v2","updated":"2024-09-04T03:50:38Z","published":"2024-08-31T07:10:16Z","title":"An Empirical Study on Information Extraction using Large Language Models","summary":" Human-like large language models (LLMs), especially the most powerful and\npopular ones in OpenAI's GPT family, have proven to be very helpful for many\nnatural language processing (NLP) related tasks. Therefore, various attempts\nhave been made to apply LLMs to information extraction (IE), which is a\nfundamental NLP task that involves extracting information from unstructured\nplain text. To demonstrate the latest representative progress in LLMs'\ninformation extraction ability, we assess the information extraction ability of\nGPT-4 (the latest version of GPT at the time of writing this paper) from four\nperspectives: Performance, Evaluation Criteria, Robustness, and Error Types.\nOur results suggest a visible performance gap between GPT-4 and\nstate-of-the-art (SOTA) IE methods. To alleviate this problem, considering the\nLLMs' human-like characteristics, we propose and analyze the effects of a\nseries of simple prompt-based methods, which can be generalized to other LLMs\nand NLP tasks. Rich experiments show our methods' effectiveness and some of\ntheir remaining issues in improving GPT-4's information extraction ability.\n","authors":["Ridong Han","Chaohao Yang","Tao Peng","Prayag Tiwari","Xiang Wan","Lu Liu","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.00369v2.pdf","comment":"This article has an original arxiv version entitled \"Is Information\n Extraction Solved by ChatGPT? 
An Analysis of Performance, Evaluation\n Criteria, Robustness and Errors\", whose url link is arXiv/2305.14450"},{"id":"http://arxiv.org/abs/2409.02413v1","updated":"2024-09-04T03:39:23Z","published":"2024-09-04T03:39:23Z","title":"Abstractive Text Summarization: State of the Art, Challenges, and\n Improvements","summary":" Specifically focusing on the landscape of abstractive text summarization, as\nopposed to extractive techniques, this survey presents a comprehensive\noverview, delving into state-of-the-art techniques, prevailing challenges, and\nprospective research directions. We categorize the techniques into traditional\nsequence-to-sequence models, pre-trained large language models, reinforcement\nlearning, hierarchical methods, and multi-modal summarization. Unlike prior\nworks that did not examine complexities, scalability and comparisons of\ntechniques in detail, this review takes a comprehensive approach encompassing\nstate-of-the-art methods, challenges, solutions, comparisons, limitations and\ncharts out future improvements - providing researchers an extensive overview to\nadvance abstractive summarization research. We provide vital comparison tables\nacross techniques categorized - offering insights into model complexity,\nscalability and appropriate applications. The paper highlights challenges such\nas inadequate meaning representation, factual consistency, controllable text\nsummarization, cross-lingual summarization, and evaluation metrics, among\nothers. Solutions leveraging knowledge incorporation and other innovative\nstrategies are proposed to address these challenges. The paper concludes by\nhighlighting emerging research areas like factual inconsistency,\ndomain-specific, cross-lingual, multilingual, and long-document summarization,\nas well as handling noisy data. Our objective is to provide researchers and\npractitioners with a structured overview of the domain, enabling them to better\nunderstand the current landscape and identify potential areas for further\nresearch and improvement.\n","authors":["Hassan Shakil","Ahmad Farooq","Jugal Kalita"],"pdf_url":"https://arxiv.org/pdf/2409.02413v1.pdf","comment":"9 Tables, 7 Figures"},{"id":"http://arxiv.org/abs/2409.00128v2","updated":"2024-09-04T03:21:07Z","published":"2024-08-29T05:18:50Z","title":"Can AI Replace Human Subjects? A Large-Scale Replication of\n Psychological Experiments with LLMs","summary":" Artificial Intelligence (AI) is increasingly being integrated into scientific\nresearch, particularly in the social sciences, where understanding human\nbehavior is critical. Large Language Models (LLMs) like GPT-4 have shown\npromise in replicating human-like responses in various psychological\nexperiments. However, the extent to which LLMs can effectively replace human\nsubjects across diverse experimental contexts remains unclear. Here, we conduct\na large-scale study replicating 154 psychological experiments from top social\nscience journals with 618 main effects and 138 interaction effects using GPT-4\nas a simulated participant. We find that GPT-4 successfully replicates 76.0\npercent of main effects and 47.0 percent of interaction effects observed in the\noriginal studies, closely mirroring human responses in both direction and\nsignificance. However, only 19.44 percent of GPT-4's replicated confidence\nintervals contain the original effect sizes, with the majority of replicated\neffect sizes exceeding the 95 percent confidence interval of the original\nstudies. 
Additionally, there is a 71.6 percent rate of unexpected significant\nresults where the original studies reported null findings, suggesting potential\noverestimation or false positives. Our results demonstrate the potential of\nLLMs as powerful tools in psychological research but also emphasize the need\nfor caution in interpreting AI-driven findings. While LLMs can complement human\nstudies, they cannot yet fully replace the nuanced insights provided by human\nsubjects.\n","authors":["Ziyan Cui","Ning Li","Huaikang Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.00128v2.pdf","comment":"5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.02393v1","updated":"2024-09-04T02:41:44Z","published":"2024-09-04T02:41:44Z","title":"Determination of language families using deep learning","summary":" We use a c-GAN (convolutional generative adversarial) neural network to\nanalyze transliterated text fragments of extant, dead comprehensible, and one\ndead non-deciphered (Cypro-Minoan) language to establish linguistic affinities.\nThe paper is agnostic with respect to translation and/or deciphering. However,\nthere is hope that the proposed approach can be useful for decipherment with\nmore sophisticated neural network techniques.\n","authors":["Peter B. Lerner"],"pdf_url":"https://arxiv.org/pdf/2409.02393v1.pdf","comment":"First draft. Comments are welcome"},{"id":"http://arxiv.org/abs/2409.02387v1","updated":"2024-09-04T02:30:12Z","published":"2024-09-04T02:30:12Z","title":"Large Language Models and Cognitive Science: A Comprehensive Review of\n Similarities, Differences, and Challenges","summary":" This comprehensive review explores the intersection of Large Language Models\n(LLMs) and cognitive science, examining similarities and differences between\nLLMs and human cognitive processes. We analyze methods for evaluating LLMs\ncognitive abilities and discuss their potential as cognitive models. The review\ncovers applications of LLMs in various cognitive fields, highlighting insights\ngained for cognitive science research. We assess cognitive biases and\nlimitations of LLMs, along with proposed methods for improving their\nperformance. The integration of LLMs with cognitive architectures is examined,\nrevealing promising avenues for enhancing artificial intelligence (AI)\ncapabilities. Key challenges and future research directions are identified,\nemphasizing the need for continued refinement of LLMs to better align with\nhuman cognition. This review provides a balanced perspective on the current\nstate and future potential of LLMs in advancing our understanding of both\nartificial and human intelligence.\n","authors":["Qian Niu","Junyu Liu","Ziqian Bi","Pohsun Feng","Benji Peng","Keyu Chen"],"pdf_url":"https://arxiv.org/pdf/2409.02387v1.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2408.16586v2","updated":"2024-09-04T02:24:08Z","published":"2024-08-29T14:49:13Z","title":"Enhancing Dialogue Generation in Werewolf Game Through Situation\n Analysis and Persuasion Strategies","summary":" Recent advancements in natural language processing, particularly with large\nlanguage models (LLMs) like GPT-4, have significantly enhanced dialogue\nsystems, enabling them to generate more natural and fluent conversations.\nDespite these improvements, challenges persist, such as managing continuous\ndialogues, memory retention, and minimizing hallucinations. 
The AIWolfDial2024\naddresses these challenges by employing the Werewolf Game, an incomplete\ninformation game, to test the capabilities of LLMs in complex interactive\nenvironments. This paper introduces a LLM-based Werewolf Game AI, where each\nrole is supported by situation analysis to aid response generation.\nAdditionally, for the werewolf role, various persuasion strategies, including\nlogical appeal, credibility appeal, and emotional appeal, are employed to\neffectively persuade other players to align with its actions.\n","authors":["Zhiyang Qi","Michimasa Inaba"],"pdf_url":"https://arxiv.org/pdf/2408.16586v2.pdf","comment":"Accepted to the AIWolfDial2024 workshop at INLG 2024"},{"id":"http://arxiv.org/abs/2409.02384v1","updated":"2024-09-04T02:20:59Z","published":"2024-09-04T02:20:59Z","title":"STAB: Speech Tokenizer Assessment Benchmark","summary":" Representing speech as discrete tokens provides a framework for transforming\nspeech into a format that closely resembles text, thus enabling the use of\nspeech as an input to the widely successful large language models (LLMs).\nCurrently, while several speech tokenizers have been proposed, there is\nambiguity regarding the properties that are desired from a tokenizer for\nspecific downstream tasks and its overall generalizability. Evaluating the\nperformance of tokenizers across different downstream tasks is a\ncomputationally intensive effort that poses challenges for scalability. To\ncircumvent this requirement, we present STAB (Speech Tokenizer Assessment\nBenchmark), a systematic evaluation framework designed to assess speech\ntokenizers comprehensively and shed light on their inherent characteristics.\nThis framework provides a deeper understanding of the underlying mechanisms of\nspeech tokenization, thereby offering a valuable resource for expediting the\nadvancement of future tokenizer models and enabling comparative analysis using\na standardized benchmark. We evaluate the STAB metrics and correlate this with\ndownstream task performance across a range of speech tasks and tokenizer\nchoices.\n","authors":["Shikhar Vashishth","Harman Singh","Shikhar Bharadwaj","Sriram Ganapathy","Chulayuth Asawaroengchai","Kartik Audhkhasi","Andrew Rosenberg","Ankur Bapna","Bhuvana Ramabhadran"],"pdf_url":"https://arxiv.org/pdf/2409.02384v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2404.04298v2","updated":"2024-09-04T02:00:58Z","published":"2024-04-04T20:27:37Z","title":"SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated\n Responses","summary":" Can LLMs consistently improve their previous outputs for better results? For\nthis to be true, LLMs would need to be better at discriminating among\npreviously-generated alternatives, than generating initial responses. We\nexplore the validity of this hypothesis in practice. We first formulate a\nunified framework that allows us to compare the generative and discriminative\ncapability of any model on any task. In our resulting experimental analysis of\nseveral open-source and industrial LLMs, we observe that models are not\nreliably better at discriminating among previously-generated alternatives than\ngenerating initial responses. 
This finding challenges the notion that LLMs may\nbe able to enhance their performance only through their own judgment.\n","authors":["Dongwei Jiang","Jingyu Zhang","Orion Weller","Nathaniel Weir","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2404.04298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02375v1","updated":"2024-09-04T01:51:37Z","published":"2024-09-04T01:51:37Z","title":"How Privacy-Savvy Are Large Language Models? A Case Study on Compliance\n and Privacy Technical Review","summary":" The recent advances in large language models (LLMs) have significantly\nexpanded their applications across various fields such as language generation,\nsummarization, and complex question answering. However, their application to\nprivacy compliance and technical privacy reviews remains under-explored,\nraising critical concerns about their ability to adhere to global privacy\nstandards and protect sensitive user data. This paper seeks to address this gap\nby providing a comprehensive case study evaluating LLMs' performance in\nprivacy-related tasks such as privacy information extraction (PIE), legal and\nregulatory key point detection (KPD), and question answering (QA) with respect\nto privacy policies and data protection regulations. We introduce a Privacy\nTechnical Review (PTR) framework, highlighting its role in mitigating privacy\nrisks during the software development life-cycle. Through an empirical\nassessment, we investigate the capacity of several prominent LLMs, including\nBERT, GPT-3.5, GPT-4, and custom models, in executing privacy compliance checks\nand technical privacy reviews. Our experiments benchmark the models across\nmultiple dimensions, focusing on their precision, recall, and F1-scores in\nextracting privacy-sensitive information and detecting key regulatory\ncompliance points. While LLMs show promise in automating privacy reviews and\nidentifying regulatory discrepancies, significant gaps persist in their ability\nto fully comply with evolving legal standards. We provide actionable\nrecommendations for enhancing LLMs' capabilities in privacy compliance,\nemphasizing the need for robust model improvements and better integration with\nlegal and regulatory requirements. This study underscores the growing\nimportance of developing privacy-aware LLMs that can both support businesses in\ncompliance efforts and safeguard user privacy rights.\n","authors":["Xichou Zhu","Yang Liu","Zhou Shen","Yi Liu","Min Li","Yujun Chen","Benzi John","Zhenzhen Ma","Tao Hu","Bolong Yang","Manman Wang","Zongxing Xie","Peng Liu","Dan Cai","Junhui Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02375v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.02370v1","updated":"2024-09-04T01:40:20Z","published":"2024-09-04T01:40:20Z","title":"Do Large Language Models Possess Sensitive to Sentiment?","summary":" Large Language Models (LLMs) have recently displayed their extraordinary\ncapabilities in language understanding. However, how to comprehensively assess\nthe sentiment capabilities of LLMs continues to be a challenge. This paper\ninvestigates the ability of LLMs to detect and react to sentiment in text\nmodal. As the integration of LLMs into diverse applications is on the rise, it\nbecomes highly critical to comprehend their sensitivity to emotional tone, as\nit can influence the user experience and the efficacy of sentiment-driven\ntasks. 
We conduct a series of experiments to evaluate the performance of\nseveral prominent LLMs in identifying and responding appropriately to\nsentiments like positive, negative, and neutral emotions. The models' outputs\nare analyzed across various sentiment benchmarks, and their responses are\ncompared with human evaluations. Our discoveries indicate that although LLMs\nshow a basic sensitivity to sentiment, there are substantial variations in\ntheir accuracy and consistency, emphasizing the requirement for further\nenhancements in their training processes to better capture subtle emotional\ncues. Take an example in our findings, in some cases, the models might wrongly\nclassify a strongly positive sentiment as neutral, or fail to recognize sarcasm\nor irony in the text. Such misclassifications highlight the complexity of\nsentiment analysis and the areas where the models need to be refined. Another\naspect is that different LLMs might perform differently on the same set of\ndata, depending on their architecture and training datasets. This variance\ncalls for a more in-depth study of the factors that contribute to the\nperformance differences and how they can be optimized.\n","authors":["Yang Liu","Xichou Zhu","Zhou Shen","Yi Liu","Min Li","Yujun Chen","Benzi John","Zhenzhen Ma","Tao Hu","Zhiyang Xu","Wei Luo","Junhui Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02370v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.02361v1","updated":"2024-09-04T01:14:04Z","published":"2024-09-04T01:14:04Z","title":"Diversify-verify-adapt: Efficient and Robust Retrieval-Augmented\n Ambiguous Question Answering","summary":" The retrieval augmented generation (RAG) framework addresses an ambiguity in\nuser queries in QA systems by retrieving passages that cover all plausible\ninterpretations and generating comprehensive responses based on the passages.\nHowever, our preliminary studies reveal that a single retrieval process often\nsuffers from low quality results, as the retrieved passages frequently fail to\ncapture all plausible interpretations. Although the iterative RAG approach has\nbeen proposed to address this problem, it comes at the cost of significantly\nreduced efficiency. To address these issues, we propose the\ndiversify-verify-adapt (DIVA) framework. DIVA first diversifies the retrieved\npassages to encompass diverse interpretations. Subsequently, DIVA verifies the\nquality of the passages and adapts the most suitable approach tailored to their\nquality. This approach improves the QA systems accuracy and robustness by\nhandling low quality retrieval issue in ambiguous questions, while enhancing\nefficiency.\n","authors":["Yeonjun In","Sungchul Kim","Ryan A. Rossi","Md Mehrab Tanjim","Tong Yu","Ritwik Sinha","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2409.02361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15221v2","updated":"2024-09-04T00:58:59Z","published":"2024-08-27T17:33:30Z","title":"LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet","summary":" Recent large language model (LLM) defenses have greatly improved models'\nability to refuse harmful queries, even when adversarially attacked. However,\nLLM defenses are primarily evaluated against automated adversarial attacks in a\nsingle turn of conversation, an insufficient threat model for real-world\nmalicious use. 
We demonstrate that multi-turn human jailbreaks uncover\nsignificant vulnerabilities, exceeding 70% attack success rate (ASR) on\nHarmBench against defenses that report single-digit ASRs with automated\nsingle-turn attacks. Human jailbreaks also reveal vulnerabilities in machine\nunlearning defenses, successfully recovering dual-use biosecurity knowledge\nfrom unlearned models. We compile these results into Multi-Turn Human\nJailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks.\nWe publicly release MHJ alongside a compendium of jailbreak tactics developed\nacross dozens of commercial red teaming engagements, supporting research\ntowards stronger LLM defenses.\n","authors":["Nathaniel Li","Ziwen Han","Ian Steneker","Willow Primack","Riley Goodside","Hugh Zhang","Zifan Wang","Cristina Menghini","Summer Yue"],"pdf_url":"https://arxiv.org/pdf/2408.15221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06266v4","updated":"2024-09-04T00:22:45Z","published":"2024-08-12T16:24:51Z","title":"Anchored Preference Optimization and Contrastive Revisions: Addressing\n Underspecification in Alignment","summary":" Large Language Models (LLMs) are often aligned using contrastive alignment\nobjectives and preference pair datasets. The interaction between model, paired\ndata, and objective makes alignment a complicated procedure, sometimes\nproducing subpar results. We study this and find that (i) preference data gives\na better learning signal when the underlying responses are contrastive, and\n(ii) alignment objectives lead to better performance when they specify more\ncontrol over the model during training. Based on these insights, we introduce\nContrastive Learning from AI Revisions (CLAIR), a data-creation method which\nleads to more contrastive preference pairs, and Anchored Preference\nOptimization (APO), a controllable and more stable alignment objective. We\nalign Llama-3-8B-Instruct using various comparable datasets and alignment\nobjectives and measure MixEval-Hard scores, which correlate highly with human\njudgments. The CLAIR preferences lead to the strongest performance out of all\ndatasets, and APO consistently outperforms less controllable objectives. Our\nbest model, trained on 32K CLAIR preferences with APO, improves\nLlama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code\nis available at https://github.com/ContextualAI/CLAIR_and_APO.\n","authors":["Karel D'Oosterlinck","Winnie Xu","Chris Develder","Thomas Demeester","Amanpreet Singh","Christopher Potts","Douwe Kiela","Shikib Mehri"],"pdf_url":"https://arxiv.org/pdf/2408.06266v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02343v1","updated":"2024-09-04T00:10:36Z","published":"2024-09-04T00:10:36Z","title":"NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for\n Retrieval","summary":" $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval)\nfrom pre-trained embedding models is the predominant retrieval method for text\nand images, as well as Retrieval-Augmented Generation (RAG) pipelines. In\npractice, application developers often fine-tune the embeddings to improve\ntheir accuracy on the dataset and query workload in hand. Existing approaches\neither fine-tune the pre-trained model itself or, more efficiently, but at the\ncost of accuracy, train adaptor models to transform the output of the\npre-trained model. 
We present NUDGE, a family of novel non-parametric embedding\nfine-tuning approaches that are significantly more accurate and efficient than\nboth sets of existing approaches. NUDGE directly modifies the embeddings of\ndata records to maximize the accuracy of $k$-NN retrieval. We present a\nthorough theoretical and experimental study of NUDGE's non-parametric approach.\nWe show that even though the underlying problem is NP-Hard, constrained\nvariations can be solved efficiently. These constraints additionally ensure\nthat the changes to the embeddings are modest, avoiding large distortions to\nthe semantics learned during pre-training. In experiments across five\npre-trained models and nine standard text and image retrieval datasets, NUDGE\nruns in minutes and often improves NDCG@10 by more than 10% over existing\nfine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase\nin accuracy and runs 200x and 3x faster, respectively, over fine-tuning the\npre-trained model and training adaptors.\n","authors":["Sepanta Zeighami","Zac Wellmer","Aditya Parameswaran"],"pdf_url":"https://arxiv.org/pdf/2409.02343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05074v3","updated":"2024-09-04T23:47:08Z","published":"2024-08-09T14:02:24Z","title":"RT-Surv: Improving Mortality Prediction After Radiotherapy with Large\n Language Model Structuring of Large-Scale Unstructured Electronic Health\n Records","summary":" Accurate patient selection is critical in radiotherapy (RT) to prevent\nineffective treatments. Traditional survival prediction models, relying on\nstructured data, often lack precision. This study explores the potential of\nlarge language models (LLMs) to structure unstructured electronic health record\n(EHR) data, thereby improving survival prediction accuracy through\ncomprehensive clinical information integration. Data from 34,276 patients\ntreated with RT at Yonsei Cancer Center between 2013 and 2023 were analyzed,\nencompassing both structured and unstructured data. An open-source LLM was used\nto structure the unstructured EHR data via single-shot learning, with its\nperformance compared against a domain-specific medical LLM and a smaller\nvariant. Survival prediction models were developed using statistical, machine\nlearning, and deep learning approaches, incorporating both structured and\nLLM-structured data. Clinical experts evaluated the accuracy of the\nLLM-structured data. The open-source LLM achieved 87.5% accuracy in structuring\nunstructured EHR data without additional training, significantly outperforming\nthe domain-specific medical LLM, which reached only 35.8% accuracy. Larger LLMs\nwere more effective, particularly in extracting clinically relevant features\nlike general condition and disease extent, which closely correlated with\npatient survival. Incorporating LLM-structured clinical features into survival\nprediction models significantly improved accuracy, with the C-index of deep\nlearning models increasing from 0.737 to 0.820. These models also became more\ninterpretable by emphasizing clinically significant factors. 
This study shows\nthat general-domain LLMs, even without specific medical training, can\neffectively structure large-scale unstructured EHR data, substantially\nenhancing the accuracy and interpretability of clinical predictive models.\n","authors":["Sangjoon Park","Chan Woo Wee","Seo Hee Choi","Kyung Hwan Kim","Jee Suk Chang","Hong In Yoon","Ik Jae Lee","Yong Bae Kim","Jaeho Cho","Ki Chang Keum","Chang Geol Lee","Hwa Kyung Byun","Woong Sub Koom"],"pdf_url":"https://arxiv.org/pdf/2408.05074v3.pdf","comment":"23 pages, 2 tables, 4 figures"},{"id":"http://arxiv.org/abs/2409.03131v1","updated":"2024-09-04T23:45:10Z","published":"2024-09-04T23:45:10Z","title":"Well, that escalated quickly: The Single-Turn Crescendo Attack (STCA)","summary":" This paper explores a novel approach to adversarial attacks on large language\nmodels (LLM): the Single-Turn Crescendo Attack (STCA). The STCA builds upon the\nmulti-turn crescendo attack established by Mark Russinovich, Ahmed Salem, Ronen\nEldan. Traditional multi-turn adversarial strategies gradually escalate the\ncontext to elicit harmful or controversial responses from LLMs. However, this\npaper introduces a more efficient method where the escalation is condensed into\na single interaction. By carefully crafting the prompt to simulate an extended\ndialogue, the attack bypasses typical content moderation systems, leading to\nthe generation of responses that would normally be filtered out. I demonstrate\nthis technique through a few case studies. The results highlight\nvulnerabilities in current LLMs and underscore the need for more robust\nsafeguards. This work contributes to the broader discourse on responsible AI\n(RAI) safety and adversarial testing, providing insights and practical examples\nfor researchers and developers. This method is unexplored in the literature,\nmaking it a novel contribution to the field.\n","authors":["Alan Aqrawi"],"pdf_url":"https://arxiv.org/pdf/2409.03131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03115v1","updated":"2024-09-04T22:47:33Z","published":"2024-09-04T22:47:33Z","title":"Probing self-attention in self-supervised speech models for\n cross-linguistic differences","summary":" Speech models have gained traction thanks to increase in accuracy from novel\ntransformer architectures. While this impressive increase in performance across\nautomatic speech recognition (ASR) benchmarks is noteworthy, there is still\nmuch that is unknown about the use of attention mechanisms for speech-related\ntasks. For example, while it is assumed that these models are learning\nlanguage-independent (i.e., universal) speech representations, there has not\nyet been an in-depth exploration of what it would mean for the models to be\nlanguage-independent. In the current paper, we explore this question within the\nrealm of self-attention mechanisms of one small self-supervised speech\ntransformer model (TERA). We find that even with a small model, the attention\nheads learned are diverse ranging from almost entirely diagonal to almost\nentirely global regardless of the training language. 
We highlight some notable\ndifferences in attention patterns between Turkish and English and demonstrate\nthat the models do learn important phonological information during pretraining.\nWe also present a head ablation study which shows that models across languages\nprimarily rely on diagonal heads to classify phonemes.\n","authors":["Sai Gopinath","Joselyn Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2409.03115v1.pdf","comment":"10 pages, 18 figures"},{"id":"http://arxiv.org/abs/2409.00557v2","updated":"2024-09-04T20:34:27Z","published":"2024-08-31T23:06:12Z","title":"Learning to Ask: When LLMs Meet Unclear Instruction","summary":" Equipped with the capability to call functions, modern large language models\n(LLMs) can leverage external tools for addressing a range of tasks unattainable\nthrough language skills alone. However, the effective execution of these tools\nrelies heavily not just on the advanced capabilities of LLMs but also on\nprecise user instructions, which often cannot be ensured in the real world. To\nevaluate the performance of LLMs tool-use under imperfect instructions, we\nmeticulously examine the real-world instructions queried from users, analyze\nthe error patterns, and build a challenging tool-use benchmark called Noisy\nToolBench (NoisyToolBench). We find that due to the next-token prediction\ntraining objective, LLMs tend to arbitrarily generate the missed argument,\nwhich may lead to hallucinations and risks. To address this issue, we propose a\nnovel framework, Ask-when-Needed (AwN), which prompts LLMs to ask questions to\nusers whenever they encounter obstacles due to unclear instructions. Moreover,\nto reduce the manual labor involved in user-LLM interaction and assess LLMs\nperformance in tool utilization from both accuracy and efficiency perspectives,\nwe design an automated evaluation tool named ToolEvaluator. Our experiments\ndemonstrate that the AwN significantly outperforms existing frameworks for tool\nlearning in the NoisyToolBench. We will release all related code and datasets\nto support future research.\n","authors":["Wenxuan Wang","Juluan Shi","Chaozheng Wang","Cheryl Lee","Youliang Yuan","Jen-tse Huang","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2409.00557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.00984v2","updated":"2024-09-04T20:22:41Z","published":"2024-06-03T04:36:38Z","title":"Predicting Drug-Gene Relations via Analogy Tasks with Word Embeddings","summary":" Natural language processing (NLP) is utilized in a wide range of fields,\nwhere words in text are typically transformed into feature vectors called\nembeddings. BioConceptVec is a specific example of embeddings tailored for\nbiology, trained on approximately 30 million PubMed abstracts using models such\nas skip-gram. Generally, word embeddings are known to solve analogy tasks\nthrough simple vector arithmetic. For instance, $\\mathrm{\\textit{king}} -\n\\mathrm{\\textit{man}} + \\mathrm{\\textit{woman}}$ predicts\n$\\mathrm{\\textit{queen}}$. In this study, we demonstrate that BioConceptVec\nembeddings, along with our own embeddings trained on PubMed abstracts, contain\ninformation about drug-gene relations and can predict target genes from a given\ndrug through analogy computations. We also show that categorizing drugs and\ngenes using biological pathways improves performance. Furthermore, we\nillustrate that vectors derived from known relations in the past can predict\nunknown future relations in datasets divided by year. 
Despite the simplicity of\nimplementing analogy tasks as vector additions, our approach demonstrated\nperformance comparable to that of large language models such as GPT-4 in\npredicting drug-gene relations.\n","authors":["Hiroaki Yamagiwa","Ryoma Hashimoto","Kiwamu Arakane","Ken Murakami","Shou Soeda","Momose Oyama","Mariko Okada","Hidetoshi Shimodaira"],"pdf_url":"https://arxiv.org/pdf/2406.00984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03059v1","updated":"2024-09-04T20:18:59Z","published":"2024-09-04T20:18:59Z","title":"Quantification of stylistic differences in human- and ASR-produced\n transcripts of African American English","summary":" Common measures of accuracy used to assess the performance of automatic\nspeech recognition (ASR) systems, as well as human transcribers, conflate\nmultiple sources of error. Stylistic differences, such as verbatim vs\nnon-verbatim, can play a significant role in ASR performance evaluation when\ndifferences exist between training and test datasets. The problem is compounded\nfor speech from underrepresented varieties, where the speech to orthography\nmapping is not as standardized. We categorize the kinds of stylistic\ndifferences between 6 transcription versions, 4 human- and 2 ASR-produced, of\n10 hours of African American English (AAE) speech. Focusing on verbatim\nfeatures and AAE morphosyntactic features, we investigate the interactions of\nthese categories with how well transcripts can be compared via word error rate\n(WER). The results, and overall analysis, help clarify how ASR outputs are a\nfunction of the decisions made by the training data's human transcribers.\n","authors":["Annika Heuser","Tyler Kendall","Miguel del Rio","Quinten McNamara","Nishchal Bhandari","Corey Miller","Migüel Jetté"],"pdf_url":"https://arxiv.org/pdf/2409.03059v1.pdf","comment":"Published in Interspeech 2024 Proceedings, 5 pages excluding\n references, 5 figures"},{"id":"http://arxiv.org/abs/2406.09676v2","updated":"2024-09-04T20:04:05Z","published":"2024-06-14T02:58:19Z","title":"Optimizing Byte-level Representation for End-to-end ASR","summary":" We propose a novel approach to optimizing a byte-level representation for\nend-to-end automatic speech recognition (ASR). Byte-level representation is\noften used by large scale multilingual ASR systems when the character set of\nthe supported languages is large. The compactness and universality of\nbyte-level representation allow the ASR models to use smaller output\nvocabularies and therefore, provide more flexibility. UTF-8 is a commonly used\nbyte-level representation for multilingual ASR, but it is not designed to\noptimize machine learning tasks directly. By using auto-encoder and vector\nquantization, we show that we can optimize a byte-level representation for ASR\nand achieve better accuracy. Our proposed framework can incorporate information\nfrom different modalities, and provides an error correction mechanism. 
In an\nEnglish/Mandarin dictation task, we show that a bilingual ASR model built with\nthis approach can outperform UTF-8 representation by 5% relative in error rate.\n","authors":["Roger Hsiao","Liuhui Deng","Erik McDermott","Ruchir Travadi","Xiaodan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2406.09676v2.pdf","comment":"5 pages, 1 figure, IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2409.03046v1","updated":"2024-09-04T19:31:20Z","published":"2024-09-04T19:31:20Z","title":"Oddballness: universal anomaly detection with language models","summary":" We present a new method to detect anomalies in texts (in general: in\nsequences of any data), using language models, in a totally unsupervised\nmanner. The method considers probabilities (likelihoods) generated by a\nlanguage model, but instead of focusing on low-likelihood tokens, it considers\na new metric introduced in this paper: oddballness. Oddballness measures how\n``strange'' a given token is according to the language model. We demonstrate in\ngrammatical error detection tasks (a specific case of text anomaly detection)\nthat oddballness is better than just considering low-likelihood events, if a\ntotally unsupervised setup is assumed.\n","authors":["Filip Graliński","Ryszard Staruch","Krzysztof Jurkiewicz"],"pdf_url":"https://arxiv.org/pdf/2409.03046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01586v2","updated":"2024-09-04T19:30:59Z","published":"2024-09-03T03:59:22Z","title":"Booster: Tackling Harmful Fine-tuning for Large Language Models via\n Attenuating Harmful Perturbation","summary":" Harmful fine-tuning issue \\citep{qi2023fine} poses serious safety concerns\nfor Large language models' fine-tuning-as-a-service. While existing defenses\n\\citep{huang2024vaccine,rosati2024representation} have been proposed to\nmitigate the issue, their performances are still far away from satisfactory,\nand the root cause of the problem has not been fully recovered. For the first\ntime in the literature, we in this paper show that \\textit{harmful\nperturbation} over the model weights should be the root cause of\nalignment-broken of harmful fine-tuning. In order to attenuate the negative\nimpact of harmful perturbation, we propose an alignment-stage solution, dubbed\nBooster. Technically, along with the original alignment loss, we append a loss\nregularizer in the alignment stage's optimization. The regularizer ensures that\nthe model's harmful loss reduction before/after simulated harmful perturbation\nis attenuated, thereby mitigating the subsequent fine-tuning risk. Empirical\nresults show that Booster can effectively reduce the harmful score of the\nfine-tuned models while maintaining the performance of downstream tasks. Our\ncode is available at \\url{https://github.com/git-disl/Booster}.\n","authors":["Tiansheng Huang","Sihao Hu","Fatih Ilhan","Selim Furkan Tekin","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2409.01586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01695v3","updated":"2024-09-04T19:13:06Z","published":"2024-01-24T03:11:36Z","title":"Language-Guided World Models: A Model-Based Approach to AI Control","summary":" This paper introduces the concept of Language-Guided World Models (LWMs) --\nprobabilistic models that can simulate environments by reading texts. Agents\nequipped with these models provide humans with more extensive and efficient\ncontrol, allowing them to simultaneously alter agent behaviors in multiple\ntasks via natural verbal communication. 
In this work, we take initial steps in\ndeveloping robust LWMs that can generalize to compositionally novel language\ndescriptions. We design a challenging world modeling benchmark based on the\ngame of MESSENGER (Hanjie et al., 2021), featuring evaluation settings that\nrequire varying degrees of compositional generalization. Our experiments reveal\nthe lack of generalizability of the state-of-the-art Transformer model, as it\noffers marginal improvements in simulation quality over a no-text baseline. We\ndevise a more robust model by fusing the Transformer with the EMMA attention\nmechanism (Hanjie et al., 2021). Our model substantially outperforms the\nTransformer and approaches the performance of a model with an oracle semantic\nparsing and grounding capability. To demonstrate the practicality of this model\nin improving AI safety and transparency, we simulate a scenario in which the\nmodel enables an agent to present plans to a human before execution, and to\nrevise plans based on their language feedback.\n","authors":["Alex Zhang","Khanh Nguyen","Jens Tuyls","Albert Lin","Karthik Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2402.01695v3.pdf","comment":"SpLU-RoboNLP workshop at ACL 2024"},{"id":"http://arxiv.org/abs/2409.03021v1","updated":"2024-09-04T18:27:12Z","published":"2024-09-04T18:27:12Z","title":"CLUE: Concept-Level Uncertainty Estimation for Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable proficiency in\nvarious natural language generation (NLG) tasks. Previous studies suggest that\nLLMs' generation process involves uncertainty. However, existing approaches to\nuncertainty estimation mainly focus on sequence-level uncertainty, overlooking\nindividual pieces of information within sequences. These methods fall short in\nseparately assessing the uncertainty of each component in a sequence. In\nresponse, we propose a novel framework for Concept-Level Uncertainty Estimation\n(CLUE) for LLMs. We leverage LLMs to convert output sequences into\nconcept-level representations, breaking down sequences into individual concepts\nand measuring the uncertainty of each concept separately. We conduct\nexperiments to demonstrate that CLUE can provide more interpretable uncertainty\nestimation results compared with sentence-level uncertainty, and could be a\nuseful tool for various tasks such as hallucination detection and story\ngeneration.\n","authors":["Yu-Hsiang Wang","Andrew Bai","Che-Ping Tsai","Cho-Jui Hsieh"],"pdf_url":"https://arxiv.org/pdf/2409.03021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02976v1","updated":"2024-09-04T13:59:38Z","published":"2024-09-04T13:59:38Z","title":"Hallucination Detection in LLMs: Fast and Memory-Efficient Finetuned\n Models","summary":" Uncertainty estimation is a necessary component when implementing AI in\nhigh-risk settings, such as autonomous cars, medicine, or insurances. Large\nLanguage Models (LLMs) have seen a surge in popularity in recent years, but\nthey are subject to hallucinations, which may cause serious harm in high-risk\nsettings. Despite their success, LLMs are expensive to train and run: they need\na large amount of computations and memory, preventing the use of ensembling\nmethods in practice. In this work, we present a novel method that allows for\nfast and memory-friendly training of LLM ensembles. 
We show that the resulting\nensembles can detect hallucinations and are a viable approach in practice as\nonly one GPU is needed for training and inference.\n","authors":["Gabriel Y. Arteaga","Thomas B. Schön","Nicolas Pielawski"],"pdf_url":"https://arxiv.org/pdf/2409.02976v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.03797v1","updated":"2024-09-04T17:53:24Z","published":"2024-09-04T17:53:24Z","title":"NESTFUL: A Benchmark for Evaluating LLMs on Nested Sequences of API\n Calls","summary":" Autonomous agent applications powered by large language models (LLMs) have\nrecently risen to prominence as effective tools for addressing complex\nreal-world tasks. At their core, agentic workflows rely on LLMs to plan and\nexecute the use of tools and external Application Programming Interfaces (APIs)\nin sequence to arrive at the answer to a user's request. Various benchmarks and\nleaderboards have emerged to evaluate an LLM's capabilities for tool and API\nuse; however, most of these evaluations only track single or multiple isolated\nAPI calling capabilities. In this paper, we present NESTFUL, a benchmark to\nevaluate LLMs on nested sequences of API calls, i.e., sequences where the\noutput of one API call is passed as input to a subsequent call. NESTFUL has a\ntotal of 300 human annotated samples divided into two types - executable and\nnon-executable. The executable samples are curated manually by crawling\nRapid-APIs whereas the non-executable samples are hand picked by human\nannotators from data synthetically generated using an LLM. We evaluate\nstate-of-the-art LLMs with function calling abilities on NESTFUL. Our results\nshow that most models do not perform well on nested APIs in NESTFUL as compared\nto their performance on the simpler problem settings available in existing\nbenchmarks.\n","authors":["Kinjal Basu","Ibrahim Abdelaziz","Kelsey Bradford","Maxwell Crouse","Kiran Kate","Sadhana Kumaravel","Saurabh Goyal","Asim Munawar","Yara Rizk","Xin Wang","Luis Lastras","Pavan Kapanipathi"],"pdf_url":"https://arxiv.org/pdf/2409.03797v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.02919v1","updated":"2024-09-04T17:58:08Z","published":"2024-09-04T17:58:08Z","title":"HiPrompt: Tuning-free Higher-Resolution Generation with Hierarchical\n MLLM Prompts","summary":" The potential for higher-resolution image generation using pretrained\ndiffusion models is immense, yet these models often struggle with issues of\nobject repetition and structural artifacts especially when scaling to 4K\nresolution and higher. We figure out that the problem is caused by that, a\nsingle prompt for the generation of multiple scales provides insufficient\nefficacy. In response, we propose HiPrompt, a new tuning-free solution that\ntackles the above problems by introducing hierarchical prompts. The\nhierarchical prompts offer both global and local guidance. Specifically, the\nglobal guidance comes from the user input that describes the overall content,\nwhile the local guidance utilizes patch-wise descriptions from MLLMs to\nelaborately guide the regional structure and texture generation. Furthermore,\nduring the inverse denoising process, the generated noise is decomposed into\nlow- and high-frequency spatial components. These components are conditioned on\nmultiple prompt levels, including detailed patch-wise descriptions and broader\nimage-level prompts, facilitating prompt-guided denoising under hierarchical\nsemantic guidance. 
It further allows the generation to focus more on local\nspatial regions and ensures the generated images maintain coherent local and\nglobal semantics, structures, and textures with high definition. Extensive\nexperiments demonstrate that HiPrompt outperforms state-of-the-art works in\nhigher-resolution image generation, significantly reducing object repetition\nand enhancing structural quality.\n","authors":["Xinyu Liu","Yingqing He","Lanqing Guo","Xiang Li","Bu Jin","Peng Li","Yan Li","Chi-Min Chan","Qifeng Chen","Wei Xue","Wenhan Luo","Qingfeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2409.02919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02917v1","updated":"2024-09-04T17:53:42Z","published":"2024-09-04T17:53:42Z","title":"UC-NeRF: Uncertainty-aware Conditional Neural Radiance Fields from\n Endoscopic Sparse Views","summary":" Visualizing surgical scenes is crucial for revealing internal anatomical\nstructures during minimally invasive procedures. Novel View Synthesis is a\nvital technique that offers geometry and appearance reconstruction, enhancing\nunderstanding, planning, and decision-making in surgical scenes. Despite the\nimpressive achievements of Neural Radiance Field (NeRF), its direct application\nto surgical scenes produces unsatisfying results due to two challenges:\nendoscopic sparse views and significant photometric inconsistencies. In this\npaper, we propose uncertainty-aware conditional NeRF for novel view synthesis\nto tackle the severe shape-radiance ambiguity from sparse surgical views. The\ncore of UC-NeRF is to incorporate the multi-view uncertainty estimation to\ncondition the neural radiance field for modeling the severe photometric\ninconsistencies adaptively. Specifically, our UC-NeRF first builds a\nconsistency learner in the form of multi-view stereo network, to establish the\ngeometric correspondence from sparse views and generate uncertainty estimation\nand feature priors. In neural rendering, we design a base-adaptive NeRF network\nto exploit the uncertainty estimation for explicitly handling the photometric\ninconsistencies. Furthermore, an uncertainty-guided geometry distillation is\nemployed to enhance geometry learning. Experiments on the SCARED and Hamlyn\ndatasets demonstrate our superior performance in rendering appearance and\ngeometry, consistently outperforming the current state-of-the-art approaches.\nOur code will be released at \\url{https://github.com/wrld/UC-NeRF}.\n","authors":["Jiaxin Guo","Jiangliu Wang","Ruofeng Wei","Di Kang","Qi Dou","Yun-hui Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02914v1","updated":"2024-09-04T17:52:43Z","published":"2024-09-04T17:52:43Z","title":"Can LVLMs Obtain a Driver's License? A Benchmark Towards Reliable AGI\n for Autonomous Driving","summary":" Large Vision-Language Models (LVLMs) have recently garnered significant\nattention, with many efforts aimed at harnessing their general knowledge to\nenhance the interpretability and robustness of autonomous driving models.\nHowever, LVLMs typically rely on large, general-purpose datasets and lack the\nspecialized expertise required for professional and safe driving. 
Existing\nvision-language driving datasets focus primarily on scene understanding and\ndecision-making, without providing explicit guidance on traffic rules and\ndriving skills, which are critical aspects directly related to driving safety.\nTo bridge this gap, we propose IDKB, a large-scale dataset containing over one\nmillion data items collected from various countries, including driving\nhandbooks, theory test data, and simulated road test data. Much like the\nprocess of obtaining a driver's license, IDKB encompasses nearly all the\nexplicit knowledge needed for driving from theory to practice. In particular,\nwe conducted comprehensive tests on 15 LVLMs using IDKB to assess their\nreliability in the context of autonomous driving and provided extensive\nanalysis. We also fine-tuned popular models, achieving notable performance\nimprovements, which further validate the significance of our dataset. The\nproject page can be found at:\n\\url{https://4dvlab.github.io/project_page/idkb.html}\n","authors":["Yuhang Lu","Yichen Yao","Jiadong Tu","Jiangnan Shao","Yuexin Ma","Xinge Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.02914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02910v1","updated":"2024-09-04T17:49:54Z","published":"2024-09-04T17:49:54Z","title":"SITAR: Semi-supervised Image Transformer for Action Recognition","summary":" Recognizing actions from a limited set of labeled videos remains a challenge\nas annotating visual data is not only tedious but also can be expensive due to\nclassified nature. Moreover, handling spatio-temporal data using deep $3$D\ntransformers for this can introduce significant computational complexity. In\nthis paper, our objective is to address video action recognition in a\nsemi-supervised setting by leveraging only a handful of labeled videos along\nwith a collection of unlabeled videos in a compute efficient manner.\nSpecifically, we rearrange multiple frames from the input videos in row-column\nform to construct super images. Subsequently, we capitalize on the vast pool of\nunlabeled samples and employ contrastive learning on the encoded super images.\nOur proposed approach employs two pathways to generate representations for\ntemporally augmented super images originating from the same video.\nSpecifically, we utilize a 2D image-transformer to generate representations and\napply a contrastive loss function to minimize the similarity between\nrepresentations from different videos while maximizing the representations of\nidentical videos. Our method demonstrates superior performance compared to\nexisting state-of-the-art approaches for semi-supervised action recognition\nacross various benchmark datasets, all while significantly reducing\ncomputational costs.\n","authors":["Owais Iqbal","Omprakash Chakraborty","Aftab Hussain","Rameswar Panda","Abir Das"],"pdf_url":"https://arxiv.org/pdf/2409.02910v1.pdf","comment":"Accepted at ICPR 2024"},{"id":"http://arxiv.org/abs/2408.07832v3","updated":"2024-09-04T17:31:00Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. 
While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \\textbf{five} image classification datasets. The code is\navailable (https://github.com/batmanlab/Ladder).\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. Poynton","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13113v2","updated":"2024-09-04T17:29:04Z","published":"2024-03-19T19:36:48Z","title":"Quantifying uncertainty in lung cancer segmentation with foundation\n models applied to mixed-domain datasets","summary":" Medical image foundation models have shown the ability to segment organs and\ntumors with minimal fine-tuning. These models are typically evaluated on\ntask-specific in-distribution (ID) datasets. However, reliable performance on\nID dataset does not guarantee robust generalization on out-of-distribution\n(OOD) datasets. Importantly, once deployed for clinical use, it is impractical\nto have `ground truth' delineations to assess ongoing performance drifts,\nespecially when images fall into OOD category due to different imaging\nprotocols. Hence, we introduced a comprehensive set of computationally fast\nmetrics to evaluate the performance of multiple foundation models (Swin UNETR,\nSimMIM, iBOT, SMIT) trained with self-supervised learning (SSL). SSL\npretraining was selected as this approach is applicable for large, diverse, and\nunlabeled image sets. All models were fine-tuned on identical datasets for lung\ntumor segmentation from computed tomography (CT) scans. SimMIM, iBOT, and SMIT\nused identical architecture, pretraining, and fine-tuning datasets to assess\nperformance variations with the choice of pretext tasks used in SSL. Evaluation\nwas performed on two public lung cancer datasets (LRAD: n = 140, 5Rater: n =\n21) with different image acquisitions and tumor stage compared to training data\n(n = 317 public resource with stage III-IV lung cancers) and a public\nnon-cancer dataset containing volumetric CT scans of patients with pulmonary\nembolism (n = 120). All models produced similarly accurate tumor segmentation\non the lung cancer testing datasets. SMIT produced a highest F1-score (LRAD:\n0.60, 5Rater: 0.64) and lowest entropy (LRAD: 0.06, 5Rater: 0.12), indicating\nhigher tumor detection rate and confident segmentations. 
In the OOD dataset,\nSMIT misdetected the fewest tumors, indicated by a median volume occupancy\nof 5.67 cc compared to 9.97 cc for the second-best method, SimMIM.\n","authors":["Aneesh Rangnekar","Nishant Nadkarni","Jue Jiang","Harini Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2403.13113v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02889v1","updated":"2024-09-04T17:25:21Z","published":"2024-09-04T17:25:21Z","title":"LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via\n Hybrid Architecture","summary":" Expanding the long-context capabilities of Multi-modal Large Language\nModels~(MLLMs) is crucial for video understanding, high-resolution image\nunderstanding, and multi-modal agents. This involves a series of systematic\noptimizations, including model architecture, data construction and training\nstrategy, particularly addressing challenges such as \textit{degraded\nperformance with more images} and \textit{high computational costs}. In this\npaper, we adapt the model architecture to a hybrid of Mamba and Transformer\nblocks, approach data construction with both temporal and spatial dependencies\namong multiple images and employ a progressive training strategy. The released\nmodel \textbf{LongLLaVA}~(\textbf{Long}-Context \textbf{L}arge\n\textbf{L}anguage \textbf{a}nd \textbf{V}ision \textbf{A}ssistant) is the first\nhybrid MLLM, achieving a better balance between efficiency and\neffectiveness. LongLLaVA not only achieves competitive results across various\nbenchmarks, but also maintains high throughput and low memory consumption.\nNotably, it can process nearly a thousand images on a single A100 80GB\nGPU, showing promising application prospects for a wide range of tasks.\n","authors":["Xidong Wang","Dingjie Song","Shunian Chen","Chen Zhang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02889v1.pdf","comment":"19 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2409.02885v1","updated":"2024-09-04T17:15:44Z","published":"2024-09-04T17:15:44Z","title":"CanvOI, an Oncology Intelligence Foundation Model: Scaling FLOPS\n Differently","summary":" The rapidly evolving field of digital oncopathology faces significant\nchallenges, including the need to address diverse and complex clinical\nquestions, often involving rare conditions, with limited availability of\nlabeled data. These limitations hinder the development of robust AI-driven\ntools in the biomedical space, where accuracy in probabilistic determinations\nis of utmost importance. To address this, digital pathology foundation models\nhave begun to emerge, typically developed with the size and diversity of the\npre-training dataset and model parameters in mind. Here, we present CanvOI, a\nViT-g/10-based foundation model designed to enhance the capabilities of digital\npathology by addressing these challenges through a different approach.\nConsidering the unique nature of oncologic histopathological images and the\nrequirements from the embeddings to provide meaningful representations for\nMultiple Instance Learning (MIL) downstream models, we chose to modify the\ninput image characteristics. By introducing larger tile sizes (380 x 380\npixels) and smaller patch sizes (10 x 10 pixels), we were able to optimize the\nmodel's performance, pushing computational resources in a new direction and\nachieving state-of-the-art performance on cancer-related benchmarks. 
CanvOI\ndemonstrated a 1.5-7.4% improvement in averaged AUC compared to other leading\nfoundation models built for digital pathology. Moreover, our results\ndemonstrate that CanvOI significantly outperformed the other models, with the\nperformance gap widening substantially when trained on just 10% of the initial\ncohort. This work highlights an alternative approach that, if integrated with\ntraditional development approaches, has the potential to advance Oncology\nIntelligence (OI), overcome some of the current barriers and ultimately improve\nthe clinical outcome of cancer patients.\n","authors":["Jonathan Zalach","Inbal Gazy","Assaf Avinoam","Ron Sinai","Eran Shmuel","Inbar Gilboa","Christine Swisher","Naim Matasci","Reva Basho","David B. Agus"],"pdf_url":"https://arxiv.org/pdf/2409.02885v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.02883v1","updated":"2024-09-04T17:08:04Z","published":"2024-09-04T17:08:04Z","title":"Multi-stream deep learning framework to predict mild cognitive\n impairment with Rey Complex Figure Test","summary":" Drawing tests like the Rey Complex Figure Test (RCFT) are widely used to\nassess cognitive functions such as visuospatial skills and memory, making them\nvaluable tools for detecting mild cognitive impairment (MCI). Despite their\nutility, existing predictive models based on these tests often suffer from\nlimitations like small sample sizes and lack of external validation, which\nundermine their reliability. We developed a multi-stream deep learning\nframework that integrates two distinct processing streams: a multi-head\nself-attention based spatial stream using raw RCFT images and a scoring stream\nemploying a previously developed automated scoring system. Our model was\ntrained on data from 1,740 subjects in the Korean cohort and validated on an\nexternal hospital dataset of 222 subjects from Korea. The proposed multi-stream\nmodel demonstrated superior performance over baseline models (AUC = 0.872,\nAccuracy = 0.781) in external validation. The integration of both spatial and\nscoring streams enables the model to capture intricate visual details from the\nraw images while also incorporating structured scoring data, which together\nenhance its ability to detect subtle cognitive impairments. This dual approach\nnot only improves predictive accuracy but also increases the robustness of the\nmodel, making it more reliable in diverse clinical settings. Our model has\npractical implications for clinical settings, where it could serve as a\ncost-effective tool for early MCI screening.\n","authors":["Junyoung Park","Eun Hyun Seo","Sunjun Kim","SangHak Yi","Kun Ho Lee","Sungho Won"],"pdf_url":"https://arxiv.org/pdf/2409.02883v1.pdf","comment":"20 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.02882v1","updated":"2024-09-04T17:07:46Z","published":"2024-09-04T17:07:46Z","title":"Benchmarking Spurious Bias in Few-Shot Image Classifiers","summary":" Few-shot image classifiers are designed to recognize and classify new data\nwith minimal supervision and limited data but often show reliance on spurious\ncorrelations between classes and spurious attributes, known as spurious bias.\nSpurious correlations commonly hold in certain samples and few-shot classifiers\ncan suffer from spurious bias induced from them. There is an absence of an\nautomatic benchmarking system to assess the robustness of few-shot classifiers\nagainst spurious bias. 
In this paper, we propose a systematic and rigorous\nbenchmark framework, termed FewSTAB, to fairly demonstrate and quantify varied\ndegrees of robustness of few-shot classifiers to spurious bias. FewSTAB creates\nfew-shot evaluation tasks with biased attributes so that using them for\npredictions can demonstrate poor performance. To construct these tasks, we\npropose attribute-based sample selection strategies based on a pre-trained\nvision-language model, eliminating the need for manual dataset curation. This\nallows FewSTAB to automatically benchmark spurious bias using any existing test\ndata. FewSTAB offers evaluation results in a new dimension along with a new\ndesign guideline for building robust classifiers. Moreover, it can benchmark\nspurious bias in varied degrees and enable designs for varied degrees of\nrobustness. Its effectiveness is demonstrated through experiments on ten\nfew-shot learning methods across three datasets. We hope our framework can\ninspire new designs of robust few-shot classifiers. Our code is available at\nhttps://github.com/gtzheng/FewSTAB.\n","authors":["Guangtao Zheng","Wenqian Ye","Aidong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02882v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2409.02867v1","updated":"2024-09-04T16:50:48Z","published":"2024-09-04T16:50:48Z","title":"The Impact of Balancing Real and Synthetic Data on Accuracy and Fairness\n in Face Recognition","summary":" Over the recent years, the advancements in deep face recognition have fueled\nan increasing demand for large and diverse datasets. Nevertheless, the\nauthentic data acquired to create those datasets is typically sourced from the\nweb, which, in many cases, can lead to significant privacy issues due to the\nlack of explicit user consent. Furthermore, obtaining a demographically\nbalanced, large dataset is even more difficult because of the natural imbalance\nin the distribution of images from different demographic groups. In this paper,\nwe investigate the impact of demographically balanced authentic and synthetic\ndata, both individually and in combination, on the accuracy and fairness of\nface recognition models. Initially, several generative methods were used to\nbalance the demographic representations of the corresponding synthetic\ndatasets. Then a state-of-the-art face encoder was trained and evaluated using\n(combinations of) synthetic and authentic images. 
Our findings emphasized two\nmain points: (i) the increased effectiveness of training data generated by\ndiffusion-based models in enhancing accuracy, whether used alone or combined\nwith subsets of authentic data, and (ii) the minimal impact of incorporating\nbalanced data from pre-trained generative methods on fairness (in nearly all\ntested scenarios using combined datasets, fairness scores remained either\nunchanged or worsened, even when compared to unbalanced authentic datasets).\nSource code and data are available at \url{https://cutt.ly/AeQy1K5G} for\nreproducibility.\n","authors":["Andrea Atzori","Pietro Cosseddu","Gianni Fenu","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2409.02867v1.pdf","comment":"Accepted at Synthetic Data for Computer Vision Workshop - Side Event\n at ECCV 2024"},{"id":"http://arxiv.org/abs/2409.02866v1","updated":"2024-09-04T16:47:16Z","published":"2024-09-04T16:47:16Z","title":"Hybrid-Segmentor: A Hybrid Approach to Automated Fine-Grained Crack\n Segmentation in Civil Infrastructure","summary":" Detecting and segmenting cracks in infrastructure, such as roads and\nbuildings, is crucial for safety and cost-effective maintenance. In spite of\nthe potential of deep learning, there are challenges in achieving precise\nresults and handling diverse crack types. With the proposed dataset and model,\nwe aim to enhance crack detection and infrastructure maintenance. We introduce\nHybrid-Segmentor, an encoder-decoder based approach that is capable of\nextracting both fine-grained local and global crack features. This allows the\nmodel to improve its generalization capabilities in distinguishing various types of\nshapes, surfaces, and sizes of cracks. To keep the computational cost\nlow for practical purposes while maintaining the high generalization\ncapabilities of the model, we incorporate a self-attention model at the encoder\nlevel, while reducing the complexity of the decoder component. The proposed\nmodel outperforms existing benchmark models across 5 quantitative metrics\n(accuracy 0.971, precision 0.804, recall 0.744, F1-score 0.770, and IoU score\n0.630), achieving state-of-the-art status.\n","authors":["June Moh Goo","Xenios Milidonis","Alessandro Artusi","Jan Boehm","Carlo Ciliberto"],"pdf_url":"https://arxiv.org/pdf/2409.02866v1.pdf","comment":"25 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.02851v1","updated":"2024-09-04T16:21:33Z","published":"2024-09-04T16:21:33Z","title":"Human-VDM: Learning Single-Image 3D Human Gaussian Splatting from Video\n Diffusion Models","summary":" Generating lifelike 3D humans from a single RGB image remains a challenging\ntask in computer vision, as it requires accurate modeling of geometry,\nhigh-quality texture, and plausible unseen parts. Existing methods typically\nuse multi-view diffusion models for 3D generation, but they often face\ninconsistent view issues, which hinder high-quality 3D human generation. To\naddress this, we propose Human-VDM, a novel method for generating a 3D human from\na single RGB image using Video Diffusion Models. Human-VDM provides temporally\nconsistent views for 3D human generation using Gaussian Splatting. It consists\nof three modules: a view-consistent human video diffusion module, a video\naugmentation module, and a Gaussian Splatting module. 
First, a single image is\nfed into a human video diffusion module to generate a coherent human video.\nNext, the video augmentation module applies super-resolution and video\ninterpolation to enhance the textures and geometric smoothness of the generated\nvideo. Finally, the 3D Human Gaussian Splatting module learns lifelike humans\nunder the guidance of these high-resolution and view-consistent images.\nExperiments demonstrate that Human-VDM achieves high-quality 3D human generation from a\nsingle image, outperforming state-of-the-art methods in both generation quality\nand quantity. Project page: https://human-vdm.github.io/Human-VDM/\n","authors":["Zhibin Liu","Haoye Dong","Aviral Chharia","Hefeng Wu"],"pdf_url":"https://arxiv.org/pdf/2409.02851v1.pdf","comment":"14 Pages, 8 figures, Project page:\n https://human-vdm.github.io/Human-VDM/"},{"id":"http://arxiv.org/abs/2409.02846v1","updated":"2024-09-04T16:17:45Z","published":"2024-09-04T16:17:45Z","title":"MaDis-Stereo: Enhanced Stereo Matching via Distilled Masked Image\n Modeling","summary":" In stereo matching, CNNs have traditionally served as the predominant\narchitectures. Although Transformer-based stereo models have been studied\nrecently, their performance still lags behind CNN-based stereo models due to\nthe inherent data scarcity issue in the stereo matching task. In this paper, we\npropose a Masked Image Modeling Distilled Stereo matching model, termed\nMaDis-Stereo, that enhances locality inductive bias by leveraging Masked Image\nModeling (MIM) in training a Transformer-based stereo model. Given randomly\nmasked stereo images as inputs, our method attempts to conduct both image\nreconstruction and depth prediction tasks. While this strategy is beneficial to\nresolving the data scarcity issue, the dual challenge of reconstructing masked\ntokens and subsequently performing stereo matching poses significant\nchallenges, particularly in terms of training stability. To address this, we\npropose to use an auxiliary network (teacher), updated via Exponential Moving\nAverage (EMA), along with the original stereo model (student), where teacher\npredictions serve as pseudo supervisory signals to effectively distill\nknowledge into the student model. State-of-the-art performance is achieved\nwith the proposed method on several stereo matching benchmarks such as ETH3D and KITTI\n2015. Additionally, to demonstrate that our model effectively leverages\nlocality inductive bias, we provide the attention distance measurement.\n","authors":["Jihye Ahn","Hyesong Choi","Soomin Kim","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.02846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02838v1","updated":"2024-09-04T16:06:23Z","published":"2024-09-04T16:06:23Z","title":"iConFormer: Dynamic Parameter-Efficient Tuning with Input-Conditioned\n Adaptation","summary":" Transfer learning based on full fine-tuning (FFT) of the pre-trained encoder\nand task-specific decoder becomes increasingly complex as deep models grow\nexponentially. Parameter efficient fine-tuning (PEFT) approaches using adapters\nconsisting of small learnable layers have emerged as an alternative to FFT,\nachieving comparable performance while maintaining high training efficiency.\nHowever, the inflexibility of the adapter with respect to input instances\nlimits its capability of learning task-specific information in diverse\ndownstream tasks. 
In this paper, we propose a novel PEFT approach,\ninput-Conditioned transFormer, termed iConFormer, that leverages a dynamic\nadapter conditioned on the input instances. To secure flexible learning ability\non input instances in various downstream tasks, we introduce an\ninput-Conditioned Network (iCoN) in the dynamic adapter that enables\ninstance-level feature transformation. To be specific, iCoN generates\nchannel-wise convolutional kernels for each feature and transforms it using an\nadaptive convolution process to effectively capture task-specific and\nfine-grained details tailored to downstream tasks. Experimental results\ndemonstrate that by tuning just 1.6% to 2.8% of the Transformer backbone\nparameters, iConFormer achieves performance comparable to FFT in monocular\ndepth estimation and semantic segmentation, while outperforming it in image\nclassification and instance segmentation. Also, the proposed method\nconsistently outperforms recent PEFT methods for all the tasks mentioned above.\n","authors":["Hayeon Jo","Hyesong Choi","Minhee Cho","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.02838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08784v2","updated":"2024-09-04T15:52:08Z","published":"2024-08-16T14:56:17Z","title":"Multi-task Learning Approach for Intracranial Hemorrhage Prognosis","summary":" Prognosis after intracranial hemorrhage (ICH) is influenced by a complex\ninterplay between imaging and tabular data. Rapid and reliable prognosis is\ncrucial for effective patient stratification and informed treatment\ndecision-making. In this study, we aim to enhance image-based prognosis by\nlearning a robust feature representation shared between prognosis and the\nclinical and demographic variables most highly correlated with it. Our approach\nmimics clinical decision-making by reinforcing the model to learn valuable\nprognostic data embedded in the image. We propose a 3D multi-task image model\nto predict prognosis, Glasgow Coma Scale and age, improving accuracy and\ninterpretability. Our method outperforms current state-of-the-art baseline\nimage models, and demonstrates superior performance in ICH prognosis compared\nto four board-certified neuroradiologists using only CT scans as input. We\nfurther validate our model with interpretability saliency maps. Code is\navailable at https://github.com/MiriamCobo/MultitaskLearning_ICH_Prognosis.git.\n","authors":["Miriam Cobo","Amaia Pérez del Barrio","Pablo Menéndez Fernández-Miranda","Pablo Sanz Bellón","Lara Lloret Iglesias","Wilson Silva"],"pdf_url":"https://arxiv.org/pdf/2408.08784v2.pdf","comment":"16 pages. Accepted at Machine Learning in Medical Imaging Workshop @\n MICCAI 2024 (MLMI2024). This is the submitted manuscript with added link to\n github repo, funding acknowledgements and authors' names and affiliations. No\n further post submission improvements or corrections were integrated. Final\n version not published yet"},{"id":"http://arxiv.org/abs/2409.02828v1","updated":"2024-09-04T15:50:16Z","published":"2024-09-04T15:50:16Z","title":"ExpLLM: Towards Chain of Thought for Facial Expression Recognition","summary":" Facial expression recognition (FER) is a critical task in multimedia with\nsignificant implications across various domains. However, analyzing the causes\nof facial expressions is essential for accurately recognizing them. 
Current\napproaches, such as those based on facial action units (AUs), typically provide\nAU names and intensities but lack insight into the interactions and\nrelationships between AUs and the overall expression. In this paper, we propose\na novel method called ExpLLM, which leverages large language models to generate\nan accurate chain of thought (CoT) for facial expression recognition.\nSpecifically, we have designed the CoT mechanism from three key perspectives:\nkey observations, overall emotional interpretation, and conclusion. The key\nobservations describe the AU's name, intensity, and associated emotions. The\noverall emotional interpretation provides an analysis based on multiple AUs and\ntheir interactions, identifying the dominant emotions and their relationships.\nFinally, the conclusion presents the final expression label derived from the\npreceding analysis. Furthermore, we also introduce the Exp-CoT Engine, designed\nto construct this expression CoT and generate instruction-description data for\ntraining our ExpLLM. Extensive experiments on the RAF-DB and AffectNet datasets\ndemonstrate that ExpLLM outperforms current state-of-the-art FER methods.\nExpLLM also surpasses the latest GPT-4o in expression CoT generation,\nparticularly in recognizing micro-expressions where GPT-4o frequently fails.\n","authors":["Xing Lan","Jian Xue","Ji Qi","Dongmei Jiang","Ke Lu","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2409.02828v1.pdf","comment":"project page: https://starhiking.github.io/ExpLLM_Page/"},{"id":"http://arxiv.org/abs/2409.02826v1","updated":"2024-09-04T15:45:32Z","published":"2024-09-04T15:45:32Z","title":"Automatic facial axes standardization of 3D fetal ultrasound images","summary":" Craniofacial anomalies indicate early developmental disturbances and are\nusually linked to many genetic syndromes. Early diagnosis is critical, yet\nultrasound (US) examinations often fail to identify these features. This study\npresents an AI-driven tool to assist clinicians in standardizing fetal facial\naxes/planes in 3D US, reducing sonographer workload and facilitating the facial\nevaluation. Our network, structured into three blocks-feature extractor,\nrotation and translation regression, and spatial transformer-processes three\northogonal 2D slices to estimate the necessary transformations for\nstandardizing the facial planes in the 3D US. These transformations are applied\nto the original 3D US using a differentiable module (the spatial transformer\nblock), yielding a standardized 3D US and the corresponding 2D facial standard\nplanes. The dataset used consists of 1180 fetal facial 3D US images acquired\nbetween weeks 20 and 35 of gestation. Results show that our network\nconsiderably reduces inter-observer rotation variability in the test set, with\na mean geodesic angle difference of 14.12$^{\\circ}$ $\\pm$ 18.27$^{\\circ}$ and\nan Euclidean angle error of 7.45$^{\\circ}$ $\\pm$ 14.88$^{\\circ}$. These\nfindings demonstrate the network's ability to effectively standardize facial\naxes, crucial for consistent fetal facial assessments. 
In conclusion, the\nproposed network demonstrates potential for improving the consistency and\naccuracy of fetal facial assessments in clinical settings, facilitating early\nevaluation of craniofacial anomalies.\n","authors":["Antonia Alomar","Ricardo Rubio","Laura Salort","Gerard Albaiges","Antoni Payà","Gemma Piella","Federico Sukno"],"pdf_url":"https://arxiv.org/pdf/2409.02826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02825v1","updated":"2024-09-04T15:43:10Z","published":"2024-09-04T15:43:10Z","title":"Deep Learning Meets Satellite Images -- An Evaluation on Handcrafted and\n Learning-based Features for Multi-date Satellite Stereo Images","summary":" A critical step in the digital surface models(DSM) generation is feature\nmatching. Off-track (or multi-date) satellite stereo images, in particular, can\nchallenge the performance of feature matching due to spectral distortions\nbetween images, long baseline, and wide intersection angles. Feature matching\nmethods have evolved over the years from handcrafted methods (e.g., SIFT) to\nlearning-based methods (e.g., SuperPoint and SuperGlue). In this paper, we\ncompare the performance of different features, also known as feature extraction\nand matching methods, applied to satellite imagery. A wide range of stereo\npairs(~500) covering two separate study sites are used. SIFT, as a widely used\nclassic feature extraction and matching algorithm, is compared with seven\ndeep-learning matching methods: SuperGlue, LightGlue, LoFTR, ASpanFormer, DKM,\nGIM-LightGlue, and GIM-DKM. Results demonstrate that traditional matching\nmethods are still competitive in this age of deep learning, although for\nparticular scenarios learning-based methods are very promising.\n","authors":["Shuang Song","Luca Morelli","Xinyi Wu","Rongjun Qin","Hessah Albanwan","Fabio Remondino"],"pdf_url":"https://arxiv.org/pdf/2409.02825v1.pdf","comment":"ECCV2024 Workshop - TradiCV"},{"id":"http://arxiv.org/abs/2408.10283v2","updated":"2024-09-04T15:36:52Z","published":"2024-08-19T00:31:05Z","title":"SDE-based Multiplicative Noise Removal","summary":" Multiplicative noise, also known as speckle or pepper noise, commonly affects\nimages produced by synthetic aperture radar (SAR), lasers, or optical lenses.\nUnlike additive noise, which typically arises from thermal processes or\nexternal factors, multiplicative noise is inherent to the system, originating\nfrom the fluctuation in diffuse reflections. These fluctuations result in\nmultiple copies of the same signal with varying magnitudes being combined.\nConsequently, despeckling, or removing multiplicative noise, necessitates\ndifferent techniques compared to those used for additive noise removal.\n In this paper, we propose a novel approach using Stochastic Differential\nEquations based diffusion models to address multiplicative noise. We\ndemonstrate that multiplicative noise can be effectively modeled as a Geometric\nBrownian Motion process in the logarithmic domain. Utilizing the Fokker-Planck\nequation, we derive the corresponding reverse process for image denoising. To\nvalidate our method, we conduct extensive experiments on two different\ndatasets, comparing our approach to both classical signal processing techniques\nand contemporary CNN-based noise removal models. 
Our results indicate that the\nproposed method significantly outperforms existing methods on perception-based\nmetrics such as FID and LPIPS, while maintaining competitive performance on\ntraditional metrics like PSNR and SSIM.\n","authors":["An Vuong","Thinh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.10283v2.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2408.14279v2","updated":"2024-09-04T15:33:15Z","published":"2024-08-26T13:55:42Z","title":"Learning Local Pattern Modularization for Point Cloud Reconstruction\n from Unseen Classes","summary":" It is challenging to reconstruct 3D point clouds in unseen classes from\nsingle 2D images. Instead of object-centered coordinate system, current methods\ngeneralized global priors learned in seen classes to reconstruct 3D shapes from\nunseen classes in viewer-centered coordinate system. However, the\nreconstruction accuracy and interpretability are still eager to get improved.\nTo resolve this issue, we introduce to learn local pattern modularization for\nreconstructing 3D shapes in unseen classes, which achieves both good\ngeneralization ability and high reconstruction accuracy. Our insight is to\nlearn a local prior which is class-agnostic and easy to generalize in\nobject-centered coordinate system. Specifically, the local prior is learned via\na process of learning and customizing local pattern modularization in seen\nclasses. During this process, we first learn a set of patterns in local\nregions, which is the basis in the object-centered coordinate system to\nrepresent an arbitrary region on shapes across different classes. Then, we\nmodularize each region on an initially reconstructed shape using the learned\nlocal patterns. Based on that, we customize the local pattern modularization\nusing the input image by refining the reconstruction with more details. Our\nmethod enables to reconstruct high fidelity point clouds from unseen classes in\nobject-centered coordinate system without requiring a large number of patterns\nor any additional information, such as segmentation supervision or camera\nposes. Our experimental results under widely used benchmarks show that our\nmethod achieves the state-of-the-art reconstruction accuracy for shapes from\nunseen classes. The code is available at https://github.com/chenchao15/Unseen.\n","authors":["Chao Chen","Yu-Shen Liu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2408.14279v2.pdf","comment":"14pages, 11figures, accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2409.02813v1","updated":"2024-09-04T15:31:26Z","published":"2024-09-04T15:31:26Z","title":"MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding\n Benchmark","summary":" This paper introduces MMMU-Pro, a robust version of the Massive\nMulti-discipline Multimodal Understanding and Reasoning (MMMU) benchmark.\nMMMU-Pro rigorously assesses multimodal models' true understanding and\nreasoning capabilities through a three-step process based on MMMU: (1)\nfiltering out questions answerable by text-only models, (2) augmenting\ncandidate options, and (3) introducing a vision-only input setting where\nquestions are embedded within images. This setting challenges AI to truly \"see\"\nand \"read\" simultaneously, testing a fundamental human cognitive skill of\nseamlessly integrating visual and textual information. Results show that model\nperformance is substantially lower on MMMU-Pro than on MMMU, ranging from 16.8%\nto 26.9% across models. 
We explore the impact of OCR prompts and Chain of\nThought (CoT) reasoning, finding that OCR prompts have minimal effect while CoT\ngenerally improves performance. MMMU-Pro provides a more rigorous evaluation\ntool, closely mimicking real-world scenarios and offering valuable directions\nfor future research in multimodal AI.\n","authors":["Xiang Yue","Tianyu Zheng","Yuansheng Ni","Yubo Wang","Kai Zhang","Shengbang Tong","Yuxuan Sun","Ming Yin","Botao Yu","Ge Zhang","Huan Sun","Yu Su","Wenhu Chen","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2409.02813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01021v2","updated":"2024-09-04T15:25:27Z","published":"2024-09-02T08:01:32Z","title":"CONDA: Condensed Deep Association Learning for Co-Salient Object\n Detection","summary":" Inter-image association modeling is crucial for co-salient object detection.\nDespite satisfactory performance, previous methods still have limitations on\nsufficient inter-image association modeling. Because most of them focus on\nimage feature optimization under the guidance of heuristically calculated raw\ninter-image associations. They directly rely on raw associations which are not\nreliable in complex scenarios, and their image feature optimization approach is\nnot explicit for inter-image association modeling. To alleviate these\nlimitations, this paper proposes a deep association learning strategy that\ndeploys deep networks on raw associations to explicitly transform them into\ndeep association features. Specifically, we first create hyperassociations to\ncollect dense pixel-pair-wise raw associations and then deploys deep\naggregation networks on them. We design a progressive association generation\nmodule for this purpose with additional enhancement of the hyperassociation\ncalculation. More importantly, we propose a correspondence-induced association\ncondensation module that introduces a pretext task, i.e. semantic\ncorrespondence estimation, to condense the hyperassociations for computational\nburden reduction and noise elimination. We also design an object-aware cycle\nconsistency loss for high-quality correspondence estimations. Experimental\nresults in three benchmark datasets demonstrate the remarkable effectiveness of\nour proposed method with various training settings.\n","authors":["Long Li","Nian Liu","Dingwen Zhang","Zhongyu Li","Salman Khan","Rao Anwer","Hisham Cholakkal","Junwei Han","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2409.01021v2.pdf","comment":"There is an error. In Sec 4.1, the number of images in some dataset\n is incorrect and needs to be revised"},{"id":"http://arxiv.org/abs/2308.13495v3","updated":"2024-09-04T15:12:03Z","published":"2023-08-25T17:10:22Z","title":"Open Gaze: Open Source eye tracker for smartphone devices using Deep\n Learning","summary":" Eye tracking has been a pivotal tool in diverse fields such as vision\nresearch, language analysis, and usability assessment. The majority of prior\ninvestigations, however, have concentrated on expansive desktop displays\nemploying specialized, costly eye tracking hardware that lacks scalability.\nRemarkably little insight exists into ocular movement patterns on smartphones,\ndespite their widespread adoption and significant usage. In this manuscript, we\npresent an open-source implementation of a smartphone-based gaze tracker that\nemulates the methodology proposed by a GooglePaper (whose source code remains\nproprietary). 
Our focus is on attaining accuracy comparable to that attained\nthrough the GooglePaper's methodology, without the necessity for supplementary\nhardware. Through the integration of machine learning techniques, we unveil an\naccurate eye tracking solution that is native to smartphones. Our approach\ndemonstrates precision akin to the state-of-the-art mobile eye trackers, which\nare characterized by a cost that is two orders of magnitude higher. Leveraging\nthe vast MIT GazeCapture dataset, which is available through registration on\nthe dataset's website, we successfully replicate crucial findings from previous\nstudies concerning ocular motion behavior in oculomotor tasks and saliency\nanalyses during natural image observation. Furthermore, we emphasize the\napplicability of smartphone-based gaze tracking in discerning reading\ncomprehension challenges. Our findings exhibit the inherent potential to\namplify eye movement research by significant proportions, accommodating\nparticipation from thousands of subjects with explicit consent. This\nscalability not only fosters advancements in vision research, but also extends\nits benefits to domains such as accessibility enhancement and healthcare\napplications.\n","authors":["Sushmanth reddy","Jyothi Swaroop Reddy"],"pdf_url":"https://arxiv.org/pdf/2308.13495v3.pdf","comment":"This paper results are incorrectly reported. The paper is not\n authentic and conclusions are not correct"},{"id":"http://arxiv.org/abs/2311.12912v3","updated":"2024-09-04T15:09:32Z","published":"2023-11-21T17:27:20Z","title":"Q-Seg: Quantum Annealing-Based Unsupervised Image Segmentation","summary":" We present Q-Seg, a novel unsupervised image segmentation method based on\nquantum annealing, tailored for existing quantum hardware. We formulate the\npixel-wise segmentation problem, which assimilates spectral and spatial\ninformation of the image, as a graph-cut optimization task. Our method\nefficiently leverages the interconnected qubit topology of the D-Wave Advantage\ndevice, offering superior scalability over existing quantum approaches and\noutperforming several tested state-of-the-art classical methods. Empirical\nevaluations on synthetic datasets have shown that Q-Seg has better runtime\nperformance than the state-of-the-art classical optimizer Gurobi. The method\nhas also been tested on earth observation image segmentation, a critical area\nwith noisy and unreliable annotations. In the era of noisy intermediate-scale\nquantum, Q-Seg emerges as a reliable contender for real-world applications in\ncomparison to advanced techniques like Segment Anything. Consequently, Q-Seg\noffers a promising solution using available quantum hardware, especially in\nsituations constrained by limited labeled data and the need for efficient\ncomputational runtime.\n","authors":["Supreeth Mysore Venkatesh","Antonio Macaluso","Marlon Nuske","Matthias Klusch","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2311.12912v3.pdf","comment":"12 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.02792v1","updated":"2024-09-04T15:06:44Z","published":"2024-09-04T15:06:44Z","title":"UnLearning from Experience to Avoid Spurious Correlations","summary":" While deep neural networks can achieve state-of-the-art performance in many\ntasks, these models are more fragile than they appear. They are prone to\nlearning spurious correlations in their training data, leading to surprising\nfailure cases. 
In this paper, we propose a new approach that addresses the\nissue of spurious correlations: UnLearning from Experience (ULE). Our method is\nbased on using two classification models trained in parallel: student and\nteacher models. Both models receive the same batches of training data. The\nstudent model is trained with no constraints and pursues the spurious\ncorrelations in the data. The teacher model is trained to solve the same\nclassification problem while avoiding the mistakes of the student model. As\ntraining is done in parallel, the better the student model learns the spurious\ncorrelations, the more robust the teacher model becomes. The teacher model uses\nthe gradient of the student's output with respect to its input to unlearn\nmistakes made by the student. We show that our method is effective on the\nWaterbirds, CelebA, Spawrious and UrbanCars datasets.\n","authors":["Jeff Mitchell","Jesús Martínez del Rincón","Niall McLaughlin"],"pdf_url":"https://arxiv.org/pdf/2409.02792v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2401.11421v2","updated":"2024-09-04T15:01:15Z","published":"2024-01-21T07:57:04Z","title":"Enhancing the vision-language foundation model with key semantic\n knowledge-emphasized report refinement","summary":" Recently, vision-language representation learning has made remarkable\nadvancements in building up medical foundation models, holding immense\npotential for transforming the landscape of clinical research and medical care.\nThe underlying hypothesis is that the rich knowledge embedded in radiology\nreports can effectively assist and guide the learning process, reducing the\nneed for additional labels. However, these reports tend to be complex and\nsometimes even consist of redundant descriptions that make the representation\nlearning too challenging to capture the key semantic information. This paper\ndevelops a novel iterative vision-language representation learning framework by\nproposing a key semantic knowledge-emphasized report refinement method.\nParticularly, raw radiology reports are refined to highlight the key\ninformation according to a constructed clinical dictionary and two\nmodel-optimized knowledge-enhancement metrics. The iterative framework is\ndesigned to progressively learn, starting from gaining a general understanding\nof the patient's condition based on raw reports and gradually refines and\nextracts critical information essential to the fine-grained analysis tasks. The\neffectiveness of the proposed framework is validated on various downstream\nmedical image analysis tasks, including disease classification,\nregion-of-interest segmentation, and phrase grounding. 
Our framework surpasses\nseven state-of-the-art methods in both fine-tuning and zero-shot settings,\ndemonstrating its encouraging potential for different clinical applications.\n","authors":["Weijian Huang","Cheng Li","Hao Yang","Jiarun Liu","Yong Liang","Hairong Zheng","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08381v2","updated":"2024-09-04T14:52:59Z","published":"2024-08-15T18:54:31Z","title":"Pre-processing and Compression: Understanding Hidden Representation\n Refinement Across Imaging Domains via Intrinsic Dimension","summary":" In recent years, there has been interest in how geometric properties such as\nintrinsic dimension (ID) of a neural network's hidden representations change\nthrough its layers, and how such properties are predictive of important model\nbehavior such as generalization ability. However, evidence has begun to emerge\nthat such behavior can change significantly depending on the domain of the\nnetwork's training data, such as natural versus medical images. Here, we\nfurther this inquiry by exploring how the ID of a network's learned\nrepresentations changes through its layers, in essence, characterizing how the\nnetwork successively refines the information content of input data to be used\nfor predictions. Analyzing eleven natural and medical image datasets across six\nnetwork architectures, we find that how ID changes through the network differs\nnoticeably between natural and medical image models. Specifically, medical\nimage models peak in representation ID earlier in the network, implying a\ndifference in the image features and their abstractness that are typically used\nfor downstream tasks in these domains. Additionally, we discover a strong\ncorrelation of this peak representation ID with the ID of the data in its input\nspace, implying that the intrinsic information content of a model's learned\nrepresentations is guided by that of the data it was trained on. Overall, our\nfindings emphasize notable discrepancies in network behavior between natural\nand non-natural imaging domains regarding hidden representation information\ncontent, and provide further insights into how a network's learned features are\nshaped by its training data.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2408.08381v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02770v1","updated":"2024-09-04T14:49:35Z","published":"2024-09-04T14:49:35Z","title":"Validation of musculoskeletal segmentation model with uncertainty\n estimation for bone and muscle assessment in hip-to-knee clinical CT images","summary":" Deep learning-based image segmentation has allowed for the fully automated,\naccurate, and rapid analysis of musculoskeletal (MSK) structures from medical\nimages. However, current approaches were either applied only to 2D\ncross-sectional images, addressed few structures, or were validated on small\ndatasets, which limit the application in large-scale databases. This study\naimed to validate an improved deep learning model for volumetric MSK\nsegmentation of the hip and thigh with uncertainty estimation from clinical\ncomputed tomography (CT) images. Databases of CT images from multiple\nmanufacturers/scanners, disease status, and patient positioning were used. The\nsegmentation accuracy, and accuracy in estimating the structures volume and\ndensity, i.e., mean HU, were evaluated. 
An approach for segmentation failure\ndetection based on predictive uncertainty was also investigated. The model has\nshown an overall improvement with respect to all segmentation accuracy and\nstructure volume/density evaluation metrics. The predictive uncertainty yielded\nlarge areas under the receiver operating characteristic (AUROC) curves\n(AUROCs>=.95) in detecting inaccurate and failed segmentations. The high\nsegmentation and muscle volume/density estimation accuracy, along with the high\naccuracy in failure detection based on the predictive uncertainty, exhibited\nthe model's reliability for analyzing individual MSK structures in large-scale\nCT databases.\n","authors":["Mazen Soufi","Yoshito Otake","Makoto Iwasa","Keisuke Uemura","Tomoki Hakotani","Masahiro Hashimoto","Yoshitake Yamada","Minoru Yamada","Yoichi Yokoyama","Masahiro Jinzaki","Suzushi Kusano","Masaki Takao","Seiji Okada","Nobuhiko Sugano","Yoshinobu Sato"],"pdf_url":"https://arxiv.org/pdf/2409.02770v1.pdf","comment":"29 pages, 7+10supp figures, 8 tables"},{"id":"http://arxiv.org/abs/2408.11571v2","updated":"2024-09-04T14:38:03Z","published":"2024-08-21T12:27:36Z","title":"CHOTA: A Higher Order Accuracy Metric for Cell Tracking","summary":" The evaluation of cell tracking results steers the development of tracking\nmethods, significantly impacting biomedical research. This is quantitatively\nachieved by means of evaluation metrics. Unfortunately, current metrics favor\nlocal correctness and weakly reward global coherence, impeding high-level\nbiological analysis. To also foster global coherence, we propose the CHOTA\nmetric (Cell-specific Higher Order Tracking Accuracy) which unifies the\nevaluation of all relevant aspects of cell tracking: cell detections and local\nassociations, global coherence, and lineage tracking. We achieve this by\nintroducing a new definition of the term 'trajectory' that includes the entire\ncell lineage and by including this into the well-established HOTA metric from\ngeneral multiple object tracking. Furthermore, we provide a detailed survey of\ncontemporary cell tracking metrics to compare our novel CHOTA metric and to\nshow its advantages. All metrics are extensively evaluated on state-of-the-art\nreal-data cell tracking results and synthetic results that simulate specific\ntracking errors. We show that CHOTA is sensitive to all tracking errors and\ngives a good indication of the biologically relevant capability of a method to\nreconstruct the full lineage of cells. It introduces a robust and comprehensive\nalternative to the currently used metrics in cell tracking. Python code is\navailable at https://github.com/CellTrackingChallenge/py-ctcmetrics .\n","authors":["Timo Kaiser","Vladimir Ulman","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2408.11571v2.pdf","comment":"Accepted at BIC Workshop at European Conference on Computer Vision\n 2024, 14 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.02699v1","updated":"2024-09-04T13:35:15Z","published":"2024-09-04T13:35:15Z","title":"CLDA: Collaborative Learning for Enhanced Unsupervised Domain Adaptation","summary":" Unsupervised Domain Adaptation (UDA) endeavors to bridge the gap between a\nmodel trained on a labeled source domain and its deployment in an unlabeled\ntarget domain. However, current high-performance models demand significant\nresources, resulting in prohibitive deployment costs and highlighting the need\nfor small yet effective models. 
For UDA of lightweight models, Knowledge\nDistillation (KD) in a Teacher-Student framework can be a common approach, but\nwe find that domain shift in UDA leads to a significant increase in non-salient\nparameters in the teacher model, degrading model's generalization ability and\ntransferring misleading information to the student model. Interestingly, we\nobserved that this phenomenon occurs considerably less in the student model.\nDriven by this insight, we introduce Collaborative Learning, a method that\nupdates the teacher's non-salient parameters using the student model and at the\nsame time enhance the student's performance using the updated teacher model.\nExperiments across various tasks and datasets show consistent performance\nimprovements for both student and teacher models. For example, in semantic\nsegmentation, CLDA achieves an improvement of +0.7% mIoU for teacher and +1.4%\nmIoU for student compared to the baseline model in the GTA to Cityscapes. In\nthe Synthia to Cityscapes, it achieves an improvement of +0.8% mIoU for teacher\nand +2.0% mIoU for student.\n","authors":["Minhee Cho","Hyesong Choi","Hayeon Jo","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.02699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02683v1","updated":"2024-09-04T13:15:10Z","published":"2024-09-04T13:15:10Z","title":"Rethinking HTG Evaluation: Bridging Generation and Recognition","summary":" The evaluation of generative models for natural image tasks has been\nextensively studied. Similar protocols and metrics are used in cases with\nunique particularities, such as Handwriting Generation, even if they might not\nbe completely appropriate. In this work, we introduce three measures tailored\nfor HTG evaluation, $ \\text{HTG}_{\\text{HTR}} $, $ \\text{HTG}_{\\text{style}} $,\nand $ \\text{HTG}_{\\text{OOV}} $, and argue that they are more expedient to\nevaluate the quality of generated handwritten images. The metrics rely on the\nrecognition error/accuracy of Handwriting Text Recognition and Writer\nIdentification models and emphasize writing style, textual content, and\ndiversity as the main aspects that adhere to the content of handwritten images.\nWe conduct comprehensive experiments on the IAM handwriting database,\nshowcasing that widely used metrics such as FID fail to properly quantify the\ndiversity and the practical utility of generated handwriting samples. Our\nfindings show that our metrics are richer in information and underscore the\nnecessity of standardized evaluation protocols in HTG. The proposed metrics\nprovide a more robust and informative protocol for assessing HTG quality,\ncontributing to improved performance in HTR. Code for the evaluation protocol\nis available at: https://github.com/koninik/HTG_evaluation.\n","authors":["Konstantina Nikolaidou","George Retsinas","Giorgos Sfikas","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2409.02683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02676v1","updated":"2024-09-04T13:06:40Z","published":"2024-09-04T13:06:40Z","title":"Improved Single Camera BEV Perception Using Multi-Camera Training","summary":" Bird's Eye View (BEV) map prediction is essential for downstream autonomous\ndriving tasks like trajectory prediction. In the past, this was accomplished\nthrough the use of a sophisticated sensor configuration that captured a\nsurround view from multiple cameras. However, in large-scale production, cost\nefficiency is an optimization goal, so that using fewer cameras becomes more\nrelevant. 
However, using fewer input images comes with a\nperformance drop. This raises the problem of developing a BEV perception model\nthat provides sufficient performance on a low-cost sensor setup. Although\nprimarily relevant for inference time on production cars, this cost restriction\nis less problematic on a test vehicle during training. Therefore, the objective\nof our approach is to reduce the aforementioned performance drop as much as\npossible using a modern multi-camera surround view model reduced for\nsingle-camera inference. The approach includes three features: a modern masking\ntechnique, a cyclic Learning Rate (LR) schedule, and a feature reconstruction\nloss for supervising the transition from six-camera inputs to one-camera input\nduring training. Our method outperforms versions trained strictly with one\ncamera or strictly with six-camera surround view for single-camera inference,\nresulting in reduced hallucination and a higher-quality BEV map.\n","authors":["Daniel Busch","Ido Freeman","Richard Meyes","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.02676v1.pdf","comment":"This Paper has been accepted to the 27th IEEE International\n Conference on Intelligent Transportation Systems (ITSC 2024)"},{"id":"http://arxiv.org/abs/2409.02675v1","updated":"2024-09-04T13:05:00Z","published":"2024-09-04T13:05:00Z","title":"Multi-Head Attention Residual Unfolded Network for Model-Based\n Pansharpening","summary":" The objective of pansharpening and hypersharpening is to accurately combine a\nhigh-resolution panchromatic (PAN) image with a low-resolution multispectral\n(MS) or hyperspectral (HS) image, respectively. Unfolding fusion methods\nintegrate the powerful representation capabilities of deep learning with the\nrobustness of model-based approaches. These techniques involve unrolling the\nsteps of the optimization scheme derived from the minimization of an energy\ninto a deep learning framework, resulting in efficient and highly interpretable\narchitectures. In this paper, we propose a model-based deep unfolded method for\nsatellite image fusion. Our approach is based on a variational formulation that\nincorporates the classic observation model for MS/HS data, a high-frequency\ninjection constraint based on the PAN image, and an arbitrary convex prior. For\nthe unfolding stage, we introduce upsampling and downsampling layers that use\ngeometric information encoded in the PAN image through residual networks. The\nbackbone of our method is a multi-head attention residual network (MARNet),\nwhich replaces the proximity operator in the optimization scheme and combines\nmultiple head attentions with residual learning to exploit image\nself-similarities via nonlocal operators defined in terms of patches.\nAdditionally, we incorporate a post-processing module based on the MARNet\narchitecture to further enhance the quality of the fused images. Experimental\nresults on PRISMA, Quickbird, and WorldView2 datasets demonstrate the superior\nperformance of our method and its ability to generalize across different sensor\nconfigurations and varying spatial and spectral resolutions. 
The source code\nwill be available at https://github.com/TAMI-UIB/MARNet.\n","authors":["Ivan Pereira-Sánchez","Eloi Sans","Julia Navarro","Joan Duran"],"pdf_url":"https://arxiv.org/pdf/2409.02675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00917v2","updated":"2024-09-04T13:04:03Z","published":"2024-09-02T03:15:19Z","title":"Large Scale Unsupervised Brain MRI Image Registration Solution for\n Learn2Reg 2024","summary":" In this paper, we summarize the methods and experimental results we proposed\nfor Task 2 in the learn2reg 2024 Challenge. This task focuses on unsupervised\nregistration of anatomical structures in brain MRI images between different\npatients. The difficulty lies in: (1) without segmentation labels, and (2) a\nlarge amount of data. To address these challenges, we built an efficient\nbackbone network and explored several schemes to further enhance registration\naccuracy. Under the guidance of the NCC loss function and smoothness\nregularization loss function, we obtained a smooth and reasonable deformation\nfield. According to the leaderboard, our method achieved a Dice coefficient of\n77.34%, which is 1.4% higher than the TransMorph. Overall, we won second place\non the leaderboard for Task 2.\n","authors":["Yuxi Zhang","Xiang Chen","Jiazheng Wang","Min Liu","Yaonan Wang","Dongdong Liu","Renjiu Hu","Hang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.00917v2.pdf","comment":"MICCAI Learn2Reg 2024 Challenge & WBIR 2024 Workshop on Biomedical\n Imaging Registration"},{"id":"http://arxiv.org/abs/2405.11614v2","updated":"2024-09-04T13:02:15Z","published":"2024-05-19T17:09:43Z","title":"Nickel and Diming Your GAN: A Dual-Method Approach to Enhancing GAN\n Efficiency via Knowledge Distillation","summary":" In this paper, we address the challenge of compressing generative adversarial\nnetworks (GANs) for deployment in resource-constrained environments by\nproposing two novel methodologies: Distribution Matching for Efficient\ncompression (DiME) and Network Interactive Compression via Knowledge Exchange\nand Learning (NICKEL). DiME employs foundation models as embedding kernels for\nefficient distribution matching, leveraging maximum mean discrepancy to\nfacilitate effective knowledge distillation. Simultaneously, NICKEL employs an\ninteractive compression method that enhances the communication between the\nstudent generator and discriminator, achieving a balanced and stable\ncompression process. Our comprehensive evaluation on the StyleGAN2 architecture\nwith the FFHQ dataset shows the effectiveness of our approach, with NICKEL &\nDiME achieving FID scores of 10.45 and 15.93 at compression rates of 95.73% and\n98.92%, respectively. Remarkably, our methods sustain generative quality even\nat an extreme compression rate of 99.69%, surpassing the previous\nstate-of-the-art performance by a large margin. These findings not only\ndemonstrate our methodologies' capacity to significantly lower GANs'\ncomputational demands but also pave the way for deploying high-quality GAN\nmodels in settings with limited resources. Our code will be released soon.\n","authors":["Sangyeop Yeo","Yoojin Jang","Jaejun Yoo"],"pdf_url":"https://arxiv.org/pdf/2405.11614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01821v2","updated":"2024-09-04T12:58:11Z","published":"2024-09-03T12:03:45Z","title":"When Does Visual Prompting Outperform Linear Probing for Vision-Language\n Models? 
A Likelihood Perspective","summary":" Adapting pre-trained models to new tasks can exhibit varying effectiveness\nacross datasets. Visual prompting, a state-of-the-art parameter-efficient\ntransfer learning method, can significantly improve the performance of\nout-of-distribution tasks. On the other hand, linear probing, a standard\ntransfer learning method, can sometimes become the best approach. We propose a\nlog-likelihood ratio (LLR) approach to analyze the comparative benefits of\nvisual prompting and linear probing. By employing the LLR score alongside\nresource-efficient visual prompts approximations, our cost-effective measure\nattains up to a 100-fold reduction in run time compared to full training, while\nachieving prediction accuracies up to 91%. The source code is available at\nhttps://github.com/IBM/VP-LLR.\n","authors":["Hsi-Ai Tsao","Lei Hsiung","Pin-Yu Chen","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2409.01821v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02664v1","updated":"2024-09-04T12:46:30Z","published":"2024-09-04T12:46:30Z","title":"Standing on the Shoulders of Giants: Reprogramming Visual-Language Model\n for General Deepfake Detection","summary":" The proliferation of deepfake faces poses huge potential negative impacts on\nour daily lives. Despite substantial advancements in deepfake detection over\nthese years, the generalizability of existing methods against forgeries from\nunseen datasets or created by emerging generative models remains constrained.\nIn this paper, inspired by the zero-shot advantages of Vision-Language Models\n(VLMs), we propose a novel approach that repurposes a well-trained VLM for\ngeneral deepfake detection. Motivated by the model reprogramming paradigm that\nmanipulates the model prediction via data perturbations, our method can\nreprogram a pretrained VLM model (e.g., CLIP) solely based on manipulating its\ninput without tuning the inner parameters. Furthermore, we insert a pseudo-word\nguided by facial identity into the text prompt. Extensive experiments on\nseveral popular benchmarks demonstrate that (1) the cross-dataset and\ncross-manipulation performances of deepfake detection can be significantly and\nconsistently improved (e.g., over 88% AUC in cross-dataset setting from FF++ to\nWildDeepfake) using a pre-trained CLIP model with our proposed reprogramming\nmethod; (2) our superior performances are at less cost of trainable parameters,\nmaking it a promising approach for real-world applications.\n","authors":["Kaiqing Lin","Yuzhen Lin","Weixiang Li","Taiping Yao","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2409.02664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02657v1","updated":"2024-09-04T12:30:25Z","published":"2024-09-04T12:30:25Z","title":"PoseTalk: Text-and-Audio-based Pose Control and Motion Refinement for\n One-Shot Talking Head Generation","summary":" While previous audio-driven talking head generation (THG) methods generate\nhead poses from driving audio, the generated poses or lips cannot match the\naudio well or are not editable. In this study, we propose \\textbf{PoseTalk}, a\nTHG system that can freely generate lip-synchronized talking head videos with\nfree head poses conditioned on text prompts and audio. 
The core insight of our\nmethod is using head pose to connect visual, linguistic, and audio signals.\nFirst, we propose to generate poses from both audio and text prompts, where the\naudio offers short-term variations and rhythm correspondence of the head\nmovements and the text prompts describe the long-term semantics of head\nmotions. To achieve this goal, we devise a Pose Latent Diffusion (PLD) model to\ngenerate motion latent from text prompts and audio cues in a pose latent space.\nSecond, we observe a loss-imbalance problem: the loss for the lip region\ncontributes less than 4\\% of the total reconstruction loss caused by both pose\nand lip, making optimization lean towards head movements rather than lip\nshapes. To address this issue, we propose a refinement-based learning strategy\nto synthesize natural talking videos using two cascaded networks, i.e.,\nCoarseNet, and RefineNet. The CoarseNet estimates coarse motions to produce\nanimated images in novel poses and the RefineNet focuses on learning finer lip\nmotions by progressively estimating lip motions from low-to-high resolutions,\nyielding improved lip-synchronization performance. Experiments demonstrate our\npose prediction strategy achieves better pose diversity and realness compared\nto text-only or audio-only, and our video generator model outperforms\nstate-of-the-art methods in synthesizing talking videos with natural head\nmotions. Project: https://junleen.github.io/projects/posetalk.\n","authors":["Jun Ling","Yiwen Wang","Han Xue","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2409.02657v1.pdf","comment":"7+5 pages, 15 figures"},{"id":"http://arxiv.org/abs/2409.02653v1","updated":"2024-09-04T12:28:44Z","published":"2024-09-04T12:28:44Z","title":"Skip-and-Play: Depth-Driven Pose-Preserved Image Generation for Any\n Objects","summary":" The emergence of diffusion models has enabled the generation of diverse\nhigh-quality images solely from text, prompting subsequent efforts to enhance\nthe controllability of these models. Despite the improvement in\ncontrollability, pose control remains limited to specific objects (e.g.,\nhumans) or poses (e.g., frontal view) due to the fact that pose is generally\ncontrolled via camera parameters (e.g., rotation angle) or keypoints (e.g.,\neyes, nose). Specifically, camera parameters-conditional pose control models\ngenerate unrealistic images depending on the object, owing to the small size of\n3D datasets for training. Also, keypoint-based approaches encounter challenges\nin acquiring reliable keypoints for various objects (e.g., church) or poses\n(e.g., back view). To address these limitations, we propose depth-based pose\ncontrol, as depth maps are easily obtainable from a single depth estimation\nmodel regardless of objects and poses, unlike camera parameters and keypoints.\nHowever, depth-based pose control confronts issues of shape dependency, as\ndepth maps influence not only the pose but also the shape of the generated\nimages. To tackle this issue, we propose Skip-and-Play (SnP), designed via\nanalysis of the impact of three components of depth-conditional ControlNet on\nthe pose and the shape of the generated images. To be specific, based on the\nanalysis, we selectively skip parts of the components to mitigate shape\ndependency on the depth map while preserving the pose. Through various\nexperiments, we demonstrate the superiority of SnP over baselines and showcase\nthe ability of SnP to generate images of diverse objects and poses. 
Remarkably,\nSnP exhibits the ability to generate images even when the objects in the\ncondition (e.g., a horse) and the prompt (e.g., a hedgehog) differ from each\nother.\n","authors":["Kyungmin Jo","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2409.02653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02648v1","updated":"2024-09-04T12:26:19Z","published":"2024-09-04T12:26:19Z","title":"Creating a Microstructure Latent Space with Rich Material Information\n for Multiphase Alloy Design","summary":" The intricate microstructure serves as the cornerstone for the\ncomposition/processing-structure-property (CPSP) connection in multiphase\nalloys. Traditional alloy design methods often overlook microstructural\ndetails, which diminishes the reliability and effectiveness of the outcomes.\nThis study introduces an improved alloy design algorithm that integrates\nauthentic microstructural information to establish precise CPSP relationships.\nThe approach utilizes a deep-learning framework based on a variational\nautoencoder to map real microstructural data to a latent space, enabling the\nprediction of composition, processing steps, and material properties from the\nlatent space vector. By integrating this deep learning model with a specific\nsampling strategy in the latent space, a novel, microstructure-centered\nalgorithm for multiphase alloy design is developed. This algorithm is\ndemonstrated through the design of a unified dual-phase steel, and the results\nare assessed at three performance levels. Moreover, an exploration into the\nlatent vector space of the model highlights its seamless interpolation ability\nand its rich material information content. Notably, the current configuration\nof the latent space is particularly advantageous for alloy design, offering an\nexhaustive representation of microstructure, composition, processing, and\nproperty variations essential for multiphase alloys.\n","authors":["Xudong Ma","Yuqi Zhang","Chenchong Wang","Ming Wang","Mingxin Huang","Wei Xu"],"pdf_url":"https://arxiv.org/pdf/2409.02648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02647v1","updated":"2024-09-04T12:23:47Z","published":"2024-09-04T12:23:47Z","title":"Learning-Based Error Detection System for Advanced Vehicle Instrument\n Cluster Rendering","summary":" The automotive industry is currently expanding digital display options with\nevery new model that comes onto the market. This entails not just an expansion\nin dimensions, resolution, and customization choices, but also the capability\nto employ novel display effects like overlays while assembling the content of\nthe display cluster. Unfortunately, this raises the need for appropriate\nmonitoring systems that can detect rendering errors and apply appropriate\ncountermeasures when required. Classical solutions such as Cyclic Redundancy\nChecks (CRC) will soon no longer be viable as any sort of alpha blending,\nwarping, or scaling of content can cause unwanted CRC violations. Therefore, we\npropose a novel monitoring approach to verify the correctness of displayed content\nusing telltales (e.g. warning signs) as an example. It uses a learning-based\napproach to separate \"good\" telltales, i.e. those that a human driver will\nunderstand correctly, and \"corrupted\" telltales, i.e. those that will not be\nvisible or perceived correctly. As a result, it possesses inherent resilience\nagainst individual pixel errors and implicitly supports changing backgrounds,\noverlay or scaling effects. 
This is underlined by our experimental study where\nall \"corrupted\" test patterns were correctly classified, while no false alarms\nwere triggered.\n","authors":["Cornelius Bürkle","Fabian Oboril","Kay-Ulrich Scholl"],"pdf_url":"https://arxiv.org/pdf/2409.02647v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2409.02638v1","updated":"2024-09-04T12:06:33Z","published":"2024-09-04T12:06:33Z","title":"MADiff: Motion-Aware Mamba Diffusion Models for Hand Trajectory\n Prediction on Egocentric Videos","summary":" Understanding human intentions and actions through egocentric videos is\nimportant on the path to embodied artificial intelligence. As a branch of\negocentric vision techniques, hand trajectory prediction plays a vital role in\ncomprehending human motion patterns, benefiting downstream tasks in extended\nreality and robot manipulation. However, capturing high-level human intentions\nconsistent with reasonable temporal causality is challenging when only\negocentric videos are available. This difficulty is exacerbated under camera\negomotion interference and the absence of affordance labels to explicitly guide\nthe optimization of hand waypoint distribution. In this work, we propose a\nnovel hand trajectory prediction method dubbed MADiff, which forecasts future\nhand waypoints with diffusion models. The devised denoising operation in the\nlatent space is achieved by our proposed motion-aware Mamba, where the camera\nwearer's egomotion is integrated to achieve motion-driven selective scan\n(MDSS). To discern the relationship between hands and scenarios without\nexplicit affordance supervision, we leverage a foundation model that fuses\nvisual and language features to capture high-level semantics from video clips.\nComprehensive experiments conducted on five public datasets with the existing\nand our proposed new evaluation metrics demonstrate that MADiff predicts\ncomparably reasonable hand trajectories compared to the state-of-the-art\nbaselines, and achieves real-time performance. We will release our code and\npretrained models of MADiff at the project page:\nhttps://irmvlab.github.io/madiff.github.io.\n","authors":["Junyi Ma","Xieyuanli Chen","Wentao Bao","Jingyi Xu","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00180v4","updated":"2024-09-04T11:56:13Z","published":"2023-03-01T02:14:20Z","title":"MMA-MRNNet: Harnessing Multiple Models of Affect and Dynamic Masked RNN\n for Precise Facial Expression Intensity Estimation","summary":" This paper presents MMA-MRNNet, a novel deep learning architecture for\ndynamic multi-output Facial Expression Intensity Estimation (FEIE) from video\ndata. Traditional approaches to this task often rely on complex 3-D CNNs, which\nrequire extensive pre-training and assume that facial expressions are uniformly\ndistributed across all frames of a video. These methods struggle to handle\nvideos of varying lengths, often resorting to ad-hoc strategies that either\ndiscard valuable information or introduce bias. MMA-MRNNet addresses these\nchallenges through a two-stage process. First, the Multiple Models of Affect\n(MMA) extractor component is a Multi-Task Learning CNN that concurrently\nestimates valence-arousal, recognizes basic facial expressions, and detects\naction units in each frame. 
These representations are then processed by a\nMasked RNN component, which captures temporal dependencies and dynamically\nupdates weights according to the true length of the input video, ensuring that\nonly the most relevant features are used for the final prediction. The proposed\nunimodal non-ensemble learning MMA-MRNNet was evaluated on the Hume-Reaction\ndataset and demonstrated significantly superior performance, surpassing\nstate-of-the-art methods by a wide margin, regardless of whether they were\nunimodal, multimodal, or ensemble approaches. Finally, we demonstrated the\neffectiveness of the MMA component of our proposed method across multiple\nin-the-wild datasets, where it consistently outperformed all state-of-the-art\nmethods across various metrics.\n","authors":["Dimitrios Kollias","Andreas Psaroudakis","Anastasios Arsenos","Paraskevi Theofilou","Chunchang Shao","Guanyu Hu","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2303.00180v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02634v1","updated":"2024-09-04T11:55:14Z","published":"2024-09-04T11:55:14Z","title":"Loopy: Taming Audio-Driven Portrait Avatar with Long-Term Motion\n Dependency","summary":" With the introduction of diffusion-based video generation techniques,\naudio-conditioned human video generation has recently achieved significant\nbreakthroughs in both the naturalness of motion and the synthesis of portrait\ndetails. Due to the limited control of audio signals in driving human motion,\nexisting methods often add auxiliary spatial signals to stabilize movements,\nwhich may compromise the naturalness and freedom of motion. In this paper, we\npropose an end-to-end audio-only conditioned video diffusion model named Loopy.\nSpecifically, we designed an inter- and intra-clip temporal module and an\naudio-to-latents module, enabling the model to leverage long-term motion\ninformation from the data to learn natural motion patterns and improving\naudio-portrait movement correlation. This method removes the need for manually\nspecified spatial motion templates used in existing methods to constrain motion\nduring inference. Extensive experiments show that Loopy outperforms recent\naudio-driven portrait diffusion models, delivering more lifelike and\nhigh-quality results across various scenarios.\n","authors":["Jianwen Jiang","Chao Liang","Jiaqi Yang","Gaojie Lin","Tianyun Zhong","Yanbo Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.02634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02629v1","updated":"2024-09-04T11:47:00Z","published":"2024-09-04T11:47:00Z","title":"AdvSecureNet: A Python Toolkit for Adversarial Machine Learning","summary":" Machine learning models are vulnerable to adversarial attacks. Several tools\nhave been developed to research these vulnerabilities, but they often lack\ncomprehensive features and flexibility. We introduce AdvSecureNet, a PyTorch-based\ntoolkit for adversarial machine learning that is the first to natively\nsupport multi-GPU setups for attacks, defenses, and evaluation. It is the first\ntoolkit that supports both CLI and API interfaces and external YAML\nconfiguration files to enhance versatility and reproducibility. The toolkit\nincludes multiple attacks, defenses and evaluation metrics. Rigorous software\nengineering practices are followed to ensure high code quality and\nmaintainability. 
The project is available as an open-source project on GitHub\nat https://github.com/melihcatal/advsecurenet and installable via PyPI.\n","authors":["Melih Catal","Manuel Günther"],"pdf_url":"https://arxiv.org/pdf/2409.02629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16582v2","updated":"2024-09-04T11:14:18Z","published":"2024-03-25T09:49:42Z","title":"In the Search for Optimal Multi-view Learning Models for Crop\n Classification with Global Remote Sensing Data","summary":" Studying and analyzing cropland is a difficult task due to its dynamic and\nheterogeneous growth behavior. Usually, diverse data sources can be collected\nfor its estimation. Although deep learning models have proven to excel in the\ncrop classification task, they face substantial challenges when dealing with\nmultiple inputs, named Multi-View Learning (MVL). The methods used in the MVL\nscenario can be structured based on the encoder architecture, the fusion\nstrategy, and the optimization technique. The literature has primarily focused\non using specific encoder architectures for local regions, lacking a deeper\nexploration of other components in the MVL methodology. In contrast, we\ninvestigate the simultaneous selection of the fusion strategy and encoder\narchitecture, assessing global-scale cropland and crop-type classifications. We\nuse a range of five fusion strategies (Input, Feature, Decision, Ensemble,\nHybrid) and five temporal encoders (LSTM, GRU, TempCNN, TAE, L-TAE) as possible\nconfigurations in the MVL method. We use the CropHarvest dataset for\nvalidation, which provides optical, radar, weather time series, and topographic\ninformation as input data. We found that in scenarios with a limited number of\nlabeled samples, a unique configuration is insufficient for all the cases.\nInstead, a specialized combination should be meticulously sought, including an\nencoder and fusion strategy. To streamline this search process, we suggest\nidentifying the optimal encoder architecture tailored for a particular fusion\nstrategy, and then determining the most suitable fusion strategy for the\nclassification task. We provide a methodological framework for researchers\nexploring crop classification through an MVL methodology.\n","authors":["Francisco Mena","Diego Arenas","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.16582v2.pdf","comment":"submitted to journal"},{"id":"http://arxiv.org/abs/2407.15512v2","updated":"2024-09-04T11:01:47Z","published":"2024-07-22T09:58:29Z","title":"Increasing the Robustness of Model Predictions to Missing Sensors in\n Earth Observation","summary":" Multi-sensor ML models for EO aim to enhance prediction accuracy by\nintegrating data from various sources. However, the presence of missing data\nposes a significant challenge, particularly in non-persistent sensors that can\nbe affected by external factors. Existing literature has explored strategies\nlike temporal dropout and sensor-invariant models to address the generalization\nto missing data issues. Inspired by these works, we study two novel methods\ntailored for multi-sensor scenarios, namely Input Sensor Dropout (ISensD) and\nEnsemble Sensor Invariant (ESensI). Through experimentation on three\nmulti-sensor temporal EO datasets, we demonstrate that these methods\neffectively increase the robustness of model predictions to missing sensors.\nParticularly, we focus on how the predictive performance of models drops when\nsensors are missing at different levels. 
We observe that ensemble multi-sensor\nmodels are the most robust to the lack of sensors. In addition, the sensor\ndropout component in ISensD shows promising robustness results.\n","authors":["Francisco Mena","Diego Arenas","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2407.15512v2.pdf","comment":"Accepted at the MACLEAN workshop in the ECML/PKDD 2024"},{"id":"http://arxiv.org/abs/2401.15113v3","updated":"2024-09-04T10:59:10Z","published":"2024-01-25T20:41:17Z","title":"Scalable Glacier Mapping using Deep Learning and Open Earth Observation\n Data Matches the Accuracy of Manual Delineation","summary":" Accurate global glacier mapping is critical for understanding climate change\nimpacts. Despite its importance, automated glacier mapping at a global scale\nremains largely unexplored. Here we address this gap and propose\nGlacier-VisionTransformer-U-Net (GlaViTU), a convolutional-transformer deep\nlearning model, and five strategies for multitemporal global-scale glacier\nmapping using open satellite imagery. Assessing the spatial, temporal and\ncross-sensor generalisation shows that our best strategy achieves intersection\nover union >0.85 on previously unobserved images in most cases, which drops to\n>0.75 for debris-rich areas such as High-Mountain Asia and increases to >0.90\nfor regions dominated by clean ice. A comparative validation against human\nexpert uncertainties in terms of area and distance deviations underscores\nGlaViTU's performance, approaching or matching expert-level delineation. Adding\nsynthetic aperture radar data, namely, backscatter and interferometric\ncoherence, increases the accuracy in all regions where available. The\ncalibrated confidence for glacier extents is reported, making the predictions\nmore reliable and interpretable. We also release a benchmark dataset that\ncovers 9% of glaciers worldwide. Our results support efforts towards automated\nmultitemporal and global glacier mapping.\n","authors":["Konstantin A. Maslov","Claudio Persello","Thomas Schellenberger","Alfred Stein"],"pdf_url":"https://arxiv.org/pdf/2401.15113v3.pdf","comment":"after major revision, expanded validation"},{"id":"http://arxiv.org/abs/2409.02611v1","updated":"2024-09-04T10:56:05Z","published":"2024-09-04T10:56:05Z","title":"GoT-CQA: Graph-of-Thought Guided Compositional Reasoning for Chart\n Question Answering","summary":" Chart Question Answering (CQA) aims at answering questions based on the\nvisual chart content, which plays an important role in chart summarization,\nbusiness data analysis, and data report generation. CQA is a challenging\nmulti-modal task because of the strong context dependence and complex reasoning\nrequirement. The former refers to answering this question strictly based on the\nanalysis of the visual content or internal data of the given chart, while the\nlatter emphasizes the various logical and numerical reasoning involved in the\nanswer prediction process. In this paper, we pay more attention to the complex\nreasoning in the CQA task, and propose a novel Graph-of-Thought (GoT) guided\ncompositional reasoning model called GoT-CQA to overcome this problem. At\nfirst, we transform the chart-oriented question into a directed acyclic GoT\ncomposed of multiple operator nodes, including localization, numerical, and\nlogical operators. It intuitively reflects the human brain's solution process to\nthis question. 
After that, we design an efficient auto-compositional reasoning\nframework guided by the GoT to execute the multi-step reasoning operations in\nvarious types of questions. Comprehensive experiments on the ChartQA and PlotQA-D\ndatasets show that GoT-CQA achieves outstanding performance, especially in\ncomplex human-written and reasoning questions, compared with the latest\npopular baselines.\n","authors":["Lingling Zhang","Muye Huang","QianYing Wang","Yaxian Wang","Wenjun Wu","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02608v1","updated":"2024-09-04T10:45:33Z","published":"2024-09-04T10:45:33Z","title":"A Medical Multimodal Large Language Model for Pediatric Pneumonia","summary":" Pediatric pneumonia is the leading cause of death among children under five\nyears worldwide, imposing a substantial burden on affected families. Currently,\nthere are three significant hurdles in diagnosing and treating pediatric\npneumonia. Firstly, pediatric pneumonia shares similar symptoms with other\nrespiratory diseases, making rapid and accurate differential diagnosis\nchallenging. Secondly, primary hospitals often lack sufficient medical\nresources and experienced doctors. Lastly, providing personalized diagnostic\nreports and treatment recommendations is labor-intensive and time-consuming. To\ntackle these challenges, we proposed a Medical Multimodal Large Language Model\nfor Pediatric Pneumonia (P2Med-MLLM). It was capable of handling diverse\nclinical tasks, such as generating free-text radiology reports and medical\nrecords within a unified framework. Specifically, P2Med-MLLM can process both\npure text and image-text data, trained on an extensive and large-scale dataset\n(P2Med-MD), including real clinical information from 163,999 outpatient and\n8,684 inpatient cases. This dataset comprised 2D chest X-ray images, 3D chest\nCT images, corresponding radiology reports, and outpatient and inpatient\nrecords. We designed a three-stage training strategy to enable P2Med-MLLM to\ncomprehend medical knowledge and follow instructions for various clinical\ntasks. To rigorously evaluate P2Med-MLLM's performance, we developed\nP2Med-MBench, a benchmark consisting of 642 meticulously verified samples by\npediatric pulmonology specialists, covering six clinical decision-support tasks\nand a balanced variety of diseases. The automated scoring results demonstrated\nthe superiority of P2Med-MLLM. This work plays a crucial role in assisting\nprimary care doctors with prompt disease diagnosis and treatment planning,\nreducing severe symptom mortality rates, and optimizing the allocation of\nmedical resources.\n","authors":["Weiwei Tian","Xinyu Huang","Tianhao Cheng","Wen He","Jinwu Fang","Rui Feng","Daoying Geng","Xiaobo Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02608v1.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2408.16766v2","updated":"2024-09-04T10:42:41Z","published":"2024-08-29T17:59:30Z","title":"CSGO: Content-Style Composition in Text-to-Image Generation","summary":" The diffusion model has shown exceptional capabilities in controlled image\ngeneration, which has further fueled interest in image style transfer. Existing\nworks mainly focus on training-free methods (e.g., image inversion) due\nto the scarcity of specific data. In this study, we present a data construction\npipeline for content-style-stylized image triplets that generates and\nautomatically cleanses stylized data triplets. 
Based on this pipeline, we\nconstruct a dataset IMAGStyle, the first large-scale style transfer dataset\ncontaining 210k image triplets, available for the community to explore and\nresearch. Equipped with IMAGStyle, we propose CSGO, a style transfer model\nbased on end-to-end training, which explicitly decouples content and style\nfeatures employing independent feature injection. The unified CSGO implements\nimage-driven style transfer, text-driven stylized synthesis, and text\nediting-driven stylized synthesis. Extensive experiments demonstrate the\neffectiveness of our approach in enhancing style control capabilities in image\ngeneration. Additional visualization and access to the source code can be\nlocated on the project page: \\url{https://csgo-gen.github.io/}.\n","authors":["Peng Xing","Haofan Wang","Yanpeng Sun","Qixun Wang","Xu Bai","Hao Ai","Renyuan Huang","Zechao Li"],"pdf_url":"https://arxiv.org/pdf/2408.16766v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01574v3","updated":"2024-09-04T10:42:23Z","published":"2023-09-04T12:53:54Z","title":"Object-Size-Driven Design of Convolutional Neural Networks: Virtual Axle\n Detection based on Raw Data","summary":" As infrastructure ages, the need for efficient monitoring methods becomes\nincreasingly critical. Bridge Weigh-In-Motion (BWIM) systems are crucial for\ncost-efficient load and thus residual service life determination of road and\nrailway infrastructure. However, conventional BWIM systems require additional\nsensors for axle detection, which have to be installed in potentially\ninaccessible locations or in locations that interfere with bridge operation.\nThis study addresses this challenge by replacing dedicated axle detectors with\na novel approach to real-time detection of train axles using sensors\narbitrarily placed on bridges. The proposed Virtual Axle Detector with Enhanced\nReceptive Field (VADER) has been validated on a single-track railway bridge,\ndemonstrating that it detects 99.9% of axles with a spatial error of\n3.69cm using only acceleration measurements. Using raw data as input\noutperforms the state-of-the-art spectrogram-based method in both speed and\nmemory usage by 99%, making real-time application feasible for the first time.\nAdditionally, we introduce the Maximum Receptive Field (MRF) rule, a novel\napproach to optimise hyperparameters of Convolutional Neural Networks (CNNs)\nbased on the size of objects, which in this case relates to the fundamental\nfrequency of a bridge. The MRF rule effectively narrows the hyperparameter\nsearch space, potentially replacing the need for extensive hyperparameter\ntuning. Since the MRF rule is theoretically applicable to all unstructured\ndata, it could have implications for a wide range of deep learning problems,\nfrom earthquake prediction to object recognition.\n","authors":["Henik Riedel","Robert Steven Lorenzen","Clemens Hübler"],"pdf_url":"https://arxiv.org/pdf/2309.01574v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02599v1","updated":"2024-09-04T10:30:11Z","published":"2024-09-04T10:30:11Z","title":"A Fashion Item Recommendation Model in Hyperbolic Space","summary":" In this work, we propose a fashion item recommendation model that\nincorporates hyperbolic geometry into user and item representations. Using\nhyperbolic space, our model aims to capture implicit hierarchies among items\nbased on their visual data and users' purchase history. 
During training, we\napply a multi-task learning framework that considers both hyperbolic and\nEuclidean distances in the loss function. Our experiments on three data sets\nshow that our model performs better than previous models trained in Euclidean\nspace only, confirming the effectiveness of our model. Our ablation studies\nshow that multi-task learning plays a key role, and removing the Euclidean loss\nsubstantially deteriorates the model performance.\n","authors":["Ryotaro Shimizu","Yu Wang","Masanari Kimura","Yuki Hirakawa","Takashi Wada","Yuki Saito","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2409.02599v1.pdf","comment":"This work was presented at the CVFAD Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2409.02598v1","updated":"2024-09-04T10:29:59Z","published":"2024-09-04T10:29:59Z","title":"SurgTrack: CAD-Free 3D Tracking of Real-world Surgical Instruments","summary":" Vision-based surgical navigation has received increasing attention due to its\nnon-invasive, cost-effective, and flexible advantages. In particular, a\ncritical element of the vision-based navigation system is tracking surgical\ninstruments. Compared with 2D instrument tracking methods, 3D instrument\ntracking has broader value in clinical practice, but is also more challenging\ndue to weak texture, occlusion, and lack of Computer-Aided Design (CAD) models\nfor 3D registration. To solve these challenges, we propose SurgTrack, a\ntwo-stage 3D instrument tracking method for CAD-free and robust real-world\napplications. In the first registration stage, we incorporate an Instrument\nSigned Distance Field (SDF) modeling the 3D representation of instruments,\nachieving CAD-free 3D registration. Due to this, we can obtain the location\nand orientation of instruments in the 3D space by matching the video stream\nwith the registered SDF model. In the second tracking stage, we devise a\nposture graph optimization module, leveraging the historical tracking results\nof the posture memory pool to optimize the tracking results and improve the\nocclusion robustness. Furthermore, we collect the Instrument3D dataset to\ncomprehensively evaluate the 3D tracking of surgical instruments. The extensive\nexperiments validate the superiority and scalability of our SurgTrack,\noutperforming state-of-the-art methods by a remarkable margin. The code and\ndataset are available at https://github.com/wenwucode/SurgTrack.\n","authors":["Wenwu Guo","Jinlin Wu","Zhen Chen","Qingxiang Zhao","Miao Xu","Zhen Lei","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02584v1","updated":"2024-09-04T10:06:42Z","published":"2024-09-04T10:06:42Z","title":"BMI Prediction from Handwritten English Characters Using a Convolutional\n Neural Network","summary":" A person's Body Mass Index, or BMI, is the most widely used parameter for\nassessing their health. BMI is a crucial predictor of potential diseases that\nmay arise at higher body fat levels because it is correlated with body fat.\nConversely, a community's or an individual's nutritional status can be\ndetermined using the BMI. Although deep learning models are used in several\nstudies to estimate BMI from face photos and other data, no previous research\nestablished a clear connection between deep learning techniques for handwriting\nanalysis and BMI prediction. 
This article addresses this research gap with a\ndeep learning approach to estimating BMI from handwritten characters by\ndeveloping a convolutional neural network (CNN). A dataset containing samples\nfrom 48 people in lowercase English scripts is successfully captured for the\nBMI prediction task. The proposed CNN-based approach reports a commendable\naccuracy of 99.92%. Performance comparison with other popular CNN architectures\nreveals that AlexNet and InceptionV3 achieve the second and third-best\nperformance, with the accuracy of 99.69% and 99.53%, respectively.\n","authors":["N. T. Diba","N. Akter","S. A. H. Chowdhury","J. E. Giti"],"pdf_url":"https://arxiv.org/pdf/2409.02584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02581v1","updated":"2024-09-04T10:03:11Z","published":"2024-09-04T10:03:11Z","title":"Object Gaussian for Monocular 6D Pose Estimation from Sparse Views","summary":" Monocular object pose estimation, as a pivotal task in computer vision and\nrobotics, heavily depends on accurate 2D-3D correspondences, which often demand\ncostly CAD models that may not be readily available. Object 3D reconstruction\nmethods offer an alternative, among which recent advancements in 3D Gaussian\nSplatting (3DGS) afford a compelling potential. Yet its performance still\nsuffers and tends to overfit with fewer input views. Embracing this challenge,\nwe introduce SGPose, a novel framework for sparse view object pose estimation\nusing Gaussian-based methods. Given as few as ten views, SGPose generates a\ngeometric-aware representation by starting with a random cuboid initialization,\neschewing reliance on Structure-from-Motion (SfM) pipeline-derived geometry as\nrequired by traditional 3DGS methods. SGPose removes the dependence on CAD\nmodels by regressing dense 2D-3D correspondences between images and the\nreconstructed model from sparse input and random initialization, while the\ngeometric-consistent depth supervision and online synthetic view warping are\nkey to the success. Experiments on typical benchmarks, especially on the\nOcclusion LM-O dataset, demonstrate that SGPose outperforms existing methods\neven under sparse view constraints, under-scoring its potential in real-world\napplications.\n","authors":["Luqing Luo","Shichu Sun","Jiangang Yang","Linfang Zheng","Jinwei Du","Jian Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02574v1","updated":"2024-09-04T09:48:27Z","published":"2024-09-04T09:48:27Z","title":"Solving Video Inverse Problems Using Image Diffusion Models","summary":" Recently, diffusion model-based inverse problem solvers (DIS) have emerged as\nstate-of-the-art approaches for addressing inverse problems, including image\nsuper-resolution, deblurring, inpainting, etc. However, their application to\nvideo inverse problems arising from spatio-temporal degradation remains largely\nunexplored due to the challenges in training video diffusion models. To address\nthis issue, here we introduce an innovative video inverse solver that leverages\nonly image diffusion models. Specifically, by drawing inspiration from the\nsuccess of the recent decomposed diffusion sampler (DDS), our method treats the\ntime dimension of a video as the batch dimension of image diffusion models and\nsolves spatio-temporal optimization problems within denoised spatio-temporal\nbatches derived from each image diffusion model. 
Moreover, we introduce a\nbatch-consistent diffusion sampling strategy that encourages consistency across\nbatches by synchronizing the stochastic noise components in image diffusion\nmodels. Our approach synergistically combines batch-consistent sampling with\nsimultaneous optimization of denoised spatio-temporal batches at each reverse\ndiffusion step, resulting in a novel and efficient diffusion sampling strategy\nfor video inverse problems. Experimental results demonstrate that our method\neffectively addresses various spatio-temporal degradations in video inverse\nproblems, achieving state-of-the-art reconstructions. Project page:\nhttps://solving-video-inverse.github.io/main/\n","authors":["Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2409.02574v1.pdf","comment":"22 pages, 16 figures"},{"id":"http://arxiv.org/abs/2312.06726v4","updated":"2024-09-04T09:45:08Z","published":"2023-12-11T05:57:09Z","title":"Filter & Align: Leveraging Human Knowledge to Curate Image-Text Data","summary":" The increasing availability of image-text pairs has largely fueled the rapid\nadvancement in vision-language foundation models. However, the vast scale of\nthese datasets inevitably introduces significant variability in data quality,\nwhich can adversely affect the model performance. This highlights the critical\nrole of data filtering, not only to enhance training efficiency but also to\nimprove overall data quality. Existing methods typically rely on metrics such\nas CLIP Score and BLIP Score, which are derived from pre-trained models.\nHowever, these models are often trained on uncurated, noisy datasets, which can\nperpetuate errors and misalignments in the filtered dataset. We present a novel\nalgorithm that incorporates human knowledge on image-text alignment to guide\nfiltering vast corpus of web-crawled image-text datasets into a compact and\nhigh-quality form. To systemically capture human preferences on image-text\nalignments, we collect a diverse image-text dataset where each image is\nassociated with multiple captions from various sources, and establish a\ncomprehensive set of both subjective and objective criteria for critically\nguiding the alignment assessment from labelers. Additionally, we train a reward\nmodel on these human-preference annotations to internalize the nuanced human\nunderstanding of image-text alignment. The resulting reward model thus can act\nas a human-like referee to filter image-text pairs. Extensive experiments\ndemonstrate that we can maintain, sometimes even improve, model performance\nwhile compressing the image-text datasets up to ~90%. An impressive example is\nthat, by aggressively reducing the total training sample from 130M to only\n15.5M, our BLIP-B/16 models consistently show an average improvement of 2.9% on\nretrieval tasks and 11.5% on captioning tasks compared to full-size-dataset\ncounterparts.\n","authors":["Lei Zhang","Fangxun Shu","Tianyang Liu","Sucheng Ren","Hao Jiang","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.06726v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10904v2","updated":"2024-09-04T09:42:07Z","published":"2024-04-16T20:51:36Z","title":"Multi-Task Multi-Modal Self-Supervised Learning for Facial Expression\n Recognition","summary":" Human communication is multi-modal; e.g., face-to-face interaction involves\nauditory signals (speech) and visual signals (face movements and hand\ngestures). 
Hence, it is essential to exploit multiple modalities when designing\nmachine learning-based facial expression recognition systems. In addition,\ngiven the ever-growing quantities of video data that capture human facial\nexpressions, such systems should utilize raw unlabeled videos without requiring\nexpensive annotations. Therefore, in this work, we employ a multitask\nmulti-modal self-supervised learning method for facial expression recognition\nfrom in-the-wild video data. Our model combines three self-supervised objective\nfunctions: First, a multi-modal contrastive loss, that pulls diverse data\nmodalities of the same video together in the representation space. Second, a\nmulti-modal clustering loss that preserves the semantic structure of input data\nin the representation space. Finally, a multi-modal data reconstruction loss.\nWe conduct a comprehensive study on this multimodal multi-task self-supervised\nlearning method on three facial expression recognition benchmarks. To that end,\nwe examine the performance of learning through different combinations of\nself-supervised tasks on the facial expression recognition downstream task. Our\nmodel ConCluGen outperforms several multi-modal self-supervised and fully\nsupervised baselines on the CMU-MOSEI dataset. Our results generally show that\nmulti-modal self-supervision tasks offer large performance gains for\nchallenging tasks such as facial expression recognition, while also reducing\nthe amount of manual annotations required. We release our pre-trained models as\nwell as source code publicly\n","authors":["Marah Halawa","Florian Blume","Pia Bideau","Martin Maier","Rasha Abdel Rahman","Olaf Hellwich"],"pdf_url":"https://arxiv.org/pdf/2404.10904v2.pdf","comment":"The paper will appear in the CVPR 2024 workshops proceedings"},{"id":"http://arxiv.org/abs/2409.02567v1","updated":"2024-09-04T09:35:09Z","published":"2024-09-04T09:35:09Z","title":"Evaluation Study on SAM 2 for Class-agnostic Instance-level Segmentation","summary":" Segment Anything Model (SAM) has demonstrated powerful zero-shot segmentation\nperformance in natural scenes. The recently released Segment Anything Model 2\n(SAM2) has further heightened researchers' expectations towards image\nsegmentation capabilities. To evaluate the performance of SAM2 on\nclass-agnostic instance-level segmentation tasks, we adopt different prompt\nstrategies for SAM2 to cope with instance-level tasks for three relevant\nscenarios: Salient Instance Segmentation (SIS), Camouflaged Instance\nSegmentation (CIS), and Shadow Instance Detection (SID). In addition, to\nfurther explore the effectiveness of SAM2 in segmenting granular object\nstructures, we also conduct detailed tests on the high-resolution Dichotomous\nImage Segmentation (DIS) benchmark to assess the fine-grained segmentation\ncapability. Qualitative and quantitative experimental results indicate that the\nperformance of SAM2 varies significantly across different scenarios. 
Besides,\nSAM2 is not particularly sensitive to segmenting high-resolution fine details.\nWe hope this technique report can drive the emergence of SAM2-based adapters,\naiming to enhance the performance ceiling of large vision models on\nclass-agnostic instance segmentation tasks.\n","authors":["Tiantian Zhang","Zhangjun Zhou","Jialun Pei"],"pdf_url":"https://arxiv.org/pdf/2409.02567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17472v2","updated":"2024-09-04T09:34:13Z","published":"2024-06-25T11:30:31Z","title":"UHD-IQA Benchmark Database: Pushing the Boundaries of Blind Photo\n Quality Assessment","summary":" We introduce a novel Image Quality Assessment (IQA) dataset comprising 6073\nUHD-1 (4K) images, annotated at a fixed width of 3840 pixels. Contrary to\nexisting No-Reference (NR) IQA datasets, ours focuses on highly aesthetic\nphotos of high technical quality, filling a gap in the literature. The images,\ncarefully curated to exclude synthetic content, are sufficiently diverse to\ntrain general NR-IQA models. Importantly, the dataset is annotated with\nperceptual quality ratings obtained through a crowdsourcing study. Ten expert\nraters, comprising photographers and graphics artists, assessed each image at\nleast twice in multiple sessions spanning several days, resulting in 20 highly\nreliable ratings per image. Annotators were rigorously selected based on\nseveral metrics, including self-consistency, to ensure their reliability. The\ndataset includes rich metadata with user and machine-generated tags from over\n5,000 categories and popularity indicators such as favorites, likes, downloads,\nand views. With its unique characteristics, such as its focus on high-quality\nimages, reliable crowdsourced annotations, and high annotation resolution, our\ndataset opens up new opportunities for advancing perceptual image quality\nassessment research and developing practical NR-IQA models that apply to modern\nphotos. Our dataset is available at\nhttps://database.mmsp-kn.de/uhd-iqa-benchmark-database.html\n","authors":["Vlad Hosu","Lorenzo Agnolucci","Oliver Wiedemann","Daisuke Iso","Dietmar Saupe"],"pdf_url":"https://arxiv.org/pdf/2406.17472v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02566v1","updated":"2024-09-04T09:32:40Z","published":"2024-09-04T09:32:40Z","title":"How Do You Perceive My Face? Recognizing Facial Expressions in\n Multi-Modal Context by Modeling Mental Representations","summary":" Facial expression perception in humans inherently relies on prior knowledge\nand contextual cues, contributing to efficient and flexible processing. For\ninstance, multi-modal emotional context (such as voice color, affective text,\nbody pose, etc.) can prompt people to perceive emotional expressions in\nobjectively neutral faces. Drawing inspiration from this, we introduce a novel\napproach for facial expression classification that goes beyond simple\nclassification tasks. Our model accurately classifies a perceived face and\nsynthesizes the corresponding mental representation perceived by a human when\nobserving a face in context. With this, our model offers visual insights into\nits internal decision-making process. We achieve this by learning two\nindependent representations of content and context using a VAE-GAN\narchitecture. Subsequently, we propose a novel attention mechanism for\ncontext-dependent feature adaptation. The adapted representation is used for\nclassification and to generate a context-augmented expression. 
We evaluate\nsynthesized expressions in a human study, showing that our model effectively\nproduces approximations of human mental representations. We achieve\nState-of-the-Art classification accuracies of 81.01% on the RAVDESS dataset and\n79.34% on the MEAD dataset. We make our code publicly available.\n","authors":["Florian Blume","Runfeng Qu","Pia Bideau","Martin Maier","Rasha Abdel Rahman","Olaf Hellwich"],"pdf_url":"https://arxiv.org/pdf/2409.02566v1.pdf","comment":"GCPR 2024"},{"id":"http://arxiv.org/abs/2409.02562v1","updated":"2024-09-04T09:29:24Z","published":"2024-09-04T09:29:24Z","title":"Interacting Multiple Model-based Joint Homography Matrix and Multiple\n Object State Estimation","summary":" A novel MOT algorithm, IMM Joint Homography State Estimation (IMM-JHSE), is\nproposed. By jointly modelling the camera projection matrix as part of track\nstate vectors, IMM-JHSE removes the explicit influence of camera motion\ncompensation techniques on predicted track position states, which was prevalent\nin previous approaches. Expanding upon this, static and dynamic camera motion\nmodels are combined through the use of an IMM filter. A simple bounding box\nmotion model is used to predict bounding box positions to incorporate image\nplane information. In addition to applying an IMM to camera motion, a\nnon-standard IMM approach is applied where bounding-box-based BIoU scores are\nmixed with ground-plane-based Mahalanobis distances in an IMM-like fashion to\nperform association only. Finally, IMM-JHSE makes use of dynamic process and\nmeasurement noise estimation techniques. IMM-JHSE improves upon related\ntechniques on the DanceTrack and KITTI-car datasets, increasing HOTA by 2.64\nand 2.11, respectively, while offering competitive performance on the MOT17,\nMOT20 and KITTI-pedestrian datasets.\n","authors":["Paul Johannes Claasen","Johan Pieter de Villiers"],"pdf_url":"https://arxiv.org/pdf/2409.02562v1.pdf","comment":"Preprint submitted to Information Fusion"},{"id":"http://arxiv.org/abs/2303.17249v4","updated":"2024-09-04T09:27:35Z","published":"2023-03-30T09:29:03Z","title":"Model-agnostic explainable artificial intelligence for object detection\n in image data","summary":" In recent years, deep neural networks have been widely used for building\nhigh-performance Artificial Intelligence (AI) systems for computer vision\napplications. Object detection is a fundamental task in computer vision, which\nhas been greatly progressed through developing large and intricate AI models.\nHowever, the lack of transparency is a big challenge that may not allow the\nwidespread adoption of these models. Explainable artificial intelligence is a\nfield of research where methods are developed to help users understand the\nbehavior, decision logics, and vulnerabilities of AI systems. Previously, few\nexplanation methods were developed for object detection based on random\nmasking. However, random masks may raise some issues regarding the actual\nimportance of pixels within an image. In this paper, we design and implement a\nblack-box explanation method named Black-box Object Detection Explanation by\nMasking (BODEM) through adopting a hierarchical random masking approach for\nobject detection systems. We propose a hierarchical random masking framework in\nwhich coarse-grained masks are used in lower levels to find salient regions\nwithin an image, and fine-grained mask are used to refine the salient regions\nin higher levels. 
Experiments on various object detection datasets and\nmodels showed that BODEM can effectively explain the behavior of object\ndetectors. Moreover, our method outperformed Detector Randomized Input Sampling\nfor Explanation (D-RISE) and Local Interpretable Model-agnostic Explanations\n(LIME) with respect to different quantitative measures of explanation\neffectiveness. The experimental results demonstrate that BODEM can be an\neffective method for explaining and validating object detection systems in\nblack-box testing scenarios.\n","authors":["Milad Moradi","Ke Yan","David Colwell","Matthias Samwald","Rhona Asgari"],"pdf_url":"https://arxiv.org/pdf/2303.17249v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02555v1","updated":"2024-09-04T09:21:13Z","published":"2024-09-04T09:21:13Z","title":"Low-Resolution Object Recognition with Cross-Resolution Relational\n Contrastive Distillation","summary":" Recognizing objects in low-resolution images is a challenging task due to the\nlack of informative details. Recent studies have shown that knowledge\ndistillation approaches can effectively transfer knowledge from a\nhigh-resolution teacher model to a low-resolution student model by aligning\ncross-resolution representations. However, these approaches still face\nlimitations in adapting to the situation where the recognized objects exhibit\nsignificant representation discrepancies between training and testing images.\nIn this study, we propose a cross-resolution relational contrastive\ndistillation approach to facilitate low-resolution object recognition. Our\napproach enables the student model to mimic the behavior of a well-trained\nteacher model which delivers high accuracy in identifying high-resolution\nobjects. To extract sufficient knowledge, the student learning is supervised\nwith a contrastive relational distillation loss, which preserves the similarities\nin various relational structures in the contrastive representation space. In this\nmanner, the capability of recovering missing details of familiar low-resolution\nobjects can be effectively enhanced, leading to better knowledge transfer.\nExtensive experiments on low-resolution object classification and\nlow-resolution face recognition clearly demonstrate the effectiveness and\nadaptability of our approach.\n","authors":["Kangkai Zhang","Shiming Ge","Ruixin Shi","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2409.02555v1.pdf","comment":"This paper is accepted by IEEE Transactions on Circuits and Systems\n for Video Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2409.02546v1","updated":"2024-09-04T09:03:47Z","published":"2024-09-04T09:03:47Z","title":"Real-Time Dynamic Scale-Aware Fusion Detection Network: Take Road Damage\n Detection as an example","summary":" Unmanned Aerial Vehicle (UAV)-based Road Damage Detection (RDD) is important\nfor daily maintenance and safety in cities, especially in terms of\nsignificantly reducing labor costs. However, current UAV-based RDD research\nstill faces many challenges. For example, damage with irregular size and\ndirection, the masking of damage by the background, and the difficulty of\ndistinguishing damage from the background significantly affect the ability of\nUAVs to detect road damage in daily inspections. 
To solve these problems and\nimprove the performance of UAVs in real-time road damage detection, we design\nand propose three corresponding modules: a feature extraction module that\nflexibly adapts to shape and background; a module that fuses multiscale\nperception and adapts to shape and background; and an efficient downsampling\nmodule. Based on these modules, we designed a multi-scale, adaptive road damage\ndetection model with the ability to automatically remove background\ninterference, called the Dynamic Scale-Aware Fusion Detection Model (RT-DSAFDet).\nExperimental results on the UAV-PDD2023 public dataset show that our model\nRT-DSAFDet achieves a mAP50 of 54.2%, which is 11.1% higher than that of\nYOLOv10-m, an efficient variant of the latest real-time object detection model\nYOLOv10, while the number of parameters is reduced to 1.8M and FLOPs to 4.6G,\na decrease of 88% and 93%, respectively. Furthermore, results on the large\ngeneral object detection public dataset MS COCO2017 also show the\nsuperiority of our model: its mAP50-95 is the same as that of YOLOv9-t, but with 0.5%\nhigher mAP50, 10% fewer parameters, and 40% fewer FLOPs.\n","authors":["Weichao Pan","Xu Wang","Wenqing Huan"],"pdf_url":"https://arxiv.org/pdf/2409.02546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13085v2","updated":"2024-09-04T09:02:33Z","published":"2024-08-23T14:12:03Z","title":"Map-Free Visual Relocalization Enhanced by Instance Knowledge and Depth\n Knowledge","summary":" Map-free relocalization technology is crucial for applications in autonomous\nnavigation and augmented reality, but relying on pre-built maps is often\nimpractical. It faces significant challenges due to limitations in matching\nmethods and the inherent lack of scale in monocular images. These issues lead\nto substantial rotational and metric errors and even localization failures in\nreal-world scenarios. Large matching errors significantly impact the overall\nrelocalization process, affecting both rotational and translational accuracy.\nDue to the inherent limitations of the camera itself, recovering the metric\nscale from a single image is crucial, as this significantly impacts the\ntranslation error. To address these challenges, we propose a map-free\nrelocalization method enhanced by instance knowledge and depth knowledge. By\nleveraging instance-based matching information to improve global matching\nresults, our method significantly reduces the possibility of mismatching across\ndifferent objects. The robustness of instance knowledge across the scene helps\nthe feature point matching model focus on relevant regions and enhance matching\naccuracy. Additionally, we use estimated metric depth from a single image to\nreduce metric errors and improve scale recovery accuracy. By integrating\nmethods dedicated to mitigating large translational and rotational errors, our\napproach demonstrates superior performance in map-free relocalization\ntechniques.\n","authors":["Mingyu Xiao","Runze Chen","Haiyong Luo","Fang Zhao","Juan Wang","Xuepeng Ma"],"pdf_url":"https://arxiv.org/pdf/2408.13085v2.pdf","comment":"17 pages,6 figures"},{"id":"http://arxiv.org/abs/2409.02545v1","updated":"2024-09-04T09:02:01Z","published":"2024-09-04T09:02:01Z","title":"UniTT-Stereo: Unified Training of Transformer for Enhanced Stereo\n Matching","summary":" Unlike other vision tasks where Transformer-based approaches are becoming\nincreasingly common, stereo depth estimation is still dominated by\nconvolution-based approaches. 
This is mainly due to the limited availability of\nreal-world ground truth for stereo matching, which is a limiting factor in\nimproving the performance of Transformer-based stereo approaches. In this\npaper, we propose UniTT-Stereo, a method to maximize the potential of\nTransformer-based stereo architectures by unifying self-supervised learning\nused for pre-training with stereo matching framework based on supervised\nlearning. To be specific, we explore the effectiveness of reconstructing\nfeatures of masked portions in an input image and at the same time predicting\ncorresponding points in another image from the perspective of locality\ninductive bias, which is crucial in training models with limited training data.\nMoreover, to address these challenging tasks of reconstruction-and-prediction,\nwe present a new strategy to vary a masking ratio when training the stereo\nmodel with stereo-tailored losses. State-of-the-art performance of UniTT-Stereo\nis validated on various benchmarks such as ETH3D, KITTI 2012, and KITTI 2015\ndatasets. Lastly, to investigate the advantages of the proposed approach, we\nprovide a frequency analysis of feature maps and the analysis of locality\ninductive bias based on attention maps.\n","authors":["Soomin Kim","Hyesong Choi","Jihye Ahn","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.02545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02543v1","updated":"2024-09-04T09:01:21Z","published":"2024-09-04T09:01:21Z","title":"StyleTokenizer: Defining Image Style by a Single Instance for\n Controlling Diffusion Models","summary":" Despite the burst of innovative methods for controlling the diffusion\nprocess, effectively controlling image styles in text-to-image generation\nremains a challenging task. Many adapter-based methods impose image\nrepresentation conditions on the denoising process to accomplish image control.\nHowever these conditions are not aligned with the word embedding space, leading\nto interference between image and text control conditions and the potential\nloss of semantic information from the text prompt. Addressing this issue\ninvolves two key challenges. Firstly, how to inject the style representation\nwithout compromising the effectiveness of text representation in control.\nSecondly, how to obtain the accurate style representation from a single\nreference image. To tackle these challenges, we introduce StyleTokenizer, a\nzero-shot style control image generation method that aligns style\nrepresentation with text representation using a style tokenizer. This alignment\neffectively minimizes the impact on the effectiveness of text prompts.\nFurthermore, we collect a well-labeled style dataset named Style30k to train a\nstyle feature extractor capable of accurately representing style while\nexcluding other content information. Experimental results demonstrate that our\nmethod fully grasps the style characteristics of the reference image,\ngenerating appealing images that are consistent with both the target image\nstyle and text prompt. 
The code and dataset are available at\nhttps://github.com/alipay/style-tokenizer.\n","authors":["Wen Li","Muyuan Fang","Cheng Zou","Biao Gong","Ruobing Zheng","Meng Wang","Jingdong Chen","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2409.02543v1.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2409.02529v1","updated":"2024-09-04T08:42:42Z","published":"2024-09-04T08:42:42Z","title":"Sample what you cant compress","summary":" For learned image representations, basic autoencoders often produce blurry\nresults. Reconstruction quality can be improved by incorporating additional\npenalties such as adversarial (GAN) and perceptual losses. Arguably, these\napproaches lack a principled interpretation. Concurrently, in generative\nsettings diffusion has demonstrated a remarkable ability to create crisp, high\nquality results and has solid theoretical underpinnings (from variational\ninference to direct study as the Fisher Divergence). Our work combines\nautoencoder representation learning with diffusion and is, to our knowledge,\nthe first to demonstrate the efficacy of jointly learning a continuous encoder\nand decoder under a diffusion-based loss. We demonstrate that this approach\nyields better reconstruction quality as compared to GAN-based autoencoders\nwhile being easier to tune. We also show that the resulting representation is\neasier to model with a latent diffusion model as compared to the representation\nobtained from a state-of-the-art GAN-based loss. Since our decoder is\nstochastic, it can generate details not encoded in the otherwise deterministic\nlatent representation; we therefore name our approach \"Sample what you can't\ncompress\", or SWYCC for short.\n","authors":["Vighnesh Birodkar","Gabriel Barcik","James Lyon","Sergey Ioffe","David Minnen","Joshua V. Dillon"],"pdf_url":"https://arxiv.org/pdf/2409.02529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11965v2","updated":"2024-09-04T08:35:32Z","published":"2024-08-21T19:36:27Z","title":"CT-AGRG: Automated Abnormality-Guided Report Generation from 3D Chest CT\n Volumes","summary":" The rapid increase of computed tomography (CT) scans and their time-consuming\nmanual analysis have created an urgent need for robust automated analysis\ntechniques in clinical settings. These aim to assist radiologists and help them\nmanaging their growing workload. Existing methods typically generate entire\nreports directly from 3D CT images, without explicitly focusing on observed\nabnormalities. This unguided approach often results in repetitive content or\nincomplete reports, failing to prioritize anomaly-specific descriptions. We\npropose a new anomaly-guided report generation model, which first predicts\nabnormalities and then generates targeted descriptions for each. Evaluation on\na public dataset demonstrates significant improvements in report quality and\nclinical relevance. We extend our work by conducting an ablation study to\ndemonstrate its effectiveness.\n","authors":["Theo Di Piazza"],"pdf_url":"https://arxiv.org/pdf/2408.11965v2.pdf","comment":"15 pages, 9 figures, submitted to ISBI 2025"},{"id":"http://arxiv.org/abs/2409.02513v1","updated":"2024-09-04T08:24:53Z","published":"2024-09-04T08:24:53Z","title":"SG-MIM: Structured Knowledge Guided Efficient Pre-training for Dense\n Prediction","summary":" Masked Image Modeling (MIM) techniques have redefined the landscape of\ncomputer vision, enabling pre-trained models to achieve exceptional performance\nacross a broad spectrum of tasks. 
Despite their success, the full potential of\nMIM-based methods in dense prediction tasks, particularly in depth estimation,\nremains untapped. Existing MIM approaches primarily rely on single-image\ninputs, which makes it challenging to capture the crucial structured\ninformation, leading to suboptimal performance in tasks requiring fine-grained\nfeature representation. To address these limitations, we propose SG-MIM, a\nnovel Structured knowledge Guided Masked Image Modeling framework designed to\nenhance dense prediction tasks by utilizing structured knowledge alongside\nimages. SG-MIM employs a lightweight relational guidance framework, allowing it\nto guide structured knowledge individually at the feature level rather than\nnaively combining it at the pixel level within the same architecture, as is common\nin traditional multi-modal pre-training methods. This approach enables the\nmodel to efficiently capture essential information while minimizing\ndiscrepancies between pre-training and downstream tasks. Furthermore, SG-MIM\nemploys a selective masking strategy to incorporate structured knowledge,\nmaximizing the synergy between general representation learning and structured\nknowledge-specific learning. Our method requires no additional annotations,\nmaking it a versatile and efficient solution for a wide range of applications.\nOur evaluations on the KITTI, NYU-v2, and ADE20k datasets demonstrate SG-MIM's\nsuperiority in monocular depth estimation and semantic segmentation.\n","authors":["Sumin Son","Hyesong Choi","Dongbo Min"],"pdf_url":"https://arxiv.org/pdf/2409.02513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03651v2","updated":"2024-09-04T08:23:00Z","published":"2024-08-07T09:30:51Z","title":"Path-SAM2: Transfer SAM2 for digital pathology semantic segmentation","summary":" The semantic segmentation task in pathology plays an indispensable role in\nassisting physicians in determining the condition of tissue lesions. With the\nproposal of the Segment Anything Model (SAM), more and more foundation models have\nseen rapid development in the field of image segmentation. Recently, SAM2 has\ngarnered widespread attention in both natural image and medical image\nsegmentation. Compared to SAM, it has significantly improved in terms of\nsegmentation accuracy and generalization performance. We compared the\nfoundational models based on SAM and found that their performance in semantic\nsegmentation of pathological images was hardly satisfactory. In this paper, we\npropose Path-SAM2, which for the first time adapts the SAM2 model to cater to\nthe task of pathological semantic segmentation. We integrate the largest\npretrained vision encoder for histopathology (UNI) with the original SAM2\nencoder, adding more pathology-based prior knowledge. Additionally, we\nintroduce a learnable Kolmogorov-Arnold Network (KAN) classification module to\nreplace the manual prompt process. On three adenoma pathology datasets,\nPath-SAM2 has achieved state-of-the-art performance. This study demonstrates the\ngreat potential of adapting SAM2 to pathology image segmentation tasks. 
We plan\nto release the code and model weights for this paper at:\nhttps://github.com/simzhangbest/SAM2PATH\n","authors":["Mingya Zhang","Liang Wang","Zhihao Chen","Yiyuan Ge","Xianping Tao"],"pdf_url":"https://arxiv.org/pdf/2408.03651v2.pdf","comment":"5 pages , 5 figures"},{"id":"http://arxiv.org/abs/2409.02508v1","updated":"2024-09-04T08:08:21Z","published":"2024-09-04T08:08:21Z","title":"TLD: A Vehicle Tail Light signal Dataset and Benchmark","summary":" Understanding other drivers' intentions is crucial for safe driving. The role\nof taillights in conveying these intentions is underemphasized in current\nautonomous driving systems. Accurately identifying taillight signals is\nessential for predicting vehicle behavior and preventing collisions.\nOpen-source taillight datasets are scarce, often small and inconsistently\nannotated. To address this gap, we introduce a new large-scale taillight\ndataset called TLD. Sourced globally, our dataset covers diverse traffic\nscenarios. To our knowledge, TLD is the first dataset to separately annotate\nbrake lights and turn signals in real driving scenarios. We collected 17.78\nhours of driving videos from the internet. This dataset consists of 152k\nlabeled image frames sampled at a rate of 2 Hz, along with 1.5 million\nunlabeled frames interspersed throughout. Additionally, we have developed a\ntwo-stage vehicle light detection model consisting of two primary modules: a\nvehicle detector and a taillight classifier. Initially, YOLOv10 and DeepSORT\ncaptured consecutive vehicle images over time. Subsequently, the two\nclassifiers work simultaneously to determine the states of the brake lights and\nturn signals. A post-processing procedure is then used to eliminate noise\ncaused by misidentifications and provide the taillight states of the vehicle\nwithin a given time frame. Our method shows exceptional performance on our\ndataset, establishing a benchmark for vehicle taillight detection. The dataset\nis available at https://huggingface.co/datasets/ChaiJohn/TLD/tree/main\n","authors":["Jinhao Chai","Shiyi Mu","Shugong Xu"],"pdf_url":"https://arxiv.org/pdf/2409.02508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02497v1","updated":"2024-09-04T07:46:42Z","published":"2024-09-04T07:46:42Z","title":"A Learnable Color Correction Matrix for RAW Reconstruction","summary":" Autonomous driving algorithms usually employ sRGB images as model input due\nto their compatibility with the human visual system. However, visually pleasing\nsRGB images are possibly sub-optimal for downstream tasks when compared to RAW\nimages. The availability of RAW images is constrained by the difficulties in\ncollecting real-world driving data and the associated challenges of annotation.\nTo address this limitation and support research in RAW-domain driving\nperception, we design a novel and ultra-lightweight RAW reconstruction method.\nThe proposed model introduces a learnable color correction matrix (CCM), which\nuses only a single convolutional layer to approximate the complex inverse image\nsignal processor (ISP). 
Experimental results demonstrate that simulated RAW\n(simRAW) images generated by our method provide performance improvements\nequivalent to those produced by more complex inverse ISP methods when\npretraining RAW-domain object detectors, which highlights the effectiveness and\npracticality of our approach.\n","authors":["Anqi Liu","Shiyi Mu","Shugong Xu"],"pdf_url":"https://arxiv.org/pdf/2409.02497v1.pdf","comment":"Accepted by BMVC2024"},{"id":"http://arxiv.org/abs/2409.02494v1","updated":"2024-09-04T07:45:06Z","published":"2024-09-04T07:45:06Z","title":"Plane2Depth: Hierarchical Adaptive Plane Guidance for Monocular Depth\n Estimation","summary":" Monocular depth estimation aims to infer a dense depth map from a single\nimage, which is a fundamental and prevalent task in computer vision. Many\nprevious works have shown impressive depth estimation results through carefully\ndesigned network structures, but they usually ignore the planar information and\ntherefore perform poorly in low-texture areas of indoor scenes. In this paper,\nwe propose Plane2Depth, which adaptively utilizes plane information to improve\ndepth prediction within a hierarchical framework. Specifically, in the proposed\nplane guided depth generator (PGDG), we design a set of plane queries as\nprototypes to softly model planes in the scene and predict per-pixel plane\ncoefficients. Then the predicted plane coefficients can be converted into\nmetric depth values with the pinhole camera model. In the proposed adaptive\nplane query aggregation (APGA) module, we introduce a novel feature interaction\napproach to improve the aggregation of multi-scale plane features in a top-down\nmanner. Extensive experiments show that our method can achieve outstanding\nperformance, especially in low-texture or repetitive areas. Furthermore, under\nthe same backbone network, our method outperforms the state-of-the-art methods\non the NYU-Depth-v2 dataset, achieves competitive results with state-of-the-art\nmethods on the KITTI dataset, and can be generalized to unseen scenes effectively.\n","authors":["Li Liu","Ruijie Zhu","Jiacheng Deng","Ziyang Song","Wenfei Yang","Tianzhu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02494v1.pdf","comment":"14 pages, 12 figures, 8 tables"},{"id":"http://arxiv.org/abs/2207.13137v2","updated":"2024-09-04T07:41:41Z","published":"2022-07-19T03:58:00Z","title":"Bayesian Evidential Learning for Few-Shot Classification","summary":" Few-Shot Classification (FSC) aims to generalize from base classes to novel\nclasses given very limited labeled samples, which is an important step on the\npath toward human-like machine learning. State-of-the-art solutions involve\nlearning to find a good metric and representation space to compute the distance\nbetween samples. Despite the promising accuracy performance, how to model\nuncertainty for metric-based FSC methods effectively is still a challenge. To\nmodel uncertainty, we place a distribution over class probabilities based on the\ntheory of evidence. As a result, uncertainty modeling and metric learning can\nbe decoupled. To reduce the uncertainty of classification, we propose a\nBayesian evidence fusion theorem. Given observed samples, the network learns to\nget posterior distribution parameters given the prior parameters produced by\nthe pre-trained network. Detailed gradient analysis shows that our method\nprovides a smooth optimization target and can capture the uncertainty. 
The\nproposed method is agnostic to metric learning strategies and can be\nimplemented as a plug-and-play module. We integrate our method into several\nof the newest FSC methods and demonstrate the improved accuracy and uncertainty\nquantification on standard FSC benchmarks.\n","authors":["Xiongkun Linghu","Yan Bai","Yihang Lou","Shengsen Wu","Jinze Li","Jianzhong He","Tao Bai"],"pdf_url":"https://arxiv.org/pdf/2207.13137v2.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2409.02492v1","updated":"2024-09-04T07:35:12Z","published":"2024-09-04T07:35:12Z","title":"Reliable Deep Diffusion Tensor Estimation: Rethinking the Power of\n Data-Driven Optimization Routine","summary":" Diffusion tensor imaging (DTI) holds significant importance in clinical\ndiagnosis and neuroscience research. However, conventional model-based fitting\nmethods often suffer from sensitivity to noise, leading to decreased accuracy\nin estimating DTI parameters. While traditional data-driven deep learning\nmethods have shown potential in terms of accuracy and efficiency, their limited\ngeneralization to out-of-training-distribution data impedes their broader\napplication due to the diverse scan protocols used across centers, scanners,\nand studies. This work aims to tackle these challenges and promote the use of\nDTI by introducing a data-driven optimization-based method termed DoDTI. DoDTI\ncombines the weighted linear least squares fitting algorithm with the regularization\nby denoising technique. The former fits DW images from diverse acquisition\nsettings into the diffusion tensor field, while the latter applies a deep\nlearning-based denoiser to regularize the diffusion tensor field instead of the\nDW images, which is free from the limitation of fixed-channel assignment of the\nnetwork. The optimization objective is solved using the alternating direction\nmethod of multipliers and then unrolled to construct a deep neural network,\nleveraging a data-driven strategy to learn network parameters. Extensive\nvalidation experiments are conducted utilizing both internally simulated\ndatasets and externally obtained in-vivo datasets. The results, encompassing\nboth qualitative and quantitative analyses, showcase that the proposed method\nattains state-of-the-art performance in DTI parameter estimation. Notably, it\ndemonstrates superior generalization, accuracy, and efficiency, rendering it\nhighly reliable for widespread application in the field.\n","authors":["Jialong Li","Zhicheng Zhang","Yunwei Chen","Qiqi Lu","Ye Wu","Xiaoming Liu","QianJin Feng","Yanqiu Feng","Xinyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02490v1","updated":"2024-09-04T07:33:09Z","published":"2024-09-04T07:33:09Z","title":"TP-GMOT: Tracking Generic Multiple Object by Textual Prompt with\n Motion-Appearance Cost (MAC) SORT","summary":" While Multi-Object Tracking (MOT) has made substantial advancements, it is\nlimited by heavy reliance on prior knowledge and limited to predefined\ncategories. In contrast, Generic Multiple Object Tracking (GMOT), tracking\nmultiple objects with similar appearance, requires less prior information about\nthe targets but faces challenges with variants like viewpoint, lighting,\nocclusion, and resolution. Our contributions commence with the introduction of\nthe \textbf{\text{Refer-GMOT dataset}}, a collection of videos, each accompanied\nby fine-grained textual descriptions of their attributes. 
Subsequently, we\nintroduce a novel text prompt-based open-vocabulary GMOT framework, called\n\\textbf{\\text{TP-GMOT}}, which can track never-seen object categories with zero\ntraining examples. Within \\text{TP-GMOT} framework, we introduce two novel\ncomponents: (i) {\\textbf{\\text{TP-OD}}, an object detection by a textual\nprompt}, for accurately detecting unseen objects with specific characteristics.\n(ii) Motion-Appearance Cost SORT \\textbf{\\text{MAC-SORT}}, a novel object\nassociation approach that adeptly integrates motion and appearance-based\nmatching strategies to tackle the complex task of tracking multiple generic\nobjects with high similarity. Our contributions are benchmarked on the\n\\text{Refer-GMOT} dataset for GMOT task. Additionally, to assess the\ngeneralizability of the proposed \\text{TP-GMOT} framework and the effectiveness\nof \\text{MAC-SORT} tracker, we conduct ablation studies on the DanceTrack and\nMOT20 datasets for the MOT task. Our dataset, code, and models will be publicly\navailable at: https://fsoft-aic.github.io/TP-GMOT\n","authors":["Duy Le Dinh Anh","Kim Hoang Tran","Ngan Hoang Le"],"pdf_url":"https://arxiv.org/pdf/2409.02490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02486v1","updated":"2024-09-04T07:25:50Z","published":"2024-09-04T07:25:50Z","title":"Boosting Generalizability towards Zero-Shot Cross-Dataset Single-Image\n Indoor Depth by Meta-Initialization","summary":" Indoor robots rely on depth to perform tasks like navigation or obstacle\ndetection, and single-image depth estimation is widely used to assist\nperception. Most indoor single-image depth prediction focuses less on model\ngeneralizability to unseen datasets, concerned with in-the-wild robustness for\nsystem deployment. This work leverages gradient-based meta-learning to gain\nhigher generalizability on zero-shot cross-dataset inference. Unlike the\nmost-studied meta-learning of image classification associated with explicit\nclass labels, no explicit task boundaries exist for continuous depth values\ntied to highly varying indoor environments regarding object arrangement and\nscene composition. We propose fine-grained task that treats each RGB-D\nmini-batch as a task in our meta-learning formulation. We first show that our\nmethod on limited data induces a much better prior (max 27.8% in RMSE). Then,\nfinetuning on meta-learned initialization consistently outperforms baselines\nwithout the meta approach. Aiming at generalization, we propose zero-shot\ncross-dataset protocols and validate higher generalizability induced by our\nmeta-initialization, as a simple and useful plugin to many existing depth\nestimation methods. The work at the intersection of depth and meta-learning\npotentially drives both research to step closer to practical robotic and\nmachine perception usage.\n","authors":["Cho-Ying Wu","Yiqi Zhong","Junying Wang","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/2409.02486v1.pdf","comment":"IROS 2024. The version supersedes 2305.07269. arXiv admin note: text\n overlap with arXiv:2305.07269"},{"id":"http://arxiv.org/abs/2409.02483v1","updated":"2024-09-04T07:20:01Z","published":"2024-09-04T07:20:01Z","title":"TASAR: Transferable Attack on Skeletal Action Recognition","summary":" Skeletal sequences, as well-structured representations of human behaviors,\nare crucial in Human Activity Recognition (HAR). 
The transferability of\nadversarial skeletal sequences enables attacks in real-world HAR scenarios,\nsuch as autonomous driving, intelligent surveillance, and human-computer\ninteractions. However, existing Skeleton-based HAR (S-HAR) attacks exhibit weak\nadversarial transferability and, therefore, cannot be considered true\ntransfer-based S-HAR attacks. More importantly, the reason for this failure\nremains unclear. In this paper, we study this phenomenon through the lens of\nloss surface, and find that its sharpness contributes to the poor\ntransferability in S-HAR. Inspired by this observation, we assume and\nempirically validate that smoothening the rugged loss landscape could\npotentially improve adversarial transferability in S-HAR. To this end, we\npropose the first Transfer-based Attack on Skeletal Action Recognition, TASAR.\nTASAR explores the smoothed model posterior without re-training the pre-trained\nsurrogates, which is achieved by a new post-train Dual Bayesian optimization\nstrategy. Furthermore, unlike previous transfer-based attacks that treat each\nframe independently and overlook temporal coherence within sequences, TASAR\nincorporates motion dynamics into the Bayesian attack gradient, effectively\ndisrupting the spatial-temporal coherence of S-HARs. To exhaustively evaluate\nthe effectiveness of existing methods and our method, we build the first\nlarge-scale robust S-HAR benchmark, comprising 7 S-HAR models, 10 attack\nmethods, 3 S-HAR datasets and 2 defense models. Extensive results demonstrate\nthe superiority of TASAR. Our benchmark enables easy comparisons for future\nstudies, with the code available in the supplementary material.\n","authors":["Yunfeng Diao","Baiqi Wu","Ruixuan Zhang","Ajian Liu","Xingxing Wei","Meng Wang","He Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02483v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2407.08572"},{"id":"http://arxiv.org/abs/2409.02482v1","updated":"2024-09-04T07:18:26Z","published":"2024-09-04T07:18:26Z","title":"Volumetric Surfaces: Representing Fuzzy Geometries with Multiple Meshes","summary":" High-quality real-time view synthesis methods are based on volume rendering,\nsplatting, or surface rendering. While surface-based methods generally are the\nfastest, they cannot faithfully model fuzzy geometry like hair. In turn,\nalpha-blending techniques excel at representing fuzzy materials but require an\nunbounded number of samples per ray (P1). Further overheads are induced by\nempty space skipping in volume rendering (P2) and sorting input primitives in\nsplatting (P3). These problems are exacerbated on low-performance graphics\nhardware, e.g. on mobile devices. We present a novel representation for\nreal-time view synthesis where the (P1) number of sampling locations is small\nand bounded, (P2) sampling locations are efficiently found via rasterization,\nand (P3) rendering is sorting-free. We achieve this by representing objects as\nsemi-transparent multi-layer meshes, rendered in fixed layer order from\noutermost to innermost. We model mesh layers as SDF shells with optimal spacing\nlearned during training. After baking, we fit UV textures to the corresponding\nmeshes. 
We show that our method can represent challenging fuzzy objects while\nachieving higher frame rates than volume-based and splatting-based methods on\nlow-end and mobile devices.\n","authors":["Stefano Esposito","Anpei Chen","Christian Reiser","Samuel Rota Bulò","Lorenzo Porzi","Katja Schwarz","Christian Richardt","Michael Zollhöfer","Peter Kontschieder","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2409.02482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11936v2","updated":"2024-09-04T07:17:15Z","published":"2024-03-18T16:34:38Z","title":"AI-Assisted Cervical Cancer Screening","summary":" Visual Inspection with Acetic Acid (VIA) remains the most feasible cervical\ncancer screening test in resource-constrained settings of low- and\nmiddle-income countries (LMICs), where it is often performed in screening camps or\nprimary/community health centers by nurses instead of the preferred but\nunavailable expert gynecologists. To address the highly subjective nature of the\ntest, various handheld devices integrating cameras or smartphones have been\nrecently explored to capture cervical images during VIA and aid decision-making\nvia telemedicine or AI models. Most studies proposing AI models retrospectively\nuse a relatively small number of already collected images from specific\ndevices, digital cameras, or smartphones; the challenges and protocol for\nquality image acquisition during VIA in resource-constrained camp settings,\nchallenges in obtaining a gold standard, data imbalance, etc. are often overlooked.\nWe present a novel approach and describe the end-to-end design process to build\na robust smartphone-based AI-assisted system that does not require buying a\nseparate integrated device: the proposed protocol for quality image acquisition\nin resource-constrained settings, a dataset collected from 1,430 women during VIA\nperformed by nurses in screening camps, a preprocessing pipeline, and the training\nand evaluation of a deep-learning-based classification model aimed at identifying\n(pre)cancerous lesions. Our work shows that readily available smartphones\nand a suitable protocol can capture the cervix images with the required details\nfor the VIA test well; the deep-learning-based classification model provides\npromising results to assist nurses in VIA screening; and it provides a direction\nfor large-scale data collection and validation in resource-constrained\nsettings.\n","authors":["Kanchan Poudel","Lisasha Poudel","Prabin Raj Shakya","Atit Poudel","Archana Shrestha","Bishesh Khanal"],"pdf_url":"https://arxiv.org/pdf/2403.11936v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13393v3","updated":"2024-09-04T06:32:00Z","published":"2024-06-19T09:36:18Z","title":"Style-NeRF2NeRF: 3D Style Transfer From Style-Aligned Multi-View Images","summary":" We propose a simple yet effective pipeline for stylizing a 3D scene,\nharnessing the power of 2D image diffusion models. Given a NeRF model\nreconstructed from a set of multi-view images, we perform 3D style transfer by\nrefining the source NeRF model using stylized images generated by a\nstyle-aligned image-to-image diffusion model. Given a target style prompt, we\nfirst generate perceptually similar multi-view images by leveraging a\ndepth-conditioned diffusion model with an attention-sharing mechanism. Next,\nbased on the stylized multi-view images, we propose to guide the style transfer\nprocess with the sliced Wasserstein loss based on the feature maps extracted\nfrom a pre-trained CNN model. 
Our pipeline consists of decoupled steps,\nallowing users to test various prompt ideas and preview the stylized 3D result\nbefore proceeding to the NeRF fine-tuning stage. We demonstrate that our method\ncan transfer diverse artistic styles to real-world 3D scenes with competitive\nquality. Result videos are also available on our project page:\nhttps://haruolabs.github.io/style-n2n/\n","authors":["Haruo Fujiwara","Yusuke Mukuta","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2406.13393v3.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2405.09777v3","updated":"2024-09-04T06:09:15Z","published":"2024-05-16T02:46:19Z","title":"Rethinking Barely-Supervised Volumetric Medical Image Segmentation from\n an Unsupervised Domain Adaptation Perspective","summary":" This paper investigates an extremely challenging problem: barely-supervised\nvolumetric medical image segmentation (BSS). A BSS training dataset consists of\ntwo parts: 1) a barely-annotated labeled set, where each labeled image contains\nonly a single-slice annotation, and 2) an unlabeled set comprising numerous\nunlabeled volumetric images. State-of-the-art BSS methods employ a\nregistration-based paradigm, which uses inter-slice image registration to\npropagate single-slice annotations into volumetric pseudo labels, constructing\na completely annotated labeled set, to which a semi-supervised segmentation\nscheme can be applied. However, the paradigm has a critical limitation: the\npseudo-labels generated by image registration are unreliable and noisy.\nMotivated by this, we propose a new perspective: instead of solving BSS within\na semi-supervised learning scheme, this work formulates BSS as an unsupervised\ndomain adaptation problem. To this end, we propose a novel BSS framework,\n\\textbf{B}arely-supervised learning \\textbf{via} unsupervised domain\n\\textbf{A}daptation (BvA), as an alternative to the dominant registration\nparadigm. Specifically, we first design a novel noise-free labeled data\nconstruction algorithm (NFC) for slice-to-volume labeled data synthesis. Then,\nwe introduce a frequency and spatial Mix-Up strategy (FSX) to mitigate the\ndomain shifts. Extensive experiments demonstrate that our method provides a\npromising alternative for BSS. Remarkably, the proposed method, trained on the\nleft atrial segmentation dataset with \\textbf{only one} barely-labeled image,\nachieves a Dice score of 81.20%, outperforming the state-of-the-art by 61.71%.\nThe code is available at https://github.com/Senyh/BvA.\n","authors":["Zhiqiang Shen","Peng Cao","Junming Su","Jinzhu Yang","Osmar R. Zaiane"],"pdf_url":"https://arxiv.org/pdf/2405.09777v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06170v2","updated":"2024-09-04T05:53:17Z","published":"2024-08-12T14:16:10Z","title":"Zero-shot 3D Segmentation of Abdominal Organs in CT Scans Using Segment\n Anything Model 2: Adapting Video Tracking Capabilities for 3D Medical Imaging","summary":" Purpose:\n To evaluate the zero-shot performance of Segment Anything Model 2 (SAM 2) in\n3D segmentation of abdominal organs in CT scans, and to investigate the effects\nof prompt settings on segmentation results.\n Materials and Methods:\n Using a subset of the TotalSegmentator CT dataset (n = 123) from eight\ninstitutions, we assessed SAM 2's ability to segment eight abdominal organs.\nSegmentation was initiated from three different z-coordinate levels (caudal,\nmid, and cranial levels) of each organ. Performance was measured using the Dice\nsimilarity coefficient (DSC). 
We also analyzed the impact of \"negative\nprompts,\" which explicitly exclude certain regions from the segmentation\nprocess, on accuracy. Additionally, we analyzed organ volumes to contextualize\nthe segmentation performance.\n Results:\n As a zero-shot approach, larger organs with clear boundaries demonstrated\nhigh segmentation performance, with mean(median) DSCs as follows: liver\n0.821(0.898), left kidney 0.870(0.921), right kidney 0.862(0.935), and spleen\n0.891(0.932). Smaller organs showed lower performance: gallbladder\n0.531(0.590), pancreas 0.361(0.359), and adrenal glands, right 0.203(0.109),\nleft 0.308(0.231). The initial slice for segmentation and the use of negative\nprompts significantly influenced the results. By removing negative prompts from\nthe input, the DSCs significantly decreased for six organs. Moderate positive\ncorrelations were observed between volume sizes and DSCs.\n Conclusion:\n SAM 2 demonstrated promising zero-shot performance in segmenting certain\nabdominal organs in CT scans, particularly larger organs with clear boundaries.\nPerformance was significantly influenced by input negative prompts and initial\nslice selection, highlighting the importance of optimizing these factors for\neffective segmentation.\n","authors":["Yosuke Yamagishi","Shouhei Hanaoka","Tomohiro Kikuchi","Takahiro Nakao","Yuta Nakamura","Yukihiro Nomura","Soichiro Miki","Takeharu Yoshikawa","Osamu Abe"],"pdf_url":"https://arxiv.org/pdf/2408.06170v2.pdf","comment":"20 pages, 7 figures (including 2 supplemental figure), 4 tables"},{"id":"http://arxiv.org/abs/2405.08621v3","updated":"2024-09-04T05:28:51Z","published":"2024-05-14T14:01:15Z","title":"RMT-BVQA: Recurrent Memory Transformer-based Blind Video Quality\n Assessment for Enhanced Video Content","summary":" With recent advances in deep learning, numerous algorithms have been\ndeveloped to enhance video quality, reduce visual artifacts, and improve\nperceptual quality. However, little research has been reported on the quality\nassessment of enhanced content - the evaluation of enhancement methods is often\nbased on quality metrics that were designed for compression applications. In\nthis paper, we propose a novel blind deep video quality assessment (VQA) method\nspecifically for enhanced video content. It employs a new Recurrent Memory\nTransformer (RMT) based network architecture to obtain video quality\nrepresentations, which is optimized through a novel content-quality-aware\ncontrastive learning strategy based on a new database containing 13K training\npatches with enhanced content. The extracted quality representations are then\ncombined through linear regression to generate video-level quality indices. 
The\nproposed method, RMT-BVQA, has been evaluated on the VDPVE (VQA Dataset for\nPerceptual Video Enhancement) database through a five-fold cross validation.\nThe results show its superior correlation performance when compared to ten\nexisting no-reference quality metrics.\n","authors":["Tianhao Peng","Chen Feng","Duolikun Danier","Fan Zhang","Benoit Vallade","Alex Mackin","David Bull"],"pdf_url":"https://arxiv.org/pdf/2405.08621v3.pdf","comment":"This paper has been accepted by the ECCV 2024 AIM Advances in Image\n Manipulation workshop"},{"id":"http://arxiv.org/abs/2405.04274v2","updated":"2024-09-04T05:24:25Z","published":"2024-05-07T12:42:23Z","title":"Group-aware Parameter-efficient Updating for Content-Adaptive Neural\n Video Compression","summary":" Content-adaptive compression is crucial for enhancing the adaptability of the\npre-trained neural codec for various contents. Although these methods have been\nvery practical in neural image compression (NIC), their application in neural\nvideo compression (NVC) is still limited due to two main aspects: 1), video\ncompression relies heavily on temporal redundancy, therefore updating just one\nor a few frames can lead to significant errors accumulating over time; 2), NVC\nframeworks are generally more complex, with many large components that are not\neasy to update quickly during encoding. To address the previously mentioned\nchallenges, we have developed a content-adaptive NVC technique called\nGroup-aware Parameter-Efficient Updating (GPU). Initially, to minimize error\naccumulation, we adopt a group-aware approach for updating encoder parameters.\nThis involves adopting a patch-based Group of Pictures (GoP) training strategy\nto segment a video into patch-based GoPs, which will be updated to facilitate a\nglobally optimized domain-transferable solution. Subsequently, we introduce a\nparameter-efficient delta-tuning strategy, which is achieved by integrating\nseveral light-weight adapters into each coding component of the encoding\nprocess by both serial and parallel configuration. Such architecture-agnostic\nmodules stimulate the components with large parameters, thereby reducing both\nthe update cost and the encoding time. We incorporate our GPU into the latest\nNVC framework and conduct comprehensive experiments, whose results showcase\noutstanding video compression efficiency across four video benchmarks and\nadaptability of one medical image benchmark.\n","authors":["Zhenghao Chen","Luping Zhou","Zhihao Hu","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2405.04274v2.pdf","comment":"Accepted by ACM MM 2024, Melbourne, Australia"},{"id":"http://arxiv.org/abs/2409.02453v1","updated":"2024-09-04T05:19:57Z","published":"2024-09-04T05:19:57Z","title":"FrameCorr: Adaptive, Autoencoder-based Neural Compression for Video\n Reconstruction in Resource and Timing Constrained Network Settings","summary":" Despite the growing adoption of video processing via Internet of Things (IoT)\ndevices due to their cost-effectiveness, transmitting captured data to nearby\nservers poses challenges due to varying timing constraints and scarcity of\nnetwork bandwidth. Existing video compression methods face difficulties in\nrecovering compressed data when incomplete data is provided. 
Here, we introduce\n\emph{\project}, a deep-learning based solution that utilizes previously\nreceived data to predict the missing segments of a frame, enabling the\nreconstruction of a frame from partially received data.\n","authors":["John Li","Shehab Sarar Ahmed","Deepak Nair"],"pdf_url":"https://arxiv.org/pdf/2409.02453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02448v1","updated":"2024-09-04T05:06:34Z","published":"2024-09-04T05:06:34Z","title":"Detecting Korean Food Using Image using Hierarchical Model","summary":" A solution was made available for Korean food lovers with dietary\nrestrictions to identify Korean dishes before consuming them. Just by uploading a\nclear photo of the dish, people can learn what they are eating. Image\nprocessing techniques together with machine learning were used to build\nthis solution.\n","authors":["Hoang Khanh Lam","Kahandakanaththage Maduni Pramuditha Perera"],"pdf_url":"https://arxiv.org/pdf/2409.02448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02438v1","updated":"2024-09-04T04:29:49Z","published":"2024-09-04T04:29:49Z","title":"Non-target Divergence Hypothesis: Toward Understanding Domain Gaps in\n Cross-Modal Knowledge Distillation","summary":" Compared to single-modal knowledge distillation, cross-modal knowledge\ndistillation faces more severe challenges due to domain gaps between\nmodalities. Although various methods have been proposed to\novercome these challenges, there is still limited research on how domain gaps\naffect cross-modal knowledge distillation. This paper provides an in-depth\nanalysis and evaluation of this issue. We first introduce the Non-Target\nDivergence Hypothesis (NTDH) to reveal the impact of domain gaps on cross-modal\nknowledge distillation. Our key finding is that domain gaps between modalities\nlead to distribution differences in non-target classes, and the smaller these\ndifferences, the better the performance of cross-modal knowledge distillation.\nSubsequently, based on Vapnik-Chervonenkis (VC) theory, we derive the upper and\nlower bounds of the approximation error for cross-modal knowledge distillation,\nthereby theoretically validating the NTDH. Finally, experiments on five\ncross-modal datasets further confirm the validity, generalisability, and\napplicability of the NTDH.\n","authors":["Yilong Chen","Zongyi Xu","Xiaoshui Huang","Shanshan Zhao","Xinqi Jiang","Xinyu Gao","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2409.02438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00359v2","updated":"2024-09-04T04:21:06Z","published":"2023-09-30T12:30:25Z","title":"CrossDF: Improving Cross-Domain Deepfake Detection with Deep Information\n Decomposition","summary":" Deepfake technology poses a significant threat to security and social trust.\nAlthough existing detection methods have shown high performance in identifying\nforgeries within datasets that use the same deepfake techniques for both\ntraining and testing, they suffer from sharp performance degradation when faced\nwith cross-dataset scenarios where unseen deepfake techniques are tested. To\naddress this challenge, we propose a Deep Information Decomposition (DID)\nframework to enhance the performance of Cross-dataset Deepfake Detection\n(CrossDF). 
Unlike most existing deepfake detection methods, our framework\nprioritizes high-level semantic features over specific visual artifacts.\nSpecifically, it adaptively decomposes facial features into deepfake-related\nand irrelevant information, only using the intrinsic deepfake-related\ninformation for real/fake discrimination. Moreover, it optimizes these two\nkinds of information to be independent with a de-correlation learning module,\nthereby enhancing the model's robustness against various irrelevant information\nchanges and generalization ability to unseen forgery methods. Our extensive\nexperimental evaluation and comparison with existing state-of-the-art detection\nmethods validate the effectiveness and superiority of the DID framework on\ncross-dataset deepfake detection.\n","authors":["Shanmin Yang","Hui Guo","Shu Hu","Bin Zhu","Ying Fu","Siwei Lyu","Xi Wu","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2310.00359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02429v1","updated":"2024-09-04T04:16:58Z","published":"2024-09-04T04:16:58Z","title":"Training-free Color-Style Disentanglement for Constrained Text-to-Image\n Synthesis","summary":" We consider the problem of independently, in a disentangled fashion,\ncontrolling the outputs of text-to-image diffusion models with color and style\nattributes of a user-supplied reference image. We present the first\ntraining-free, test-time-only method to disentangle and condition text-to-image\nmodels on color and style attributes from reference image. To realize this, we\npropose two key innovations. Our first contribution is to transform the latent\ncodes at inference time using feature transformations that make the covariance\nmatrix of current generation follow that of the reference image, helping\nmeaningfully transfer color. Next, we observe that there exists a natural\ndisentanglement between color and style in the LAB image space, which we\nexploit to transform the self-attention feature maps of the image being\ngenerated with respect to those of the reference computed from its L channel.\nBoth these operations happen purely at test time and can be done independently\nor merged. This results in a flexible method where color and style information\ncan come from the same reference image or two different sources, and a new\ngeneration can seamlessly fuse them in either scenario.\n","authors":["Aishwarya Agarwal","Srikrishna Karanam","Balaji Vasan Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2409.02429v1.pdf","comment":"16 pages, 17 figures"},{"id":"http://arxiv.org/abs/2409.02426v1","updated":"2024-09-04T04:14:02Z","published":"2024-09-04T04:14:02Z","title":"Diffusion Models Learn Low-Dimensional Distributions via Subspace\n Clustering","summary":" Recent empirical studies have demonstrated that diffusion models can\neffectively learn the image distribution and generate new samples. Remarkably,\nthese models can achieve this even with a small number of training samples\ndespite a large image dimension, circumventing the curse of dimensionality. In\nthis work, we provide theoretical insights into this phenomenon by leveraging\nkey empirical observations: (i) the low intrinsic dimensionality of image data,\n(ii) a union of manifold structure of image data, and (iii) the low-rank\nproperty of the denoising autoencoder in trained diffusion models. 
These\nobservations motivate us to assume the underlying data distribution of image\ndata as a mixture of low-rank Gaussians and to parameterize the denoising\nautoencoder as a low-rank model according to the score function of the assumed\ndistribution. With these setups, we rigorously show that optimizing the\ntraining loss of diffusion models is equivalent to solving the canonical\nsubspace clustering problem over the training samples. Based on this\nequivalence, we further show that the minimal number of samples required to\nlearn the underlying distribution scales linearly with the intrinsic dimensions\nunder the above data and model assumptions. This insight sheds light on why\ndiffusion models can break the curse of dimensionality and exhibit the phase\ntransition in learning distributions. Moreover, we empirically establish a\ncorrespondence between the subspaces and the semantic representations of image\ndata, facilitating image editing. We validate these results with corroborated\nexperimental results on both simulated distributions and image datasets.\n","authors":["Peng Wang","Huijie Zhang","Zekai Zhang","Siyi Chen","Yi Ma","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2409.02426v1.pdf","comment":"39 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.02418v1","updated":"2024-09-04T03:46:17Z","published":"2024-09-04T03:46:17Z","title":"MOSMOS: Multi-organ segmentation facilitated by medical report\n supervision","summary":" Owing to a large amount of multi-modal data in modern medical systems, such\nas medical images and reports, Medical Vision-Language Pre-training (Med-VLP)\nhas demonstrated incredible achievements in coarse-grained downstream tasks\n(i.e., medical classification, retrieval, and visual question answering).\nHowever, the problem of transferring knowledge learned from Med-VLP to\nfine-grained multi-organ segmentation tasks has barely been investigated.\nMulti-organ segmentation is challenging mainly due to the lack of large-scale\nfully annotated datasets and the wide variation in the shape and size of the\nsame organ between individuals with different diseases. In this paper, we\npropose a novel pre-training & fine-tuning framework for Multi-Organ\nSegmentation by harnessing Medical repOrt Supervision (MOSMOS). Specifically,\nwe first introduce global contrastive learning to maximally align the medical\nimage-report pairs in the pre-training stage. To remedy the granularity\ndiscrepancy, we further leverage multi-label recognition to implicitly learn\nthe semantic correspondence between image pixels and organ tags. More\nimportantly, our pre-trained models can be transferred to any segmentation\nmodel by introducing the pixel-tag attention maps. Different network settings,\ni.e., 2D U-Net and 3D UNETR, are utilized to validate the generalization. We\nhave extensively evaluated our approach using different diseases and modalities\non BTCV, AMOS, MMWHS, and BRATS datasets. Experimental results in various\nsettings demonstrate the effectiveness of our framework. 
This framework can\nserve as the foundation to facilitate future research on automatic annotation\ntasks under the supervision of medical reports.\n","authors":["Weiwei Tian","Xinyu Huang","Junlin Hou","Caiyue Ren","Longquan Jiang","Rui-Wei Zhao","Gang Jin","Yuejie Zhang","Daoying Geng"],"pdf_url":"https://arxiv.org/pdf/2409.02418v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.02415v1","updated":"2024-09-04T03:41:42Z","published":"2024-09-04T03:41:42Z","title":"Local map Construction Methods with SD map: A Novel Survey","summary":" In recent years, significant academic advancements have been made in the\nfield of autonomous vehicles, with Local maps emerging as a crucial component\nof autonomous driving technology. Local maps not only provide intricate details\nof road networks but also serve as fundamental inputs for critical tasks such\nas vehicle localization, navigation, and decision-making. Given the\ncharacteristics of SD map (Standard Definition Map), which include low cost,\nease of acquisition, and high versatility, perception methods that integrate SD\nmap as prior information have demonstrated significant potential in the field\nof Local map perception. The purpose of this paper is to provide researchers\nwith a comprehensive overview and summary of the latest advancements in the\nintegration of SD map as prior information for Local map perception methods.\nThis review begins by introducing the task definition and general pipeline of\nlocal map perception methods that incorporate SD maps as prior information,\nalong with relevant public datasets. And then it focuses on the representation\nand encoding methods of multi-source information, as well as the methods for\nfusing multi-source information. In response to this burgeoning trend, this\narticle presents a comprehensive and meticulous overview of the diverse\nresearch efforts in this particular field. Finally, the article addresses\npertinent issues and future challenges with the aim of guiding researchers in\nunderstanding the current trends and methodologies prevalent in the field.\n","authors":["Jiaqi Li","Pingfan Jia","Jiaxing Chen","Jiaxi Liu","Lei He"],"pdf_url":"https://arxiv.org/pdf/2409.02415v1.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2404.18820v4","updated":"2024-09-04T03:35:58Z","published":"2024-04-29T16:02:38Z","title":"Towards Extreme Image Compression with Latent Feature Guidance and\n Diffusion Prior","summary":" Image compression at extremely low bitrates (below 0.1 bits per pixel (bpp))\nis a significant challenge due to substantial information loss. In this work,\nwe propose a novel two-stage extreme image compression framework that exploits\nthe powerful generative capability of pre-trained diffusion models to achieve\nrealistic image reconstruction at extremely low bitrates. In the first stage,\nwe treat the latent representation of images in the diffusion space as\nguidance, employing a VAE-based compression approach to compress images and\ninitially decode the compressed information into content variables. The second\nstage leverages pre-trained stable diffusion to reconstruct images under the\nguidance of content variables. Specifically, we introduce a small control\nmodule to inject content information while keeping the stable diffusion model\nfixed to maintain its generative capability. 
Furthermore, we design a space\nalignment loss to force the content variables to align with the diffusion space\nand provide the necessary constraints for optimization. Extensive experiments\ndemonstrate that our method significantly outperforms state-of-the-art\napproaches in terms of visual performance at extremely low bitrates. The source\ncode and trained models are available at https://github.com/huai-chang/DiffEIC.\n","authors":["Zhiyuan Li","Yanhui Zhou","Hao Wei","Chenyang Ge","Jingwen Jiang"],"pdf_url":"https://arxiv.org/pdf/2404.18820v4.pdf","comment":"Accepted by IEEE TCSVT"},{"id":"http://arxiv.org/abs/2408.07341v2","updated":"2024-09-04T03:22:05Z","published":"2024-08-14T07:34:12Z","title":"Robust Semi-supervised Multimodal Medical Image Segmentation via Cross\n Modality Collaboration","summary":" Multimodal learning leverages complementary information derived from\ndifferent modalities, thereby enhancing performance in medical image\nsegmentation. However, prevailing multimodal learning methods heavily rely on\nextensive well-annotated data from various modalities to achieve accurate\nsegmentation performance. This dependence often poses a challenge in clinical\nsettings due to limited availability of such data. Moreover, the inherent\nanatomical misalignment between different imaging modalities further\ncomplicates the endeavor to enhance segmentation performance. To address this\nproblem, we propose a novel semi-supervised multimodal segmentation framework\nthat is robust to scarce labeled data and misaligned modalities. Our framework\nemploys a novel cross modality collaboration strategy to distill\nmodality-independent knowledge, which is inherently associated with each\nmodality, and integrates this information into a unified fusion layer for\nfeature amalgamation. With a channel-wise semantic consistency loss, our\nframework ensures alignment of modality-independent information from a\nfeature-wise perspective across modalities, thereby fortifying it against\nmisalignments in multimodal scenarios. Furthermore, our framework effectively\nintegrates contrastive consistent learning to regulate anatomical structures,\nfacilitating anatomical-wise prediction alignment on unlabeled data in\nsemi-supervised segmentation tasks. Our method achieves competitive performance\ncompared to other multimodal methods across three tasks: cardiac, abdominal\nmulti-organ, and thyroid-associated orbitopathy segmentations. It also\ndemonstrates outstanding robustness in scenarios involving scarce labeled data\nand misaligned modalities.\n","authors":["Xiaogen Zhou","Yiyou Sun","Min Deng","Winnie Chiu Wing Chu","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2408.07341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.20461v3","updated":"2024-09-04T03:17:22Z","published":"2024-07-29T23:40:13Z","title":"Weakly Supervised Intracranial Hemorrhage Segmentation with YOLO and an\n Uncertainty Rectified Segment Anything Model","summary":" Intracranial hemorrhage (ICH) is a life-threatening condition that requires\nrapid and accurate diagnosis to improve treatment outcomes and patient survival\nrates. Recent advancements in supervised deep learning have greatly improved\nthe analysis of medical images, but often rely on extensive datasets with\nhigh-quality annotations, which are costly, time-consuming, and require medical\nexpertise to prepare. 
To mitigate the need for large amounts of expert-prepared\nsegmentation data, we have developed a novel weakly supervised ICH segmentation\nmethod that utilizes the YOLO object detection model and an\nuncertainty-rectified Segment Anything Model (SAM). In addition, we have\nproposed a novel point prompt generator for this model to further improve\nsegmentation results with YOLO-predicted bounding box prompts. Our approach\nachieved a high accuracy of 0.933 and an AUC of 0.796 in ICH detection, along\nwith a mean Dice score of 0.629 for ICH segmentation, outperforming existing\nweakly supervised and popular supervised (UNet and Swin-UNETR) approaches.\nOverall, the proposed method provides a robust and accurate alternative to the\nmore commonly used supervised techniques for ICH quantification without\nrequiring refined segmentation ground truths during model training.\n","authors":["Pascal Spiegler","Amirhossein Rasoulian","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2407.20461v3.pdf","comment":"Manuscript was accepted at SWITCH2024. 10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.02406v1","updated":"2024-09-04T03:14:48Z","published":"2024-09-04T03:14:48Z","title":"Hadamard Row-Wise Generation Algorithm","summary":" In this paper, we introduce an efficient algorithm for generating specific\nHadamard rows, addressing the memory demands of pre-computing the entire\nmatrix. Leveraging Sylvester's recursive construction, our method generates the\nrequired $i$-th row on demand, significantly reducing computational resources.\nThe algorithm uses the Kronecker product to construct the desired row from the\nbinary representation of the index, without creating the full matrix. This\napproach is particularly useful for single-pixel imaging systems that need only\none row at a time.\n","authors":["Brayan Monroy","Jorge Bacca"],"pdf_url":"https://arxiv.org/pdf/2409.02406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16340v2","updated":"2024-09-04T03:11:10Z","published":"2024-08-29T08:23:57Z","title":"Learned Image Transmission with Hierarchical Variational Autoencoder","summary":" In this paper, we introduce an innovative hierarchical joint source-channel\ncoding (HJSCC) framework for image transmission, utilizing a hierarchical\nvariational autoencoder (VAE). Our approach leverages a combination of\nbottom-up and top-down paths at the transmitter to autoregressively generate\nmultiple hierarchical representations of the original image. These\nrepresentations are then directly mapped to channel symbols for transmission by\nthe JSCC encoder. We extend this framework to scenarios with a feedback link,\nmodeling transmission over a noisy channel as a probabilistic sampling process\nand deriving a novel generative formulation for JSCC with feedback. Compared\nwith existing approaches, our proposed HJSCC provides enhanced adaptability by\ndynamically adjusting transmission bandwidth, encoding these representations\ninto varying amounts of channel symbols. Additionally, we introduce a rate\nattention module to guide the JSCC encoder in optimizing its encoding strategy\nbased on prior information. 
Extensive experiments on images of varying\nresolutions demonstrate that our proposed model outperforms existing baselines\nin rate-distortion performance and maintains robustness against channel noise.\n","authors":["Guangyi Zhang","Hanlei Li","Yunlong Cai","Qiyu Hu","Guanding Yu","Runmin Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.16340v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00025v2","updated":"2024-09-04T03:11:02Z","published":"2024-08-16T19:22:02Z","title":"A Novel Approach to Classify Power Quality Signals Using Vision\n Transformers","summary":" With the rapid integration of electronically interfaced renewable energy\nresources and loads into smart grids, there is increasing interest in power\nquality disturbances (PQD) classification to enhance the security and\nefficiency of these grids. This paper introduces a new approach to PQD\nclassification based on the Vision Transformer (ViT) model. When a PQD occurs,\nthe proposed approach first converts the power quality signal into an image and\nthen utilizes a pre-trained ViT to accurately determine the class of the PQD.\nUnlike most previous works, which were limited to a few disturbance classes or\nsmall datasets, the proposed method is trained and tested on a large dataset\nwith 17 disturbance classes. Our experimental results show that the proposed\nViT-based approach achieves PQD classification precision and recall of 98.28%\nand 97.98%, respectively, outperforming recently proposed techniques applied to\nthe same dataset.\n","authors":["Ahmad Mohammad Saber","Alaa Selim","Mohamed M. Hammad","Amr Youssef","Deepa Kundur","Ehab El-Saadany"],"pdf_url":"https://arxiv.org/pdf/2409.00025v2.pdf","comment":"IECON 2024-50th Annual Conference of the IEEE Industrial Electronics\n Society, Chicago, U.S.A, 2024, pp. 1-6"},{"id":"http://arxiv.org/abs/2408.15461v2","updated":"2024-09-04T02:45:56Z","published":"2024-08-28T00:54:51Z","title":"Hand1000: Generating Realistic Hands from Text with Only 1,000 Images","summary":" Text-to-image generation models have achieved remarkable advancements in\nrecent years, aiming to produce realistic images from textual descriptions.\nHowever, these models often struggle with generating anatomically accurate\nrepresentations of human hands. The resulting images frequently exhibit issues\nsuch as incorrect numbers of fingers, unnatural twisting or interlacing of\nfingers, or blurred and indistinct hands. These issues stem from the inherent\ncomplexity of hand structures and the difficulty in aligning textual\ndescriptions with precise visual depictions of hands. To address these\nchallenges, we propose a novel approach named Hand1000 that enables the\ngeneration of realistic hand images with target gesture using only 1,000\ntraining samples. The training of Hand1000 is divided into three stages with\nthe first stage aiming to enhance the model's understanding of hand anatomy by\nusing a pre-trained hand gesture recognition model to extract gesture\nrepresentation. The second stage further optimizes text embedding by\nincorporating the extracted hand gesture representation, to improve alignment\nbetween the textual descriptions and the generated hand images. The third stage\nutilizes the optimized embedding to fine-tune the Stable Diffusion model to\ngenerate realistic hand images. 
In addition, we construct the first publicly\navailable dataset specifically designed for text-to-hand image generation.\nBased on the existing hand gesture recognition dataset, we adopt advanced image\ncaptioning models and LLaMA3 to generate high-quality textual descriptions\nenriched with detailed gesture information. Extensive experiments demonstrate\nthat Hand1000 significantly outperforms existing models in producing\nanatomically correct hand images while faithfully representing other details in\nthe text, such as faces, clothing, and colors.\n","authors":["Haozhuo Zhang","Bin Zhu","Yu Cao","Yanbin Hao"],"pdf_url":"https://arxiv.org/pdf/2408.15461v2.pdf","comment":"Project page https://haozhuo-zhang.github.io/Hand1000-project-page/"},{"id":"http://arxiv.org/abs/2404.17364v3","updated":"2024-09-04T02:45:32Z","published":"2024-04-26T12:27:57Z","title":"MV-VTON: Multi-View Virtual Try-On with Diffusion Models","summary":" The goal of image-based virtual try-on is to generate an image of the target\nperson naturally wearing the given clothing. However, existing methods solely\nfocus on the frontal try-on using the frontal clothing. When the views of the\nclothing and person are significantly inconsistent, particularly when the\nperson's view is non-frontal, the results are unsatisfactory. To address this\nchallenge, we introduce Multi-View Virtual Try-ON (MV-VTON), which aims to\nreconstruct the dressing results from multiple views using the given clothes.\nGiven that single-view clothes provide insufficient information for MV-VTON, we\ninstead employ two images, i.e., the frontal and back views of the clothing, to\nencompass the complete view as much as possible. Moreover, we adopt diffusion\nmodels that have demonstrated superior abilities to perform our MV-VTON. In\nparticular, we propose a view-adaptive selection method where hard-selection\nand soft-selection are applied to the global and local clothing feature\nextraction, respectively. This ensures that the clothing features are roughly\nfit to the person's view. Subsequently, we suggest joint attention blocks to\nalign and fuse clothing features with person features. Additionally, we collect\na MV-VTON dataset MVG, in which each person has multiple photos with diverse\nviews and poses. Experiments show that the proposed method not only achieves\nstate-of-the-art results on MV-VTON task using our MVG dataset, but also has\nsuperiority on frontal-view virtual try-on task using VITON-HD and DressCode\ndatasets. Codes and datasets are publicly released at\nhttps://github.com/hywang2002/MV-VTON .\n","authors":["Haoyu Wang","Zhilu Zhang","Donglin Di","Shiliang Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.17364v3.pdf","comment":"Project url: https://hywang2002.github.io/MV-VTON/"},{"id":"http://arxiv.org/abs/2409.01128v2","updated":"2024-09-04T02:40:52Z","published":"2024-09-02T10:07:24Z","title":"Diffusion-Driven Data Replay: A Novel Approach to Combat Forgetting in\n Federated Class Continual Learning","summary":" Federated Class Continual Learning (FCCL) merges the challenges of\ndistributed client learning with the need for seamless adaptation to new\nclasses without forgetting old ones. The key challenge in FCCL is catastrophic\nforgetting, an issue that has been explored to some extent in Continual\nLearning (CL). However, due to privacy preservation requirements, some\nconventional methods, such as experience replay, are not directly applicable to\nFCCL. 
Existing FCCL methods mitigate forgetting by generating historical data\nthrough federated training of GANs or data-free knowledge distillation.\nHowever, these approaches often suffer from unstable training of generators or\nlow-quality generated data, limiting their guidance for the model. To address\nthis challenge, we propose a novel method of data replay based on diffusion\nmodels. Instead of training a diffusion model, we employ a pre-trained\nconditional diffusion model to reverse-engineer each class, searching the\ncorresponding input conditions for each class within the model's input space,\nsignificantly reducing computational resources and time consumption while\nensuring effective generation. Furthermore, we enhance the classifier's domain\ngeneralization ability on generated and real data through contrastive learning,\nindirectly improving the representational capability of generated data for real\ndata. Comprehensive experiments demonstrate that our method significantly\noutperforms existing baselines. Code is available at\nhttps://github.com/jinglin-liang/DDDR.\n","authors":["Jinglin Liang","Jin Zhong","Hanlin Gu","Zhongqi Lu","Xingxing Tang","Gang Dai","Shuangping Huang","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2409.01128v2.pdf","comment":"Accepted by ECCV 2024 Oral"},{"id":"http://arxiv.org/abs/2409.02390v1","updated":"2024-09-04T02:38:52Z","published":"2024-09-04T02:38:52Z","title":"Neural Dynamics Model of Visual Decision-Making: Learning from Human\n Experts","summary":" Uncovering the fundamental neural correlates of biological intelligence,\ndeveloping mathematical models, and conducting computational simulations are\ncritical for advancing new paradigms in artificial intelligence (AI). In this\nstudy, we implemented a comprehensive visual decision-making model that spans\nfrom visual input to behavioral output, using a neural dynamics modeling\napproach. Drawing inspiration from the key components of the dorsal visual\npathway in primates, our model not only aligns closely with human behavior but\nalso reflects neural activities in primates, and achieving accuracy comparable\nto convolutional neural networks (CNNs). Moreover, magnetic resonance imaging\n(MRI) identified key neuroimaging features such as structural connections and\nfunctional connectivity that are associated with performance in perceptual\ndecision-making tasks. A neuroimaging-informed fine-tuning approach was\nintroduced and applied to the model, leading to performance improvements that\nparalleled the behavioral variations observed among subjects. Compared to\nclassical deep learning models, our model more accurately replicates the\nbehavioral performance of biological intelligence, relying on the structural\ncharacteristics of biological neural networks rather than extensive training\ndata, and demonstrating enhanced resilience to perturbation.\n","authors":["Jie Su","Fang Cai","Shu-Kuo Zhao","Xin-Yi Wang","Tian-Yi Qian","Da-Hui Wang","Bo Hong"],"pdf_url":"https://arxiv.org/pdf/2409.02390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02389v1","updated":"2024-09-04T02:37:38Z","published":"2024-09-04T02:37:38Z","title":"Multi-modal Situated Reasoning in 3D Scenes","summary":" Situation awareness is essential for understanding and reasoning about 3D\nscenes in embodied AI agents. However, existing datasets and benchmarks for\nsituated understanding are limited in data modality, diversity, scale, and task\nscope. 
To address these limitations, we propose Multi-modal Situated Question\nAnswering (MSQA), a large-scale multi-modal situated reasoning dataset,\nscalably collected leveraging 3D scene graphs and vision-language models (VLMs)\nacross a diverse range of real-world 3D scenes. MSQA includes 251K situated\nquestion-answering pairs across 9 distinct question categories, covering\ncomplex scenarios within 3D scenes. We introduce a novel interleaved\nmulti-modal input setting in our benchmark to provide text, image, and point\ncloud for situation and question description, resolving ambiguity in previous\nsingle-modality convention (e.g., text). Additionally, we devise the\nMulti-modal Situated Next-step Navigation (MSNN) benchmark to evaluate models'\nsituated reasoning for navigation. Comprehensive evaluations on MSQA and MSNN\nhighlight the limitations of existing vision-language models and underscore the\nimportance of handling multi-modal interleaved inputs and situation modeling.\nExperiments on data scaling and cross-domain transfer further demonstrate the\nefficacy of leveraging MSQA as a pre-training dataset for developing more\npowerful situated reasoning models.\n","authors":["Xiongkun Linghu","Jiangyong Huang","Xuesong Niu","Xiaojian Ma","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2409.02389v1.pdf","comment":"Project page: https://msr3d.github.io/"},{"id":"http://arxiv.org/abs/2409.02385v1","updated":"2024-09-04T02:25:10Z","published":"2024-09-04T02:25:10Z","title":"Unified Framework with Consistency across Modalities for Human Activity\n Recognition","summary":" Recognizing human activities in videos is challenging due to the\nspatio-temporal complexity and context-dependence of human interactions. Prior\nstudies often rely on single input modalities, such as RGB or skeletal data,\nlimiting their ability to exploit the complementary advantages across\nmodalities. Recent studies focus on combining these two modalities using simple\nfeature fusion techniques. However, due to the inherent disparities in\nrepresentation between these input modalities, designing a unified neural\nnetwork architecture to effectively leverage their complementary information\nremains a significant challenge. To address this, we propose a comprehensive\nmultimodal framework for robust video-based human activity recognition. Our key\ncontribution is the introduction of a novel compositional query machine, called\nCOMPUTER ($\\textbf{COMP}ositional h\\textbf{U}man-cen\\textbf{T}ric\nqu\\textbf{ER}y$ machine), a generic neural architecture that models the\ninteractions between a human of interest and its surroundings in both space and\ntime. Thanks to its versatile design, COMPUTER can be leveraged to distill\ndistinctive representations for various input modalities. Additionally, we\nintroduce a consistency loss that enforces agreement in prediction between\nmodalities, exploiting the complementary information from multimodal inputs for\nrobust human movement recognition. Through extensive experiments on action\nlocalization and group activity recognition tasks, our approach demonstrates\nsuperior performance when compared with state-of-the-art methods. 
Our code is\navailable at: https://github.com/tranxuantuyen/COMPUTER.\n","authors":["Tuyen Tran","Thao Minh Le","Hung Tran","Truyen Tran"],"pdf_url":"https://arxiv.org/pdf/2409.02385v1.pdf","comment":"Accepted to BMVC 2024"},{"id":"http://arxiv.org/abs/2407.05576v2","updated":"2024-09-04T02:20:29Z","published":"2024-07-08T03:17:10Z","title":"ORMNet: Object-centric Relationship Modeling for Egocentric Hand-object\n Segmentation","summary":" Egocentric hand-object segmentation (EgoHOS) is a promising new task aiming\nat segmenting hands and interacting objects in egocentric images. Although\nEgoHOS has the potential to enable various applications, current methods\nstruggle to achieve both high performance and end-to-end optimization\nsimultaneously. Moreover, existing approaches fail to fully leverage hand cues\nto assist the interacting-object segmentation and overlook the coupled\nrelationships between diverse interacting-object categories, resulting in\nperformance deficiencies. To address these limitations, this paper proposes a\nnovel Object-centric Relationship Modeling Network (ORMNet) to fulfill\nend-to-end and effective EgoHOS by modeling relationships between hands and\nobjects as well as objects and objects. Specifically, a Hand-Object Relation\n(HOR) module is introduced to capture the correlation between hands and\nobjects, which uses hand features to guide the network to extract more\ndistinguishing interacting-object features. Besides, we find the coupling\nrelations between diverse interacting-object categories and design the Object\nRelation Decoupling (ORD) strategy to disentangle them, emphasizing learning of\nthe interaction between hands and objects and reducing the confusion of\ninteracting-object classification. In-domain experiments show that ORMNet has\nnotably exceptional segmentation performance compared with state-of-the-art\nmethods, while out-of-domain experiments further exhibit its robust\ngeneralization capability. The project is available at\nhttps://github.com/yuggiehk/ORMNet/\n","authors":["Yuejiao Su","Yi Wang","Lap-Pui Chau"],"pdf_url":"https://arxiv.org/pdf/2407.05576v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02382v1","updated":"2024-09-04T02:18:35Z","published":"2024-09-04T02:18:35Z","title":"GGS: Generalizable Gaussian Splatting for Lane Switching in Autonomous\n Driving","summary":" We propose GGS, a Generalizable Gaussian Splatting method for Autonomous\nDriving, which can achieve realistic rendering under large viewpoint changes.\nPrevious generalizable 3D Gaussian splatting methods are limited to rendering\nnovel views that are very close to the original pair of images, which cannot\nhandle large differences in viewpoint. Especially in autonomous driving\nscenarios, images are typically collected from a single lane. The limited\ntraining perspective makes rendering images of a different lane very\nchallenging. To further improve the rendering capability of GGS under large\nviewpoint changes, we introduce a novel virtual lane generation module into\nthe GGS method to enable high-quality lane switching even without a multi-lane\ndataset. Besides, we design a diffusion loss to supervise the generation of\nvirtual lane images to further address the lack of data in the\nvirtual lanes. Finally, we also propose a depth refinement module to optimize\ndepth estimation in the GGS model. 
Extensive validation of our method, compared\nto existing approaches, demonstrates state-of-the-art performance.\n","authors":["Huasong Han","Kaixuan Zhou","Xiaoxiao Long","Yusen Wang","Chunxia Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.02382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.07076v2","updated":"2024-09-04T01:59:59Z","published":"2024-07-09T17:49:23Z","title":"MADE-for-ASD: A Multi-Atlas Deep Ensemble Network for Diagnosing Autism\n Spectrum Disorder","summary":" In response to the global need for efficient early diagnosis of Autism\nSpectrum Disorder (ASD), this paper bridges the gap between traditional,\ntime-consuming diagnostic methods and potential automated solutions. We propose\na multi-atlas deep ensemble network, MADE-for-ASD, that integrates multiple\natlases of the brain's functional magnetic resonance imaging (fMRI) data\nthrough a weighted deep ensemble network. Our approach integrates demographic\ninformation into the prediction workflow, which enhances ASD diagnosis\nperformance and offers a more holistic perspective on patient profiling. We\nexperiment with the well-known publicly available ABIDE (Autism Brain Imaging\nData Exchange) I dataset, consisting of resting state fMRI data from 17\ndifferent laboratories around the globe. Our proposed system achieves 75.20%\naccuracy on the entire dataset and 96.40% on a specific subset $-$ both\nsurpassing reported ASD diagnosis accuracy in ABIDE I fMRI studies.\nSpecifically, our model improves by 4.4 percentage points over prior works on\nthe same amount of data. The model exhibits a sensitivity of 82.90% and a\nspecificity of 69.70% on the entire dataset, and 91.00% and 99.50%,\nrespectively, on the specific subset. We leverage the F-score to pinpoint the\ntop 10 ROI in ASD diagnosis, such as precuneus and anterior\ncingulate/ventromedial. The proposed system can potentially pave the way for\nmore cost-effective, efficient and scalable strategies in ASD diagnosis. Codes\nand evaluations are publicly available at\nhttps://github.com/hasan-rakibul/MADE-for-ASD.\n","authors":["Xuehan Liu","Md Rakibul Hasan","Tom Gedeon","Md Zakir Hossain"],"pdf_url":"https://arxiv.org/pdf/2407.07076v2.pdf","comment":"Xuehan Liu and Md Rakibul Hasan contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.02376v1","updated":"2024-09-04T01:54:20Z","published":"2024-09-04T01:54:20Z","title":"Coral Model Generation from Single Images for Virtual Reality\n Applications","summary":" With the rapid development of VR technology, the demand for high-quality 3D\nmodels is increasing. Traditional methods struggle with efficiency and quality\nin large-scale customization. This paper introduces a deep-learning framework\nthat generates high-precision 3D coral models from a single image. Using the\nCoral dataset, the framework extracts geometric and texture features, performs\n3D reconstruction, and optimizes design and material blending. Advanced\noptimization and polygon count control ensure shape accuracy, detail retention,\nand flexible output for various complexities, catering to high-quality\nrendering and real-time interaction needs.The project incorporates Explainable\nAI (XAI) to transform AI-generated models into interactive \"artworks,\" best\nviewed in VR and XR. This enhances model interpretability and human-machine\ncollaboration. Real-time feedback in VR interactions displays information like\ncoral species and habitat, enriching user experience. 
The generated models\nsurpass traditional methods in detail, visual quality, and efficiency. This\nresearch offers an intelligent approach to 3D content creation for VR, lowering\nproduction barriers, and promoting widespread VR applications. Additionally,\nintegrating XAI provides new insights into AI-generated visual content and\nadvances research in 3D vision interpretability.\n","authors":["Jie Fu","Shun Fu","Mick Grierson"],"pdf_url":"https://arxiv.org/pdf/2409.02376v1.pdf","comment":"In Proceedings of Explainable AI for the Arts Workshop 2024 (XAIxArts\n 2024) arXiv:2406.14485"},{"id":"http://arxiv.org/abs/2409.02374v1","updated":"2024-09-04T01:47:01Z","published":"2024-09-04T01:47:01Z","title":"Exploring Low-Dimensional Subspaces in Diffusion Models for Controllable\n Image Editing","summary":" Recently, diffusion models have emerged as a powerful class of generative\nmodels. Despite their success, there is still limited understanding of their\nsemantic spaces. This makes it challenging to achieve precise and disentangled\nimage generation without additional training, especially in an unsupervised\nway. In this work, we improve the understanding of their semantic spaces from\nintriguing observations: among a certain range of noise levels, (1) the learned\nposterior mean predictor (PMP) in the diffusion model is locally linear, and\n(2) the singular vectors of its Jacobian lie in low-dimensional semantic\nsubspaces. We provide a solid theoretical basis to justify the linearity and\nlow-rankness in the PMP. These insights allow us to propose an unsupervised,\nsingle-step, training-free LOw-rank COntrollable image editing (LOCO Edit)\nmethod for precise local editing in diffusion models. LOCO Edit identified\nediting directions with nice properties: homogeneity, transferability,\ncomposability, and linearity. These properties of LOCO Edit benefit greatly\nfrom the low-dimensional semantic subspace. Our method can further be extended\nto unsupervised or text-supervised editing in various text-to-image diffusion\nmodels (T-LOCO Edit). Finally, extensive empirical experiments demonstrate the\neffectiveness and efficiency of LOCO Edit. The codes will be released at\nhttps://github.com/ChicyChen/LOCO-Edit.\n","authors":["Siyi Chen","Huijie Zhang","Minzhe Guo","Yifu Lu","Peng Wang","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2409.02374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02371v1","updated":"2024-09-04T01:41:09Z","published":"2024-09-04T01:41:09Z","title":"Unfolding Videos Dynamics via Taylor Expansion","summary":" Taking inspiration from physical motion, we present a new self-supervised\ndynamics learning strategy for videos: Video Time-Differentiation for Instance\nDiscrimination (ViDiDi). ViDiDi is a simple and data-efficient strategy,\nreadily applicable to existing self-supervised video representation learning\nframeworks based on instance discrimination. At its core, ViDiDi observes\ndifferent aspects of a video through various orders of temporal derivatives of\nits frame sequence. These derivatives, along with the original frames, support\nthe Taylor series expansion of the underlying continuous dynamics at discrete\ntimes, where higher-order derivatives emphasize higher-order motion features.\nViDiDi learns a single neural network that encodes a video and its temporal\nderivatives into consistent embeddings following a balanced alternating\nlearning algorithm. 
By learning consistent representations for original frames\nand derivatives, the encoder is steered to emphasize motion features over\nstatic backgrounds and uncover the hidden dynamics in original frames. Hence,\nvideo representations are better separated by dynamic features. We integrate\nViDiDi into existing instance discrimination frameworks (VICReg, BYOL, and\nSimCLR) for pretraining on UCF101 or Kinetics and test on standard benchmarks\nincluding video retrieval, action recognition, and action detection. The\nperformances are enhanced by a significant margin without the need for large\nmodels or extensive datasets.\n","authors":["Siyi Chen","Minkyu Choi","Zesen Zhao","Kuan Han","Qing Qu","Zhongming Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02368v1","updated":"2024-09-04T01:38:37Z","published":"2024-09-04T01:38:37Z","title":"Pluralistic Salient Object Detection","summary":" We introduce pluralistic salient object detection (PSOD), a novel task aimed\nat generating multiple plausible salient segmentation results for a given input\nimage. Unlike conventional SOD methods that produce a single segmentation mask\nfor salient objects, this new setting recognizes the inherent complexity of\nreal-world images, comprising multiple objects, and the ambiguity in defining\nsalient objects due to different user intentions. To study this task, we\npresent two new SOD datasets \"DUTS-MM\" and \"DUS-MQ\", along with newly designed\nevaluation metrics. DUTS-MM builds upon the DUTS dataset but enriches the\nground-truth mask annotations from three aspects which 1) improves the mask\nquality especially for boundary and fine-grained structures; 2) alleviates the\nannotation inconsistency issue; and 3) provides multiple ground-truth masks for\nimages with saliency ambiguity. DUTS-MQ consists of approximately 100K\nimage-mask pairs with human-annotated preference scores, enabling the learning\nof real human preferences in measuring mask quality. Building upon these two\ndatasets, we propose a simple yet effective pluralistic SOD baseline based on a\nMixture-of-Experts (MOE) design. Equipped with two prediction heads, it\nsimultaneously predicts multiple masks using different query prompts and\npredicts human preference scores for each mask candidate. Extensive experiments\nand analyses underscore the significance of our proposed datasets and affirm\nthe effectiveness of our PSOD framework.\n","authors":["Xuelu Feng","Yunsheng Li","Dongdong Chen","Chunming Qiao","Junsong Yuan","Lu Yuan","Gang Hua"],"pdf_url":"https://arxiv.org/pdf/2409.02368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11593v3","updated":"2024-09-04T01:25:55Z","published":"2024-08-21T12:59:42Z","title":"MCDubber: Multimodal Context-Aware Expressive Video Dubbing","summary":" Automatic Video Dubbing (AVD) aims to take the given script and generate\nspeech that aligns with lip motion and prosody expressiveness. Current AVD\nmodels mainly utilize visual information of the current sentence to enhance the\nprosody of synthesized speech. However, it is crucial to consider whether the\nprosody of the generated dubbing aligns with the multimodal context, as the\ndubbing will be combined with the original context in the final video. This\naspect has been overlooked in previous studies. 
To address this issue, we\npropose a Multimodal Context-aware video Dubbing model, termed\n\\textbf{MCDubber}, to convert the modeling object from a single sentence to a\nlonger sequence with context information to ensure the consistency of the\nglobal context prosody. MCDubber comprises three main components: (1) A context\nduration aligner aims to learn the context-aware alignment between the text and\nlip frames; (2) A context prosody predictor seeks to read the global context\nvisual sequence and predict the context-aware global energy and pitch; (3) A\ncontext acoustic decoder ultimately predicts the global context mel-spectrogram\nwith the assistance of adjacent ground-truth mel-spectrograms of the target\nsentence. Through this process, MCDubber fully considers the influence of\nmultimodal context on the prosody expressiveness of the current sentence when\ndubbing. The extracted mel-spectrogram belonging to the target sentence from\nthe output context mel-spectrograms is the final required dubbing audio.\nExtensive experiments on the Chem benchmark dataset demonstrate that our\nMCDubber significantly improves dubbing expressiveness compared to all advanced\nbaselines. The code and demos are available at\nhttps://github.com/XiaoYuanJun-zy/MCDubber.\n","authors":["Yuan Zhao","Zhenqi Jia","Rui Liu","De Hu","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11593v3.pdf","comment":"Accepted by NCMMSC2024"},{"id":"http://arxiv.org/abs/2307.10593v3","updated":"2024-09-04T01:13:40Z","published":"2023-07-20T05:15:03Z","title":"Asynchronous Blob Tracker for Event Cameras","summary":" Event-based cameras are popular for tracking fast-moving objects due to their\nhigh temporal resolution, low latency, and high dynamic range. In this paper,\nwe propose a novel algorithm for tracking event blobs using raw events\nasynchronously in real time. We introduce the concept of an event blob as a\nspatio-temporal likelihood of event occurrence where the conditional spatial\nlikelihood is blob-like. Many real-world objects such as car headlights or any\nquickly moving foreground objects generate event blob data. The proposed\nalgorithm uses a nearest neighbour classifier with a dynamic threshold criteria\nfor data association coupled with an extended Kalman filter to track the event\nblob state. Our algorithm achieves highly accurate blob tracking, velocity\nestimation, and shape estimation even under challenging lighting conditions and\nhigh-speed motions (> 11000 pixels/s). The microsecond time resolution achieved\nmeans that the filter output can be used to derive secondary information such\nas time-to-contact or range estimation, that will enable applications to\nreal-world problems such as collision avoidance in autonomous driving.\n","authors":["Ziwei Wang","Timothy Molloy","Pieter van Goor","Robert Mahony"],"pdf_url":"https://arxiv.org/pdf/2307.10593v3.pdf","comment":"18 pages, 16 figures. The manuscript was accepted on August 7, 2024,\n by IEEE Transactions on Robotics"},{"id":"http://arxiv.org/abs/2312.02078v2","updated":"2024-09-04T00:06:20Z","published":"2023-12-04T17:41:52Z","title":"From Lab to Field: Real-World Evaluation of an AI-Driven Smart Video\n Solution to Enhance Community Safety","summary":" This article adopts and evaluates an AI-enabled Smart Video Solution (SVS)\ndesigned to enhance safety in the real world. The system integrates with\nexisting infrastructure camera networks, leveraging recent advancements in AI\nfor easy adoption. 
Prioritizing privacy and ethical standards, pose based data\nis used for downstream AI tasks such as anomaly detection. Cloud-based\ninfrastructure and mobile app are deployed, enabling real-time alerts within\ncommunities. The SVS employs innovative data representation and visualization\ntechniques, such as the Occupancy Indicator, Statistical Anomaly Detection,\nBird's Eye View, and Heatmaps, to understand pedestrian behaviors and enhance\npublic safety. Evaluation of the SVS demonstrates its capacity to convert\ncomplex computer vision outputs into actionable insights for stakeholders,\ncommunity partners, law enforcement, urban planners, and social scientists.\nThis article presents a comprehensive real-world deployment and evaluation of\nthe SVS, implemented in a community college environment across 16 cameras. The\nsystem integrates AI-driven visual processing, supported by statistical\nanalysis, database management, cloud communication, and user notifications.\nAdditionally, the article evaluates the end-to-end latency from the moment an\nAI algorithm detects anomalous behavior in real-time at the camera level to the\ntime stakeholders receive a notification. The results demonstrate the system's\nrobustness, effectively managing 16 CCTV cameras with a consistent throughput\nof 16.5 frames per second (FPS) over a 21-hour period and an average end-to-end\nlatency of 26.76 seconds between anomaly detection and alert issuance.\n","authors":["Shanle Yao","Babak Rahimi Ardabili","Armin Danesh Pazho","Ghazal Alinezhad Noghre","Christopher Neff","Lauren Bourque","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2312.02078v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13206v2","updated":"2024-09-04T22:45:38Z","published":"2024-03-19T23:54:07Z","title":"Depth-guided NeRF Training via Earth Mover's Distance","summary":" Neural Radiance Fields (NeRFs) are trained to minimize the rendering loss of\npredicted viewpoints. However, the photometric loss often does not provide\nenough information to disambiguate between different possible geometries\nyielding the same image. Previous work has thus incorporated depth supervision\nduring NeRF training, leveraging dense predictions from pre-trained depth\nnetworks as pseudo-ground truth. While these depth priors are assumed to be\nperfect once filtered for noise, in practice, their accuracy is more\nchallenging to capture. This work proposes a novel approach to uncertainty in\ndepth priors for NeRF supervision. Instead of using custom-trained depth or\nuncertainty priors, we use off-the-shelf pretrained diffusion models to predict\ndepth and capture uncertainty during the denoising process. Because we know\nthat depth priors are prone to errors, we propose to supervise the ray\ntermination distance distribution with Earth Mover's Distance instead of\nenforcing the rendered depth to replicate the depth prior exactly through\nL2-loss. Our depth-guided NeRF outperforms all baselines on standard depth\nmetrics by a large margin while maintaining performance on photometric\nmeasures.\n","authors":["Anita Rau","Josiah Aklilu","F. 
Christopher Holsinger","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2403.13206v2.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2409.03114v1","updated":"2024-09-04T22:39:02Z","published":"2024-09-04T22:39:02Z","title":"Developing, Analyzing, and Evaluating Self-Drive Algorithms Using\n Drive-by-Wire Electric Vehicles","summary":" Reliable lane-following algorithms are essential for safe and effective\nautonomous driving. This project was primarily focused on developing and\nevaluating different lane-following programs to find the most reliable\nalgorithm for a Vehicle to Everything (V2X) project. The algorithms were first\ntested on a simulator and then with real vehicles equipped with a drive-by-wire\nsystem using ROS (Robot Operating System). Their performance was assessed\nthrough reliability, comfort, speed, and adaptability metrics. The results show\nthat the two most reliable approaches detect both lane lines and use\nunsupervised learning to separate them. These approaches proved to be robust in\nvarious driving scenarios, making them suitable candidates for integration into\nthe V2X project.\n","authors":["Beñat Froemming-Aldanondo","Tatiana Rastoskueva","Michael Evans","Marcial Machado","Anna Vadella","Rickey Johnson","Luis Escamilla","Milan Jostes","Devson Butani","Ryan Kaddis","Chan-Jin Chung","Joshua Siegel"],"pdf_url":"https://arxiv.org/pdf/2409.03114v1.pdf","comment":"Supported by the National Science Foundation under Grants No. 2150292\n and 2150096"},{"id":"http://arxiv.org/abs/2409.03110v1","updated":"2024-09-04T22:33:17Z","published":"2024-09-04T22:33:17Z","title":"MSTT-199: MRI Dataset for Musculoskeletal Soft Tissue Tumor Segmentation","summary":" Accurate musculoskeletal soft tissue tumor segmentation is vital for\nassessing tumor size, location, diagnosis, and response to treatment, thereby\ninfluencing patient outcomes. However, segmentation of these tumors requires\nclinical expertise, and an automated segmentation model would save valuable\ntime for both clinician and patient. Training an automatic model requires a\nlarge dataset of annotated images. In this work, we describe the collection of\nan MR imaging dataset of 199 musculoskeletal soft tissue tumors from 199\npatients. We trained segmentation models on this dataset and then benchmarked\nthem on a publicly available dataset. Our model achieved the state-of-the-art\ndice score of 0.79 out of the box without any fine tuning, which shows the\ndiversity and utility of our curated dataset. We analyzed the model predictions\nand found that its performance suffered on fibrous and vascular tumors due to\ntheir diverse anatomical location, size, and intensity heterogeneity. The code\nand models are available in the following github repository,\nhttps://github.com/Reasat/mstt\n","authors":["Tahsin Reasat","Stephen Chenard","Akhil Rekulapelli","Nicholas Chadwick","Joanna Shechtel","Katherine van Schaik","David S. Smith","Joshua Lawrenz"],"pdf_url":"https://arxiv.org/pdf/2409.03110v1.pdf","comment":"Dataset will be made publicly available after the acceptance of the\n paper"},{"id":"http://arxiv.org/abs/2409.03106v1","updated":"2024-09-04T22:09:21Z","published":"2024-09-04T22:09:21Z","title":"Spatial Diffusion for Cell Layout Generation","summary":" Generative models, such as GANs and diffusion models, have been used to\naugment training sets and boost performances in different tasks. 
We focus on\ngenerative models for cell detection instead, i.e., locating and classifying\ncells in given pathology images. One important piece of information that has been\nlargely overlooked is the spatial patterns of the cells. In this paper, we\npropose a spatial-pattern-guided generative model for cell layout generation.\nSpecifically, we propose a novel diffusion model that is guided by spatial features\nand generates realistic cell layouts. We explore different density models\nas spatial features for the diffusion model. In downstream tasks, we show that\nthe generated cell layouts can be used to guide the generation of high-quality\npathology images. Augmenting with these images can significantly boost the\nperformance of SOTA cell detection methods. The code is available at\nhttps://github.com/superlc1995/Diffusion-cell.\n","authors":["Chen Li","Xiaoling Hu","Shahira Abousamra","Meilong Xu","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2409.03106v1.pdf","comment":"12 pages, 4 figures, accepted by MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.03087v1","updated":"2024-09-04T21:22:54Z","published":"2024-09-04T21:22:54Z","title":"Coupling AI and Citizen Science in Creation of Enhanced Training Dataset\n for Medical Image Segmentation","summary":" Recent advancements in medical imaging and artificial intelligence (AI) have\ngreatly enhanced diagnostic capabilities, but the development of effective deep\nlearning (DL) models is still constrained by the lack of high-quality annotated\ndatasets. The traditional manual annotation process by medical experts is time-\nand resource-intensive, limiting the scalability of these datasets. In this\nwork, we introduce a robust and versatile framework that combines AI and\ncrowdsourcing to improve both the quality and quantity of medical image\ndatasets across different modalities. Our approach utilises a user-friendly\nonline platform that enables a diverse group of crowd annotators to label\nmedical images efficiently. By integrating the MedSAM segmentation AI with this\nplatform, we accelerate the annotation process while maintaining expert-level\nquality through an algorithm that merges crowd-labelled images. Additionally,\nwe employ pix2pixGAN, a generative AI model, to expand the training dataset\nwith synthetic images that capture realistic morphological features. These\nmethods are combined into a cohesive framework designed to produce an enhanced\ndataset, which can serve as a universal pre-processing pipeline to boost the\ntraining of any medical deep learning segmentation model. Our results\ndemonstrate that this framework significantly improves model performance,\nespecially when training data is limited.\n","authors":["Amir Syahmi","Xiangrong Lu","Yinxuan Li","Haoxuan Yao","Hanjun Jiang","Ishita Acharya","Shiyi Wang","Yang Nan","Xiaodan Xing","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2409.03087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08109v3","updated":"2024-09-04T20:54:13Z","published":"2024-03-12T22:33:08Z","title":"VANP: Learning Where to See for Navigation with Self-Supervised\n Vision-Action Pre-Training","summary":" Humans excel at efficiently navigating through crowds without collision by\nfocusing on specific visual regions relevant to navigation. However, most\nrobotic visual navigation methods rely on deep learning models pre-trained on\nvision tasks, which prioritize salient objects -- not necessarily relevant to\nnavigation and potentially misleading. 
Alternative approaches train specialized\nnavigation models from scratch, requiring significant computation. On the other\nhand, self-supervised learning has revolutionized computer vision and natural\nlanguage processing, but its application to robotic navigation remains\nunderexplored due to the difficulty of defining effective self-supervision\nsignals. Motivated by these observations, in this work, we propose a\nSelf-Supervised Vision-Action Model for Visual Navigation Pre-Training (VANP).\nInstead of detecting salient objects that are beneficial for tasks such as\nclassification or detection, VANP learns to focus only on specific visual\nregions that are relevant to the navigation task. To achieve this, VANP uses a\nhistory of visual observations, future actions, and a goal image for\nself-supervision, and embeds them using two small Transformer Encoders. Then,\nVANP maximizes the information between the embeddings by using a mutual\ninformation maximization objective function. We demonstrate that most\nVANP-extracted features match with human navigation intuition. VANP achieves\ncomparable performance to models learned end-to-end with half the training time\nand to models trained on a large-scale, fully supervised dataset, i.e., ImageNet,\nwith only 0.08% of the data.\n","authors":["Mohammad Nazeri","Junzhe Wang","Amirreza Payandeh","Xuesu Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.08109v3.pdf","comment":"Extended version of the paper accepted at IROS 2024. Code:\n https://github.com/mhnazeri/VANP"},{"id":"http://arxiv.org/abs/2409.03062v1","updated":"2024-09-04T20:23:37Z","published":"2024-09-04T20:23:37Z","title":"MobileUNETR: A Lightweight End-To-End Hybrid Vision Transformer For\n Efficient Medical Image Segmentation","summary":" Skin cancer segmentation poses a significant challenge in medical image\nanalysis. Numerous existing solutions, predominantly CNN-based, face issues\nrelated to a lack of global contextual understanding. Alternatively, some\napproaches resort to large-scale Transformer models to bridge the global\ncontextual gaps, but at the expense of model size and computational complexity.\nFinally, many Transformer based approaches rely primarily on CNN based decoders,\noverlooking the benefits of Transformer based decoding models. Recognizing\nthese limitations, we address the need for efficient, lightweight solutions by\nintroducing MobileUNETR, which aims to overcome the performance constraints\nassociated with both CNNs and Transformers while minimizing model size,\npresenting a promising stride towards efficient image segmentation. MobileUNETR\nhas 3 main features. 1) MobileUNETR comprises a lightweight hybrid\nCNN-Transformer encoder to help balance local and global contextual feature\nextraction in an efficient manner; 2) A novel hybrid decoder that\nsimultaneously utilizes low-level and global features at different resolutions\nwithin the decoding stage for accurate mask generation; 3) surpassing large and\ncomplex architectures, MobileUNETR achieves superior performance with 3 million\nparameters and a computational complexity of 1.3 GFLOP resulting in 10x and 23x\nreduction in parameters and FLOPS, respectively. Extensive experiments have\nbeen conducted to validate the effectiveness of our proposed method on four\npublicly available skin lesion segmentation datasets, including ISIC 2016, ISIC\n2017, ISIC 2018, and PH2 datasets. 
The code will be publicly available at:\nhttps://github.com/OSUPCVLab/MobileUNETR.git\n","authors":["Shehan Perera","Yunus Erzurumlu","Deepak Gulati","Alper Yilmaz"],"pdf_url":"https://arxiv.org/pdf/2409.03062v1.pdf","comment":"Accepted at ECCV 2024 - BioImage Computing Workshop (Oral)"},{"id":"http://arxiv.org/abs/2409.03061v1","updated":"2024-09-04T20:21:13Z","published":"2024-09-04T20:21:13Z","title":"Incorporating dense metric depth into neural 3D representations for view\n synthesis and relighting","summary":" Synthesizing accurate geometry and photo-realistic appearance of small scenes\nis an active area of research with compelling use cases in gaming, virtual\nreality, robotic-manipulation, autonomous driving, convenient product capture,\nand consumer-level photography. When applying scene geometry and appearance\nestimation techniques to robotics, we found that the narrow cone of possible\nviewpoints due to the limited range of robot motion and scene clutter caused\ncurrent estimation techniques to produce poor quality estimates or even fail.\nOn the other hand, in robotic applications, dense metric depth can often be\nmeasured directly using stereo and illumination can be controlled. Depth can\nprovide a good initial estimate of the object geometry to improve\nreconstruction, while multi-illumination images can facilitate relighting. In\nthis work we demonstrate a method to incorporate dense metric depth into the\ntraining of neural 3D representations and address an artifact observed while\njointly refining geometry and appearance by disambiguating between texture and\ngeometry edges. We also discuss a multi-flash stereo camera system developed to\ncapture the necessary data for our pipeline and show results on relighting and\nview synthesis with a few training views.\n","authors":["Arkadeep Narayan Chaudhury","Igor Vasiljevic","Sergey Zakharov","Vitor Guizilini","Rares Ambrus","Srinivasa Narasimhan","Christopher G. Atkeson"],"pdf_url":"https://arxiv.org/pdf/2409.03061v1.pdf","comment":"Project webpage: https://stereomfc.github.io"},{"id":"http://arxiv.org/abs/2302.01541v2","updated":"2024-09-04T19:44:26Z","published":"2023-02-03T04:34:00Z","title":"Contrastive Learning with Consistent Representations","summary":" Contrastive learning demonstrates great promise for representation learning.\nData augmentations play a critical role in contrastive learning by providing\ninformative views of the data without necessitating explicit labels.\nNonetheless, the efficacy of current methodologies heavily hinges on the\nquality of employed data augmentation (DA) functions, often chosen manually\nfrom a limited set of options. While exploiting diverse data augmentations is\nappealing, the complexities inherent in both DAs and representation learning\ncan lead to performance deterioration. Addressing this challenge and\nfacilitating the systematic incorporation of diverse data augmentations, this\npaper proposes Contrastive Learning with Consistent Representations CoCor. At\nthe heart of CoCor is a novel consistency metric termed DA consistency. This\nmetric governs the mapping of augmented input data to the representation space,\nensuring that these instances are positioned optimally in a manner consistent\nwith the applied intensity of the DA. Moreover, we propose to learn the optimal\nmapping locations as a function of DA, all while preserving a desired monotonic\nproperty relative to DA intensity. 
Experimental results demonstrate that CoCor\nnotably enhances the generalizability and transferability of learned\nrepresentations in comparison to baseline methods.\n","authors":["Zihu Wang","Yu Wang","Zhuotong Chen","Hanbin Hu","Peng Li"],"pdf_url":"https://arxiv.org/pdf/2302.01541v2.pdf","comment":"Accepted by TMLR"},{"id":"http://arxiv.org/abs/2409.03043v1","updated":"2024-09-04T19:27:56Z","published":"2024-09-04T19:27:56Z","title":"Can Your Generative Model Detect Out-of-Distribution Covariate Shift?","summary":" Detecting Out-of-Distribution~(OOD) sensory data and covariate distribution\nshift aims to identify new test examples with different high-level image\nstatistics to the captured, normal and In-Distribution (ID) set. Existing OOD\ndetection literature largely focuses on semantic shift with little-to-no\nconsensus over covariate shift. Generative models capture the ID data in an\nunsupervised manner, enabling them to effectively identify samples that deviate\nsignificantly from this learned distribution, irrespective of the downstream\ntask. In this work, we elucidate the ability of generative models to detect and\nquantify domain-specific covariate shift through extensive analyses that\ninvolves a variety of models. To this end, we conjecture that it is sufficient\nto detect most occurring sensory faults (anomalies and deviations in global\nsignals statistics) by solely modeling high-frequency signal-dependent and\nindependent details. We propose a novel method, CovariateFlow, for OOD\ndetection, specifically tailored to covariate heteroscedastic high-frequency\nimage-components using conditional Normalizing Flows (cNFs). Our results on\nCIFAR10 vs. CIFAR10-C and ImageNet200 vs. ImageNet200-C demonstrate the\neffectiveness of the method by accurately detecting OOD covariate shift. This\nwork contributes to enhancing the fidelity of imaging systems and aiding\nmachine learning models in OOD detection in the presence of covariate shift.\n","authors":["Christiaan Viviers","Amaan Valiuddin","Francisco Caetano","Lemar Abdi","Lena Filatova","Peter de With","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2409.03043v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2409.03034v1","updated":"2024-09-04T19:08:13Z","published":"2024-09-04T19:08:13Z","title":"MDNF: Multi-Diffusion-Nets for Neural Fields on Meshes","summary":" We propose a novel framework for representing neural fields on triangle\nmeshes that is multi-resolution across both spatial and frequency domains.\nInspired by the Neural Fourier Filter Bank (NFFB), our architecture decomposes\nthe spatial and frequency domains by associating finer spatial resolution\nlevels with higher frequency bands, while coarser resolutions are mapped to\nlower frequencies. To achieve geometry-aware spatial decomposition we leverage\nmultiple DiffusionNet components, each associated with a different spatial\nresolution level. Subsequently, we apply a Fourier feature mapping to encourage\nfiner resolution levels to be associated with higher frequencies. The final\nsignal is composed in a wavelet-inspired manner using a sine-activated MLP,\naggregating higher-frequency signals on top of lower-frequency ones. Our\narchitecture attains high accuracy in learning complex neural fields and is\nrobust to discontinuities, exponential scale variations of the target field,\nand mesh modification. 
We demonstrate the effectiveness of our approach through\nits application to diverse neural fields, such as synthetic RGB functions, UV\ntexture coordinates, and vertex normals, illustrating different challenges. To\nvalidate our method, we compare its performance against two alternatives,\nshowcasing the advantages of our multi-resolution architecture.\n","authors":["Avigail Cohen Rimon","Tal Shnitzer","Mirela Ben Chen"],"pdf_url":"https://arxiv.org/pdf/2409.03034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03032v1","updated":"2024-09-04T18:58:32Z","published":"2024-09-04T18:58:32Z","title":"A General Albedo Recovery Approach for Aerial Photogrammetric Images\n through Inverse Rendering","summary":" Modeling outdoor scenes for the synthetic 3D environment requires the\nrecovery of reflectance/albedo information from raw images, which is an\nill-posed problem due to the complicated unmodeled physics in this process\n(e.g., indirect lighting, volume scattering, specular reflection). The problem\nremains unsolved in a practical context. The recovered albedo can facilitate\nmodel relighting and shading, which can further enhance the realism of rendered\nmodels and the applications of digital twins. Typically, photogrammetric 3D\nmodels simply take the source images as texture materials, which inherently\nembed unwanted lighting artifacts (at the time of capture) into the texture.\nTherefore, these polluted textures are suboptimal for a synthetic environment\nto enable realistic rendering. In addition, these embedded environmental\nlightings further bring challenges to photo-consistencies across different\nimages that cause image-matching uncertainties. This paper presents a general\nimage formation model for albedo recovery from typical aerial photogrammetric\nimages under natural illuminations and derives the inverse model to resolve the\nalbedo information through inverse rendering intrinsic image decomposition. Our\napproach builds on the fact that both the sun illumination and scene geometry\nare estimable in aerial photogrammetry, thus they can provide direct inputs for\nthis ill-posed problem. This physics-based approach does not require additional\ninput other than data acquired through the typical drone-based photogrammetric\ncollection and was shown to favorably outperform existing approaches. We also\ndemonstrate that the recovered albedo image can in turn improve typical image\nprocessing tasks in photogrammetry such as feature and dense matching, edge,\nand line extraction.\n","authors":["Shuang Song","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2409.03032v1.pdf","comment":"ISPRS Journal of Photogrammetry and Remote Sensing"},{"id":"http://arxiv.org/abs/2409.03025v1","updated":"2024-09-04T18:32:39Z","published":"2024-09-04T18:32:39Z","title":"No Detail Left Behind: Revisiting Self-Retrieval for Fine-Grained Image\n Captioning","summary":" Image captioning systems are unable to generate fine-grained captions as they\nare trained on data that is either noisy (alt-text) or generic (human\nannotations). This is further exacerbated by maximum likelihood training that\nencourages generation of frequently occurring phrases. Previous works have\ntried to address this limitation by fine-tuning captioners with a\nself-retrieval (SR) reward. However, we find that SR fine-tuning has a tendency\nto reduce caption faithfulness and even hallucinate. 
In this work, we\ncircumvent this bottleneck by improving the MLE initialization of the\ncaptioning system and designing a curriculum for the SR fine-tuning process. To\nthis extent, we present (1) Visual Caption Boosting, a novel framework to\ninstill fine-grainedness in generic image captioning datasets while remaining\nanchored in human annotations; and (2) BagCurri, a carefully designed training\ncurriculum that more optimally leverages the contrastive nature of the\nself-retrieval reward. Jointly, they enable the captioner to describe\nfine-grained aspects in the image while preserving faithfulness to ground-truth\ncaptions. Our approach outperforms previous work by +8.9% on SR against 99\nrandom distractors (RD100) (Dessi et al., 2023); and +7.6% on ImageCoDe.\n Additionally, existing metrics to evaluate captioning systems fail to reward\ndiversity or evaluate a model's fine-grained understanding ability. Our third\ncontribution addresses this by proposing self-retrieval from the lens of\nevaluation. We introduce TrueMatch, a benchmark comprising bags of highly\nsimilar images that uses SR to assess the captioner's ability to capture subtle\nvisual distinctions. We evaluate and compare several state-of-the-art\nopen-source MLLMs on TrueMatch, and find that our SR approach outperforms them\nall by a significant margin (e.g. +4.8% - 7.1% over Cambrian) while having 1-2\norders of magnitude fewer parameters.\n","authors":["Manu Gaur","Darshan Singh S","Makarand Tapaswi"],"pdf_url":"https://arxiv.org/pdf/2409.03025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13965v2","updated":"2024-09-04T18:32:38Z","published":"2024-03-20T20:37:13Z","title":"ConGeo: Robust Cross-view Geo-localization across Ground View Variations","summary":" Cross-view geo-localization aims at localizing a ground-level query image by\nmatching it to its corresponding geo-referenced aerial view. In real-world\nscenarios, the task requires accommodating diverse ground images captured by\nusers with varying orientations and reduced field of views (FoVs). However,\nexisting learning pipelines are orientation-specific or FoV-specific, demanding\nseparate model training for different ground view variations. Such models\nheavily depend on the North-aligned spatial correspondence and predefined FoVs\nin the training data, compromising their robustness across different settings.\nTo tackle this challenge, we propose ConGeo, a single- and cross-view\nContrastive method for Geo-localization: it enhances robustness and consistency\nin feature representations to improve a model's invariance to orientation and\nits resilience to FoV variations, by enforcing proximity between ground view\nvariations of the same location. As a generic learning objective for cross-view\ngeo-localization, when integrated into state-of-the-art pipelines, ConGeo\nsignificantly boosts the performance of three base models on four\ngeo-localization benchmarks for diverse ground view variations and outperforms\ncompeting methods that train separate models for each ground view variation.\n","authors":["Li Mi","Chang Xu","Javiera Castillo-Navarro","Syrielle Montariol","Wen Yang","Antoine Bosselut","Devis Tuia"],"pdf_url":"https://arxiv.org/pdf/2403.13965v2.pdf","comment":"ECCV2024. 
Project page at https://eceo-epfl.github.io/ConGeo/"},{"id":"http://arxiv.org/abs/2409.03022v1","updated":"2024-09-04T18:28:10Z","published":"2024-09-04T18:28:10Z","title":"Boundless: Generating Photorealistic Synthetic Data for Object Detection\n in Urban Streetscapes","summary":" We introduce Boundless, a photo-realistic synthetic data generation system\nfor enabling highly accurate object detection in dense urban streetscapes.\nBoundless can replace massive real-world data collection and manual\nground-truth object annotation (labeling) with an automated and configurable\nprocess. Boundless is based on the Unreal Engine 5 (UE5) City Sample project\nwith improvements enabling accurate collection of 3D bounding boxes across\ndifferent lighting and scene variability conditions.\n We evaluate the performance of object detection models trained on the dataset\ngenerated by Boundless when used for inference on a real-world dataset acquired\nfrom medium-altitude cameras. We compare the performance of the\nBoundless-trained model against the CARLA-trained model and observe an\nimprovement of 7.8 mAP. The results we achieved support the premise that\nsynthetic data generation is a credible methodology for training/fine-tuning\nscalable object detection models for urban scenes.\n","authors":["Mehmet Kerem Turkcan","Ian Li","Chengbo Zang","Javad Ghaderi","Gil Zussman","Zoran Kostic"],"pdf_url":"https://arxiv.org/pdf/2409.03022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03012v1","updated":"2024-09-04T18:10:35Z","published":"2024-09-04T18:10:35Z","title":"Design and Evaluation of Camera-Centric Mobile Crowdsourcing\n Applications","summary":" The data that underlies automated methods in computer vision and machine\nlearning, such as image retrieval and fine-grained recognition, often comes\nfrom crowdsourcing. In contexts that rely on the intrinsic motivation of users,\nwe seek to understand how the application design affects a user's willingness\nto contribute and the quantity and quality of the data they capture. In this\nproject, we designed three versions of a camera-based mobile crowdsourcing\napplication, which varied in the amount of labeling effort requested of the\nuser and conducted a user study to evaluate the trade-off between the level of\nuser-contributed information requested and the quantity and quality of labeled\nimages collected. The results suggest that higher levels of user labeling do\nnot lead to reduced contribution. Users collected and annotated the most images\nusing the application version with the highest requested level of labeling with\nno decrease in user satisfaction. In preliminary experiments, the additional\nlabeled data supported increased performance on an image retrieval task.\n","authors":["Abby Stylianou","Michelle Brachman","Albatool Wazzan","Samuel Black","Richard Souvenir"],"pdf_url":"https://arxiv.org/pdf/2409.03012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02979v1","updated":"2024-09-04T17:59:51Z","published":"2024-09-04T17:59:51Z","title":"Vec2Face: Scaling Face Dataset Generation with Loosely Constrained\n Vectors","summary":" This paper studies how to synthesize face images of non-existent persons, to\ncreate a dataset that allows effective training of face recognition (FR)\nmodels. Two important goals are (1) the ability to generate a large number of\ndistinct identities (inter-class separation) with (2) a wide variation in\nappearance of each identity (intra-class variation). 
However, existing works 1)\nare typically limited in how many well-separated identities can be generated\nand 2) either neglect or use a separate editing model for attribute\naugmentation. We propose Vec2Face, a holistic model that uses only a sampled\nvector as input and can flexibly generate and control face images and their\nattributes. Composed of a feature masked autoencoder and a decoder, Vec2Face is\nsupervised by face image reconstruction and can be conveniently used in\ninference. Using vectors with low similarity among themselves as inputs,\nVec2Face generates well-separated identities. Randomly perturbing an input\nidentity vector within a small range allows Vec2Face to generate faces of the\nsame identity with robust variation in face attributes. It is also possible to\ngenerate images with designated attributes by adjusting vector values with a\ngradient descent method. Vec2Face has efficiently synthesized as many as 300K\nidentities with 15 million total images, whereas 60K is the largest number of\nidentities created in the previous works. FR models trained with the generated\nHSFace datasets, from 10k to 300k identities, achieve state-of-the-art\naccuracy, from 92% to 93.52%, on five real-world test sets. For the first time,\nour model created using a synthetic training set achieves higher accuracy than\nthe model created using a same-scale training set of real face images (on the\nCALFW test set).\n","authors":["Haiyu Wu","Jaskirat Singh","Sicong Tian","Liang Zheng","Kevin W. Bowyer"],"pdf_url":"https://arxiv.org/pdf/2409.02979v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.02864v1","updated":"2024-09-04T16:43:14Z","published":"2024-09-04T16:43:14Z","title":"Bioinformatics Retrieval Augmentation Data (BRAD) Digital Assistant","summary":" We present a prototype for a Bioinformatics Retrieval Augmentation Data\n(BRAD) digital assistant. BRAD integrates a suite of tools to handle a wide\nrange of bioinformatics tasks, from code execution to online search. We\ndemonstrate BRAD's capabilities through (1) improved question-and-answering\nwith retrieval augmented generation (RAG), (2) BRAD's ability to run and write\ncomplex software pipelines, and (3) BRAD's ability to organize and distribute\ntasks across individual and teams of agents. We use BRAD for automation of\nbioinformatics workflows, performing tasks ranging from gene enrichment and\nsearching the archive to automatic code generation and running biomarker\nidentification pipelines. BRAD is a step toward the ultimate goal to develop a\ndigital twin of laboratories driven by self-contained loops for hypothesis\ngeneration and testing of digital biology experiments.\n","authors":["Joshua Pickard","Marc Andrew Choi","Natalie Oliven","Cooper Stansbury","Jillian Cwycyshyn","Nicholas Galioto","Alex Gorodetsky","Alvaro Velasquez","Indika Rajapakse"],"pdf_url":"https://arxiv.org/pdf/2409.02864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00847v2","updated":"2024-09-04T16:39:22Z","published":"2024-09-01T21:30:14Z","title":"The Design of an LLM-powered Unstructured Analytics System","summary":" LLMs demonstrate an uncanny ability to process unstructured data, and as\nsuch, have the potential to go beyond search and run complex, semantic analyses\nat scale. We describe the design of an unstructured analytics system, Aryn, and\nthe tenets and use cases that motivate its design. 
With Aryn, users can specify\nqueries in natural language and the system automatically determines a semantic\nplan and executes it to compute an answer from a large collection of\nunstructured documents using LLMs. At the core of Aryn is Sycamore, a\ndeclarative document processing engine, built using Ray, that provides a\nreliable distributed abstraction called DocSets. Sycamore allows users to\nanalyze, enrich, and transform complex documents at scale. Aryn also comprises\nLuna, a query planner that translates natural language queries to Sycamore\nscripts, and the Aryn Partitioner, which takes raw PDFs and document images,\nand converts them to DocSets for downstream processing. Using Aryn, we\ndemonstrate a real world use case for analyzing accident reports from the\nNational Transportation Safety Board (NTSB), and discuss some of the major\nchallenges we encountered in deploying Aryn in the wild.\n","authors":["Eric Anderson","Jonathan Fritz","Austin Lee","Bohou Li","Mark Lindblad","Henry Lindeman","Alex Meyer","Parth Parmar","Tanvi Ranade","Mehul A. Shah","Benjamin Sowell","Dan Tecuci","Vinayak Thapliyal","Matt Welsh"],"pdf_url":"https://arxiv.org/pdf/2409.00847v2.pdf","comment":"6 pages, 3 figures, fixed typos"},{"id":"http://arxiv.org/abs/2409.02856v1","updated":"2024-09-04T16:29:25Z","published":"2024-09-04T16:29:25Z","title":"Building a Scalable, Effective, and Steerable Search and Ranking\n Platform","summary":" Modern e-commerce platforms offer vast product selections, making it\ndifficult for customers to find items that they like and that are relevant to\ntheir current session intent. This is why it is key for e-commerce platforms to\nhave near real-time scalable and adaptable personalized ranking and search\nsystems. While numerous methods exist in the scientific literature for building\nsuch systems, many are unsuitable for large-scale industrial use due to\ncomplexity and performance limitations. Consequently, industrial ranking\nsystems often resort to computationally efficient yet simplistic retrieval or\ncandidate generation approaches, which overlook near real-time and\nheterogeneous customer signals, which results in a less personalized and\nrelevant experience. Moreover, related customer experiences are served by\ncompletely different systems, which increases complexity, maintenance, and\ninconsistent experiences.\n In this paper, we present a personalized, adaptable near real-time ranking\nplatform that is reusable across various use cases, such as browsing and\nsearch, and that is able to cater to millions of items and customers under\nheavy load (thousands of requests per second). We employ transformer-based\nmodels through different ranking layers which can learn complex behavior\npatterns directly from customer action sequences while being able to\nincorporate temporal (e.g. in-session) and contextual information. We validate\nour system through a series of comprehensive offline and online real-world\nexperiments at a large online e-commerce platform, and we demonstrate its\nsuperiority when compared to existing systems, both in terms of customer\nexperience as well as in net revenue. 
Finally, we share the lessons learned\nfrom building a comprehensive, modern ranking platform for use in a large-scale\ne-commerce environment.\n","authors":["Marjan Celikik","Jacek Wasilewski","Ana Peleteiro Ramallo","Alexey Kurennoy","Evgeny Labzin","Danilo Ascione","Tural Gurbanov","Géraud Le Falher","Andrii Dzhoha","Ian Harris"],"pdf_url":"https://arxiv.org/pdf/2409.02856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02727v1","updated":"2024-09-04T14:01:48Z","published":"2024-09-04T14:01:48Z","title":"Pooling And Attention: What Are Effective Designs For LLm-Based\n Embedding Models?","summary":" The significant advancements of Large Language Models (LLMs) in generative\ntasks have led to a growing body of work exploring LLM-based embedding models.\nWhile these models, employing different pooling and attention strategies, have\nachieved state-of-the-art performance on public embedding benchmarks, questions\nstill arise about what constitutes an effective design for LLM-based embedding\nmodels. However, these models are often trained on different datasets, using\ndifferent LLM base models or training settings. Moreover, evaluations on public\nembedding benchmarks often fail to report statistical significance, making it\ndifficult to determine which designs truly contribute to final performance.\nThis complicates the process for practitioners seeking optimal training recipes\nfor LLM-based embedding models. In this study, we conduct a large-scale\nexperiment by training a series of LLM-based embedding models using the same\ntraining data and base model but differing in their pooling and attention\nstrategies. The results show that there is no one-size-fits-all solution: while\nbidirectional attention and an additional trainable pooling layer outperform in\ntext similarity and information retrieval tasks, they do not significantly\nsurpass simpler designs like EOS-last token pooling and default causal\nattention in clustering and classification tasks. Furthermore, we propose a new\npooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs\nof all hidden layers, rather than just the last layer, using a cross-attention\nnetwork. This method proves to be statistically superior in text similarity and\nretrieval tasks compared to existing pooling methods. Overall, this paper sheds\nlight on effective training strategies for LLM-based embedding models.\n","authors":["Yixuan Tang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2409.02727v1.pdf","comment":"https://github.com/yixuantt/PoolingAndAttn"},{"id":"http://arxiv.org/abs/2302.03883v2","updated":"2024-09-04T14:00:42Z","published":"2023-02-08T05:12:54Z","title":"Multimodal Recommender Systems: A Survey","summary":" The recommender system (RS) has been an integral toolkit of online services.\nThey are equipped with various deep learning techniques to model user\npreference based on identifier and attribute information. With the emergence of\nmultimedia services, such as short videos, news and etc., understanding these\ncontents while recommending becomes critical. Besides, multimodal features are\nalso helpful in alleviating the problem of data sparsity in RS. Thus,\nMultimodal Recommender System (MRS) has attracted much attention from both\nacademia and industry recently. In this paper, we will give a comprehensive\nsurvey of the MRS models, mainly from technical views. First, we conclude the\ngeneral procedures and major challenges for MRS. 
Then, we introduce the\nexisting MRS models according to four categories, i.e., Modality Encoder,\nFeature Interaction, Feature Enhancement and Model Optimization. Besides, to\nmake it convenient for those who want to research this field, we also summarize\nthe dataset and code resources. Finally, we discuss some promising future\ndirections of MRS and conclude this paper. To access more details of the\nsurveyed papers, such as implementation code, we open source a repository.\n","authors":["Qidong Liu","Jiaxi Hu","Yutian Xiao","Xiangyu Zhao","Jingtong Gao","Wanyu Wang","Qing Li","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2302.03883v2.pdf","comment":"accepted by CSUR"},{"id":"http://arxiv.org/abs/2409.00702v2","updated":"2024-09-04T13:19:42Z","published":"2024-09-01T12:11:48Z","title":"MARS: Matching Attribute-aware Representations for Text-based Sequential\n Recommendation","summary":" Sequential recommendation aims to predict the next item a user is likely to\nprefer based on their sequential interaction history. Recently, text-based\nsequential recommendation has emerged as a promising paradigm that uses\npre-trained language models to exploit textual item features to enhance\nperformance and facilitate knowledge transfer to unseen datasets. However,\nexisting text-based recommender models still struggle with two key challenges:\n(i) representing users and items with multiple attributes, and (ii) matching\nitems with complex user interests. To address these challenges, we propose a\nnovel model, Matching Attribute-aware Representations for Text-based Sequential\nRecommendation (MARS). MARS extracts detailed user and item representations\nthrough attribute-aware text encoding, capturing diverse user intents with\nmultiple attribute-aware representations. It then computes user-item scores via\nattribute-wise interaction matching, effectively capturing attribute-level user\npreferences. Our extensive experiments demonstrate that MARS significantly\noutperforms existing sequential models, achieving improvements of up to 24.43%\nand 29.26% in Recall@10 and NDCG@10 across five benchmark datasets. Code is\navailable at https://github.com/junieberry/MARS\n","authors":["Hyunsoo Kim","Junyoung Kim","Minjin Choi","Sunkyung Lee","Jongwuk Lee"],"pdf_url":"https://arxiv.org/pdf/2409.00702v2.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2409.02685v1","updated":"2024-09-04T13:16:55Z","published":"2024-09-04T13:16:55Z","title":"RouterRetriever: Exploring the Benefits of Routing over Multiple Expert\n Embedding Models","summary":" Information retrieval methods often rely on a single embedding model trained\non large, general-domain datasets like MSMARCO. While this approach can produce\na retriever with reasonable overall performance, models trained on\ndomain-specific data often yield better results within their respective\ndomains. While prior work in information retrieval has tackled this through\nmulti-task training, the topic of combining multiple domain-specific expert\nretrievers remains unexplored, despite its popularity in language model\ngeneration. In this work, we introduce RouterRetriever, a retrieval model that\nleverages multiple domain-specific experts along with a routing mechanism to\nselect the most appropriate expert for each query. It is lightweight and allows\neasy addition or removal of experts without additional training. 
Evaluation on\nthe BEIR benchmark demonstrates that RouterRetriever outperforms both\nMSMARCO-trained (+2.1 absolute nDCG@10) and multi-task trained (+3.2) models.\nThis is achieved by employing our routing mechanism, which surpasses other\nrouting techniques (+1.8 on average) commonly used in language modeling.\nFurthermore, the benefit generalizes well to other datasets, even in the\nabsence of a specific expert on the dataset. To our knowledge, RouterRetriever\nis the first work to demonstrate the advantages of using multiple\ndomain-specific expert embedding models with effective routing over a single,\ngeneral-purpose embedding model in retrieval tasks.\n","authors":["Hyunji Lee","Luca Soldaini","Arman Cohan","Minjoon Seo","Kyle Lo"],"pdf_url":"https://arxiv.org/pdf/2409.02685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.09979v2","updated":"2024-09-04T12:33:24Z","published":"2024-06-14T12:41:07Z","title":"HIRO: Hierarchical Information Retrieval Optimization","summary":" Retrieval-Augmented Generation (RAG) has revolutionized natural language\nprocessing by dynamically integrating external knowledge into Large Language\nModels (LLMs), addressing their limitation of static training datasets. Recent\nimplementations of RAG leverage hierarchical data structures, which organize\ndocuments at various levels of summarization and information density. This\ncomplexity, however, can cause LLMs to \"choke\" on information overload,\nnecessitating more sophisticated querying mechanisms. In this context, we\nintroduce Hierarchical Information Retrieval Optimization (HIRO), a novel\nquerying approach that employs a Depth-First Search (DFS)-based recursive\nsimilarity score calculation and branch pruning. This method uniquely minimizes\nthe context delivered to the LLM without informational loss, effectively\nmanaging the challenge of excessive data. HIRO's refined approach is validated\nby a 10.85% improvement in performance on the NarrativeQA dataset.\n","authors":["Krish Goel","Mahek Chandak"],"pdf_url":"https://arxiv.org/pdf/2406.09979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07107v4","updated":"2024-09-04T11:39:56Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. 
Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. Additionally, we explore promising directions, such as\nsearch agents, within this expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Haonan Chen","Zheng Liu","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v4.pdf","comment":"updated to version 3"},{"id":"http://arxiv.org/abs/2409.01137v2","updated":"2024-09-04T10:58:57Z","published":"2024-09-02T10:19:31Z","title":"Smart E-commerce Recommendations with Semantic AI","summary":" In e-commerce, web mining for page recommendations is widely used but often\nfails to meet user needs. To address this, we propose a novel solution\ncombining semantic web mining with BP neural networks. We process user search\nlogs to extract five key features: content priority, time spent, user feedback,\nrecommendation semantics, and input deviation. These features are then fed into\na BP neural network to classify and prioritize web pages. The prioritized pages\nare recommended to users. Using book sales pages for testing, our results\ndemonstrate that this solution can quickly and accurately identify the pages\nusers need. Our approach ensures that recommendations are more relevant and\ntailored to individual preferences, enhancing the online shopping experience.\nBy leveraging advanced semantic analysis and neural network techniques, we\nbridge the gap between user expectations and actual recommendations. This\ninnovative method not only improves accuracy but also speeds up the\nrecommendation process, making it a valuable tool for e-commerce platforms\naiming to boost user satisfaction and engagement. Additionally, our system\nability to handle large datasets and provide real-time recommendations makes it\na scalable and efficient solution for modern e-commerce challenges.\n","authors":["M. Badouch","M. Boutaounte"],"pdf_url":"https://arxiv.org/pdf/2409.01137v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2409.02599v1","updated":"2024-09-04T10:30:11Z","published":"2024-09-04T10:30:11Z","title":"A Fashion Item Recommendation Model in Hyperbolic Space","summary":" In this work, we propose a fashion item recommendation model that\nincorporates hyperbolic geometry into user and item representations. Using\nhyperbolic space, our model aims to capture implicit hierarchies among items\nbased on their visual data and users' purchase history. During training, we\napply a multi-task learning framework that considers both hyperbolic and\nEuclidean distances in the loss function. Our experiments on three data sets\nshow that our model performs better than previous models trained in Euclidean\nspace only, confirming the effectiveness of our model. 
Our ablation studies\nshow that multi-task learning plays a key role, and removing the Euclidean loss\nsubstantially deteriorates the model performance.\n","authors":["Ryotaro Shimizu","Yu Wang","Masanari Kimura","Yuki Hirakawa","Takashi Wada","Yuki Saito","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2409.02599v1.pdf","comment":"This work was presented at the CVFAD Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2409.02580v1","updated":"2024-09-04T10:03:09Z","published":"2024-09-04T10:03:09Z","title":"AlignGroup: Learning and Aligning Group Consensus with Member\n Preferences for Group Recommendation","summary":" Group activities are important behaviors in human society, providing\npersonalized recommendations for groups is referred to as the group\nrecommendation task. Existing methods can usually be categorized into two\nstrategies to infer group preferences: 1) determining group preferences by\naggregating members' personalized preferences, and 2) inferring group consensus\nby capturing group members' coherent decisions after common compromises.\nHowever, the former would suffer from the lack of group-level considerations,\nand the latter overlooks the fine-grained preferences of individual users. To\nthis end, we propose a novel group recommendation method AlignGroup, which\nfocuses on both group consensus and individual preferences of group members to\ninfer the group decision-making. Specifically, AlignGroup explores group\nconsensus through a well-designed hypergraph neural network that efficiently\nlearns intra- and inter-group relationships. Moreover, AlignGroup innovatively\nutilizes a self-supervised alignment task to capture fine-grained group\ndecision-making by aligning the group consensus with members' common\npreferences. Extensive experiments on two real-world datasets validate that our\nAlignGroup outperforms the state-of-the-art on both the group recommendation\ntask and the user recommendation task, as well as outperforms the efficiency of\nmost baselines.\n","authors":["Jinfeng Xu","Zheyu Chen","Jinze Li","Shuo Yang","Hewei Wang","Edith C. -H. Ngai"],"pdf_url":"https://arxiv.org/pdf/2409.02580v1.pdf","comment":"10 pages, accepted by CIKM 2024"},{"id":"http://arxiv.org/abs/2409.02571v1","updated":"2024-09-04T09:41:52Z","published":"2024-09-04T09:41:52Z","title":"iRangeGraph: Improvising Range-dedicated Graphs for Range-filtering\n Nearest Neighbor Search","summary":" Range-filtering approximate nearest neighbor (RFANN) search is attracting\nincreasing attention in academia and industry. Given a set of data objects,\neach being a pair of a high-dimensional vector and a numeric value, an RFANN\nquery with a vector and a numeric range as parameters returns the data object\nwhose numeric value is in the query range and whose vector is nearest to the\nquery vector. To process this query, a recent study proposes to build $O(n^2)$\ndedicated graph-based indexes for all possible query ranges to enable efficient\nprocessing on a database of $n$ objects. As storing all these indexes is\nprohibitively expensive, the study constructs compressed indexes instead, which\nreduces the memory consumption considerably. However, this incurs suboptimal\nperformance because the compression is lossy. In this study, instead of\nmaterializing a compressed index for every possible query range in preparation\nfor querying, we materialize graph-based indexes, called elemental graphs, for\na moderate number of ranges. 
We then provide an effective and efficient\nalgorithm that during querying can construct an index for any query range using\nthe elemental graphs. We prove that the time needed to construct such an index\nis low. We also cover an experimental study on real-world datasets that\nprovides evidence that the materialized elemental graphs only consume moderate\nspace and that the proposed method is capable of superior and stable query\nperformance across different query workloads.\n","authors":["Yuexuan Xu","Jianyang Gao","Yutong Gou","Cheng Long","Christian S. Jensen"],"pdf_url":"https://arxiv.org/pdf/2409.02571v1.pdf","comment":"The paper has been accepted by SIGMOD 2025"},{"id":"http://arxiv.org/abs/2404.06900v3","updated":"2024-09-04T06:55:21Z","published":"2024-04-10T10:45:30Z","title":"NFARec: A Negative Feedback-Aware Recommender Model","summary":" Graph neural network (GNN)-based models have been extensively studied for\nrecommendations, as they can extract high-order collaborative signals\naccurately which is required for high-quality recommender systems. However,\nthey neglect the valuable information gained through negative feedback in two\naspects: (1) different users might hold opposite feedback on the same item,\nwhich hampers optimal information propagation in GNNs, and (2) even when an\nitem vastly deviates from users' preferences, they might still choose it and\nprovide a negative rating. In this paper, we propose a negative feedback-aware\nrecommender model (NFARec) that maximizes the leverage of negative feedback. To\ntransfer information to multi-hop neighbors along an optimal path effectively,\nNFARec adopts a feedback-aware correlation that guides hypergraph convolutions\n(HGCs) to learn users' structural representations. Moreover, NFARec\nincorporates an auxiliary task - predicting the feedback sentiment polarity\n(i.e., positive or negative) of the next interaction - based on the Transformer\nHawkes Process. The task is beneficial for understanding users by learning the\nsentiment expressed in their previous sequential feedback patterns and\npredicting future interactions. Extensive experiments demonstrate that NFARec\noutperforms competitive baselines. Our source code and data are released at\nhttps://github.com/WangXFng/NFARec.\n","authors":["Xinfeng Wang","Fumiyo Fukumoto","Jin Cui","Yoshimi Suzuki","Dongjin Yu"],"pdf_url":"https://arxiv.org/pdf/2404.06900v3.pdf","comment":"Accepted to SIGIR 2024"},{"id":"http://arxiv.org/abs/2404.06895v3","updated":"2024-09-04T06:51:55Z","published":"2024-04-10T10:38:24Z","title":"CaDRec: Contextualized and Debiased Recommender Model","summary":" Recommender models aimed at mining users' behavioral patterns have raised\ngreat attention as one of the essential applications in daily life. Recent work\non graph neural networks (GNNs) or debiasing methods has attained remarkable\ngains. However, they still suffer from (1) over-smoothing node embeddings\ncaused by recursive convolutions with GNNs, and (2) the skewed distribution of\ninteractions due to popularity and user-individual biases. This paper proposes\na contextualized and debiased recommender model (CaDRec). To overcome the\nover-smoothing issue, we explore a novel hypergraph convolution operator that\ncan select effective neighbors during convolution by introducing both\nstructural context and sequential context. 
To tackle the skewed distribution,\nwe propose two strategies for disentangling interactions: (1) modeling\nindividual biases to learn unbiased item embeddings, and (2) incorporating item\npopularity with positional encoding. Moreover, we mathematically show that the\nimbalance of the gradients to update item embeddings exacerbates the popularity\nbias, thus adopting regularization and weighting schemes as solutions.\nExtensive experiments on four datasets demonstrate the superiority of the\nCaDRec against state-of-the-art (SOTA) methods. Our source code and data are\nreleased at https://github.com/WangXFng/CaDRec.\n","authors":["Xinfeng Wang","Fumiyo Fukumoto","Jin Cui","Yoshimi Suzuki","Jiyi Li","Dongjin Yu"],"pdf_url":"https://arxiv.org/pdf/2404.06895v3.pdf","comment":"Accepted to SIGIR 2024"},{"id":"http://arxiv.org/abs/2408.15796v2","updated":"2024-09-04T06:36:22Z","published":"2024-08-28T13:42:28Z","title":"Evaluating Named Entity Recognition Using Few-Shot Prompting with Large\n Language Models","summary":" This paper evaluates Few-Shot Prompting with Large Language Models for Named\nEntity Recognition (NER). Traditional NER systems rely on extensive labeled\ndatasets, which are costly and time-consuming to obtain. Few-Shot Prompting or\nin-context learning enables models to recognize entities with minimal examples.\nWe assess state-of-the-art models like GPT-4 in NER tasks, comparing their\nfew-shot performance to fully supervised benchmarks. Results show that while\nthere is a performance gap, large models excel in adapting to new entity types\nand domains with very limited data. We also explore the effects of prompt\nengineering, guided output format and context length on performance. This study\nunderscores Few-Shot Learning's potential to reduce the need for large labeled\ndatasets, enhancing NER scalability and accessibility.\n","authors":["Hédi Zeghidi","Ludovic Moncla"],"pdf_url":"https://arxiv.org/pdf/2408.15796v2.pdf","comment":"Github repo: https://github.com/GEODE-project/ner-llm"},{"id":"http://arxiv.org/abs/2409.02455v1","updated":"2024-09-04T05:36:00Z","published":"2024-09-04T05:36:00Z","title":"An Effective Tag Assignment Approach for Billboard Advertisement","summary":" Billboard Advertisement has gained popularity due to its significant outrage\nin return on investment. To make this advertisement approach more effective,\nthe relevant information about the product needs to be reached to the relevant\nset of people. This can be achieved if the relevant set of tags can be mapped\nto the correct slots. Formally, we call this problem the Tag Assignment Problem\nin Billboard Advertisement. Given trajectory, billboard database, and a set of\nselected billboard slots and tags, this problem asks to output a mapping of\nselected tags to the selected slots so that the influence is maximized. We\nmodel this as a variant of traditional bipartite matching called One-To-Many\nBipartite Matching (OMBM). Unlike traditional bipartite matching, a tag can be\nassigned to only one slot; in the OMBM, a tag can be assigned to multiple slots\nwhile the vice versa can not happen. We propose an iterative solution approach\nthat incrementally allocates the tags to the slots. The proposed methodology\nhas been explained with an illustrated example. A complexity analysis of the\nproposed solution approach has also been conducted. 
The experimental results on\nreal-world trajectory and billboard datasets prove our claim on the\neffectiveness and efficiency of the proposed solution.\n","authors":["Dildar Ali","Harishchandra Kumar","Suman Banerjee","Yamuna Prasad"],"pdf_url":"https://arxiv.org/pdf/2409.02455v1.pdf","comment":"This Paper has been accepted at The 25th International Web\n Information Systems Engineering Conference (WISE-2024)"},{"id":"http://arxiv.org/abs/2408.16672v3","updated":"2024-09-04T05:09:00Z","published":"2024-08-29T16:21:00Z","title":"Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction\n Retriever","summary":" Multi-vector dense models, such as ColBERT, have proven highly effective in\ninformation retrieval. ColBERT's late interaction scoring approximates the\njoint query-document attention seen in cross-encoders while maintaining\ninference efficiency closer to traditional dense retrieval models, thanks to\nits bi-encoder architecture and recent optimizations in indexing and search. In\nthis paper, we introduce a novel architecture and a training framework to\nsupport long context window and multilingual retrieval. Our new model,\nJina-ColBERT-v2, demonstrates strong performance across a range of English and\nmultilingual retrieval tasks,\n","authors":["Rohan Jha","Bo Wang","Michael Günther","Georgios Mastrapas","Saba Sturua","Isabelle Mohr","Andreas Koukounas","Mohammad Kalim Akram","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2408.16672v3.pdf","comment":"8 pages, references at pp7,8; EMNLP workshop submission"},{"id":"http://arxiv.org/abs/2409.02425v1","updated":"2024-09-04T04:12:22Z","published":"2024-09-04T04:12:22Z","title":"Deep Adaptive Interest Network: Personalized Recommendation with\n Context-Aware Learning","summary":" In personalized recommendation systems, accurately capturing users' evolving\ninterests and combining them with contextual information is a critical research\narea. This paper proposes a novel model called the Deep Adaptive Interest\nNetwork (DAIN), which dynamically models users' interests while incorporating\ncontext-aware learning mechanisms to achieve precise and adaptive personalized\nrecommendations. DAIN leverages deep learning techniques to build an adaptive\ninterest network structure that can capture users' interest changes in\nreal-time while further optimizing recommendation results by integrating\ncontextual information. Experiments conducted on several public datasets\ndemonstrate that DAIN excels in both recommendation performance and\ncomputational efficiency. This research not only provides a new solution for\npersonalized recommendation systems but also offers fresh insights into the\napplication of context-aware learning in recommendation systems.\n","authors":["Shuaishuai Huang","Haowei Yang","You Yao","Xueting Lin","Yuming Tu"],"pdf_url":"https://arxiv.org/pdf/2409.02425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02343v1","updated":"2024-09-04T00:10:36Z","published":"2024-09-04T00:10:36Z","title":"NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for\n Retrieval","summary":" $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval)\nfrom pre-trained embedding models is the predominant retrieval method for text\nand images, as well as Retrieval-Augmented Generation (RAG) pipelines. In\npractice, application developers often fine-tune the embeddings to improve\ntheir accuracy on the dataset and query workload in hand. 
Existing approaches\neither fine-tune the pre-trained model itself or, more efficiently, but at the\ncost of accuracy, train adaptor models to transform the output of the\npre-trained model. We present NUDGE, a family of novel non-parametric embedding\nfine-tuning approaches that are significantly more accurate and efficient than\nboth sets of existing approaches. NUDGE directly modifies the embeddings of\ndata records to maximize the accuracy of $k$-NN retrieval. We present a\nthorough theoretical and experimental study of NUDGE's non-parametric approach.\nWe show that even though the underlying problem is NP-Hard, constrained\nvariations can be solved efficiently. These constraints additionally ensure\nthat the changes to the embeddings are modest, avoiding large distortions to\nthe semantics learned during pre-training. In experiments across five\npre-trained models and nine standard text and image retrieval datasets, NUDGE\nruns in minutes and often improves NDCG@10 by more than 10% over existing\nfine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase\nin accuracy and runs 200x and 3x faster, respectively, over fine-tuning the\npre-trained model and training adaptors.\n","authors":["Sepanta Zeighami","Zac Wellmer","Aditya Parameswaran"],"pdf_url":"https://arxiv.org/pdf/2409.02343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02965v1","updated":"2024-09-04T02:17:32Z","published":"2024-09-04T02:17:32Z","title":"Do We Trust What They Say or What They Do? A Multimodal User Embedding\n Provides Personalized Explanations","summary":" With the rapid development of social media, the importance of analyzing\nsocial network user data has also been put on the agenda. User representation\nlearning in social media is a critical area of research, based on which we can\nconduct personalized content delivery, or detect malicious actors. Being more\ncomplicated than many other types of data, social network user data has\ninherent multimodal nature. Various multimodal approaches have been proposed to\nharness both text (i.e. post content) and relation (i.e. inter-user\ninteraction) information to learn user embeddings of higher quality. The advent\nof Graph Neural Network models enables more end-to-end integration of user text\nembeddings and user interaction graphs in social networks. However, most of\nthose approaches do not adequately elucidate which aspects of the data - text\nor graph structure information - are more helpful for predicting each specific\nuser under a particular task, putting some burden on personalized downstream\nanalysis and untrustworthy information filtering. We propose a simple yet\neffective framework called Contribution-Aware Multimodal User Embedding (CAMUE)\nfor social networks. We have demonstrated with empirical evidence, that our\napproach can provide personalized explainable predictions, automatically\nmitigating the impact of unreliable information. We also conducted case studies\nto show how reasonable our results are. We observe that for most users, graph\nstructure information is more trustworthy than text information, but there are\nsome reasonable cases where text helps more. 
Our work paves the way for more\nexplainable, reliable, and effective social media user embedding which allows\nfor better personalized content delivery.\n","authors":["Zhicheng Ren","Zhiping Xiao","Yizhou Sun"],"pdf_url":"https://arxiv.org/pdf/2409.02965v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.13989v3","updated":"2024-09-04T17:52:37Z","published":"2024-07-19T02:34:10Z","title":"Enhancing Graph Neural Networks with Limited Labeled Data by Actively\n Distilling Knowledge from Large Language Models","summary":" Graphs are pervasive in the real-world, such as social network analysis,\nbioinformatics, and knowledge graphs. Graph neural networks (GNNs) have great\nability in node classification, a fundamental task on graphs. Unfortunately,\nconventional GNNs still face challenges in scenarios with few labeled nodes,\ndespite the prevalence of few-shot node classification tasks in real-world\napplications. To address this challenge, various approaches have been proposed,\nincluding graph meta-learning, transfer learning, and methods based on Large\nLanguage Models (LLMs). However, traditional meta-learning and transfer\nlearning methods often require prior knowledge from base classes or fail to\nexploit the potential advantages of unlabeled nodes. Meanwhile, LLM-based\nmethods may overlook the zero-shot capabilities of LLMs and rely heavily on the\nquality of generated contexts. In this paper, we propose a novel approach that\nintegrates LLMs and GNNs, leveraging the zero-shot inference and reasoning\ncapabilities of LLMs and employing a Graph-LLM-based active learning paradigm\nto enhance GNNs' performance. Extensive experiments demonstrate the\neffectiveness of our model in improving node classification accuracy with\nconsiderably limited labeled data, surpassing state-of-the-art baselines by\nsignificant margins.\n","authors":["Quan Li","Tianxiang Zhao","Lingwei Chen","Junjie Xu","Suhang Wang"],"pdf_url":"https://arxiv.org/pdf/2407.13989v3.pdf","comment":"10 pages, 3 Figures"},{"id":"http://arxiv.org/abs/2407.02461v5","updated":"2024-09-04T17:48:46Z","published":"2024-07-02T17:40:06Z","title":"Decentralized Intelligence Network (DIN)","summary":" Decentralized Intelligence Network (DIN) is a theoretical framework designed\nto address challenges in AI development, particularly focusing on data\nfragmentation and siloing issues. It facilitates effective AI training within\nsovereign data networks by overcoming barriers to accessing diverse data\nsources, leveraging: 1) personal data stores to ensure data sovereignty, where\ndata remains securely within Participants' control; 2) a scalable federated\nlearning protocol implemented on a public blockchain for decentralized AI\ntraining, where only model parameter updates are shared, keeping data within\nthe personal data stores; and 3) a scalable, trustless cryptographic rewards\nmechanism on a public blockchain to incentivize participation and ensure fair\nreward distribution through a decentralized auditing protocol. This approach\nguarantees that no entity can prevent or control access to training data or\ninfluence financial benefits, as coordination and reward distribution are\nmanaged on the public blockchain with an immutable record. 
The framework\nsupports effective AI training by allowing Participants to maintain control\nover their data, benefit financially, and contribute to a decentralized,\nscalable ecosystem that leverages collective AI to develop beneficial\nalgorithms.\n","authors":["Abraham Nash"],"pdf_url":"https://arxiv.org/pdf/2407.02461v5.pdf","comment":"16 pages, 1 figure. DIN was presented by the author as a speaker at\n the Summit on Responsible Decentralized Intelligence - Future of\n Decentralization and AI, hosted by Berkeley RDI on August 6, 2024, at the\n Verizon Center, Cornell Tech Campus, Roosevelt Island, NYC"},{"id":"http://arxiv.org/abs/2409.02908v1","updated":"2024-09-04T17:48:19Z","published":"2024-09-04T17:48:19Z","title":"Masked Diffusion Models are Secretly Time-Agnostic Masked Models and\n Exploit Inaccurate Categorical Sampling","summary":" Masked diffusion models (MDMs) have emerged as a popular research topic for\ngenerative modeling of discrete data, thanks to their superior performance over\nother discrete diffusion models, and are rivaling the auto-regressive models\n(ARMs) for language modeling tasks. The recent effort in simplifying the masked\ndiffusion framework further leads to alignment with continuous-space diffusion\nmodels and more principled training and sampling recipes. In this paper,\nhowever, we reveal that both training and sampling of MDMs are theoretically\nfree from the time variable, arguably the key signature of diffusion models,\nand are instead equivalent to masked models. The connection on the sampling\naspect is drawn by our proposed first-hitting sampler (FHS). Specifically, we\nshow that the FHS is theoretically equivalent to MDMs' original generation\nprocess while significantly alleviating the time-consuming categorical sampling\nand achieving a 20$\\times$ speedup. In addition, our investigation challenges\nprevious claims that MDMs can surpass ARMs in generative perplexity. We\nidentify, for the first time, an underlying numerical issue, even with the\n32-bit floating-point precision, which results in inaccurate categorical\nsampling. We show that the numerical issue lowers the effective temperature\nboth theoretically and empirically, leading to unfair assessments of MDMs'\ngeneration results in the previous literature.\n","authors":["Kaiwen Zheng","Yongxin Chen","Hanzi Mao","Ming-Yu Liu","Jun Zhu","Qinsheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02908v1.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2402.11126v2","updated":"2024-09-04T17:46:07Z","published":"2024-02-16T23:21:40Z","title":"Kolmogorov n-Widths for Multitask Physics-Informed Machine Learning\n (PIML) Methods: Towards Robust Metrics","summary":" Physics-informed machine learning (PIML) as a means of solving partial\ndifferential equations (PDE) has garnered much attention in the Computational\nScience and Engineering (CS&E) world. This topic encompasses a broad array of\nmethods and models aimed at solving a single or a collection of PDE problems,\ncalled multitask learning. PIML is characterized by the incorporation of\nphysical laws into the training process of machine learning models in lieu of\nlarge data when solving PDE problems. Despite the overall success of this\ncollection of methods, it remains incredibly difficult to analyze, benchmark,\nand generally compare one approach to another. Using Kolmogorov n-widths as a\nmeasure of effectiveness of approximating functions, we judiciously apply this\nmetric in the comparison of various multitask PIML architectures. 
We compute\nlower accuracy bounds and analyze the model's learned basis functions on\nvarious PDE problems. This is the first objective metric for comparing\nmultitask PIML architectures and helps remove uncertainty in model validation\nfrom selective sampling and overfitting. We also identify avenues of\nimprovement for model architectures, such as the choice of activation function,\nwhich can drastically affect model generalization to \"worst-case\" scenarios,\nwhich is not observed when reporting task-specific errors. We also incorporate\nthis metric into the optimization process through regularization, which\nimproves the models' generalizability over the multitask PDE problem.\n","authors":["Michael Penwarden","Houman Owhadi","Robert M. Kirby"],"pdf_url":"https://arxiv.org/pdf/2402.11126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07703v2","updated":"2024-09-04T17:45:51Z","published":"2022-10-14T10:54:11Z","title":"Hybrid Decentralized Optimization: Leveraging Both First- and\n Zeroth-Order Optimizers for Faster Convergence","summary":" Distributed optimization is the standard way of speeding up machine learning\ntraining, and most of the research in the area focuses on distributed\nfirst-order, gradient-based methods. Yet, there are settings where some\ncomputationally-bounded nodes may not be able to implement first-order,\ngradient-based optimization, while they could still contribute to joint\noptimization tasks. In this paper, we initiate the study of hybrid\ndecentralized optimization, studying settings where nodes with zeroth-order and\nfirst-order optimization capabilities co-exist in a distributed system, and\nattempt to jointly solve an optimization task over some data distribution. We\nessentially show that, under reasonable parameter settings, such a system can\nnot only withstand noisier zeroth-order agents but can even benefit from\nintegrating such agents into the optimization process, rather than ignoring\ntheir information. At the core of our approach is a new analysis of distributed\noptimization with noisy and possibly-biased gradient estimators, which may be\nof independent interest. Our results hold for both convex and non-convex\nobjectives. Experimental results on standard optimization tasks confirm our\nanalysis, showing that hybrid first-zeroth order optimization can be practical,\neven when training deep neural networks.\n","authors":["Matin Ansaripour","Shayan Talaei","Giorgi Nadiradze","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2210.07703v2.pdf","comment":"Shayan Talaei and Matin Ansaripour contributed equally to this work"},{"id":"http://arxiv.org/abs/2409.02901v1","updated":"2024-09-04T17:44:52Z","published":"2024-09-04T17:44:52Z","title":"Topological Methods in Machine Learning: A Tutorial for Practitioners","summary":" Topological Machine Learning (TML) is an emerging field that leverages\ntechniques from algebraic topology to analyze complex data structures in ways\nthat traditional machine learning methods may not capture. This tutorial\nprovides a comprehensive introduction to two key TML techniques, persistent\nhomology and the Mapper algorithm, with an emphasis on practical applications.\nPersistent homology captures multi-scale topological features such as clusters,\nloops, and voids, while the Mapper algorithm creates an interpretable graph\nsummarizing high-dimensional data. To enhance accessibility, we adopt a\ndata-centric approach, enabling readers to gain hands-on experience applying\nthese techniques to relevant tasks. 
We provide step-by-step explanations,\nimplementations, hands-on examples, and case studies to demonstrate how these\ntools can be applied to real-world problems. The goal is to equip researchers\nand practitioners with the knowledge and resources to incorporate TML into\ntheir work, revealing insights often hidden from conventional machine learning\nmethods. The tutorial code is available at\nhttps://github.com/cakcora/TopologyForML\n","authors":["Baris Coskunuzer","Cüneyt Gürcan Akçora"],"pdf_url":"https://arxiv.org/pdf/2409.02901v1.pdf","comment":"54 pages, 35 figures"},{"id":"http://arxiv.org/abs/2409.02891v1","updated":"2024-09-04T17:31:20Z","published":"2024-09-04T17:31:20Z","title":"Regional data-driven weather modeling with a global stretched-grid","summary":" A data-driven model (DDM) suitable for regional weather forecasting\napplications is presented. The model extends the Artificial Intelligence\nForecasting System by introducing a stretched-grid architecture that dedicates\nhigher resolution over a regional area of interest and maintains a lower\nresolution elsewhere on the globe. The model is based on graph neural networks,\nwhich naturally affords arbitrary multi-resolution grid configurations.\n The model is applied to short-range weather prediction for the Nordics,\nproducing forecasts at 2.5 km spatial and 6 h temporal resolution. The model is\npre-trained on 43 years of global ERA5 data at 31 km resolution and is further\nrefined using 3.3 years of 2.5 km resolution operational analyses from the\nMetCoOp Ensemble Prediction System (MEPS). The performance of the model is\nevaluated using surface observations from measurement stations across Norway\nand is compared to short-range weather forecasts from MEPS. The DDM outperforms\nboth the control run and the ensemble mean of MEPS for 2 m temperature. The\nmodel also produces competitive precipitation and wind speed forecasts, but is\nshown to underestimate extreme events.\n","authors":["Thomas Nils Nipen","Håvard Homleid Haugen","Magnus Sikora Ingstad","Even Marius Nordhagen","Aram Farhad Shafiq Salihi","Paulina Tedesco","Ivar Ambjørn Seierstad","Jørn Kristiansen","Simon Lang","Mihai Alexe","Jesper Dramsch","Baudouin Raoult","Gert Mertes","Matthew Chantry"],"pdf_url":"https://arxiv.org/pdf/2409.02891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18322v2","updated":"2024-09-04T17:16:05Z","published":"2024-07-01T19:52:41Z","title":"The Need for Guardrails with Large Language Models in Medical\n Safety-Critical Settings: An Artificial Intelligence Application in the\n Pharmacovigilance Ecosystem","summary":" Large language models (LLMs) are useful tools with the capacity for\nperforming specific types of knowledge work at an effective scale. However, LLM\ndeployments in high-risk and safety-critical domains pose unique challenges,\nnotably the issue of ``hallucination,'' where LLMs can generate fabricated\ninformation. This is particularly concerning in settings such as drug safety,\nwhere inaccuracies could lead to patient harm. To mitigate these risks, we have\ndeveloped and demonstrated a proof of concept suite of guardrails specifically\ndesigned to mitigate certain types of hallucinations and errors for drug\nsafety, and potentially applicable to other medical safety-critical contexts.\nThese guardrails include mechanisms to detect anomalous documents to prevent\nthe ingestion of inappropriate data, identify incorrect drug names or adverse\nevent terms, and convey uncertainty in generated content. 
We integrated these\nguardrails with an LLM fine-tuned for a text-to-text task, which involves\nconverting both structured and unstructured data within adverse event reports\ninto natural language. This method was applied to translate individual case\nsafety reports, demonstrating effective application in a pharmacovigilance\nprocessing task. Our guardrail framework offers a set of tools with broad\napplicability across various domains, ensuring LLMs can be safely used in\nhigh-risk situations by eliminating the occurrence of key errors, including the\ngeneration of incorrect pharmacovigilance-related terms, thus adhering to\nstringent regulatory and quality standards in medical safety-critical\nenvironments.\n","authors":["Joe B Hakim","Jeffery L Painter","Darmendra Ramcharran","Vijay Kara","Greg Powell","Paulina Sobczak","Chiho Sato","Andrew Bate","Andrew Beam"],"pdf_url":"https://arxiv.org/pdf/2407.18322v2.pdf","comment":"27 pages, 6 figures, 4 tables and supplementary material provided"},{"id":"http://arxiv.org/abs/2409.02882v1","updated":"2024-09-04T17:07:46Z","published":"2024-09-04T17:07:46Z","title":"Benchmarking Spurious Bias in Few-Shot Image Classifiers","summary":" Few-shot image classifiers are designed to recognize and classify new data\nwith minimal supervision and limited data but often show reliance on spurious\ncorrelations between classes and spurious attributes, known as spurious bias.\nSpurious correlations commonly hold in certain samples and few-shot classifiers\ncan suffer from spurious bias induced from them. There is an absence of an\nautomatic benchmarking system to assess the robustness of few-shot classifiers\nagainst spurious bias. In this paper, we propose a systematic and rigorous\nbenchmark framework, termed FewSTAB, to fairly demonstrate and quantify varied\ndegrees of robustness of few-shot classifiers to spurious bias. FewSTAB creates\nfew-shot evaluation tasks with biased attributes so that using them for\npredictions can demonstrate poor performance. To construct these tasks, we\npropose attribute-based sample selection strategies based on a pre-trained\nvision-language model, eliminating the need for manual dataset curation. This\nallows FewSTAB to automatically benchmark spurious bias using any existing test\ndata. FewSTAB offers evaluation results in a new dimension along with a new\ndesign guideline for building robust classifiers. Moreover, it can benchmark\nspurious bias in varied degrees and enable designs for varied degrees of\nrobustness. Its effectiveness is demonstrated through experiments on ten\nfew-shot learning methods across three datasets. We hope our framework can\ninspire new designs of robust few-shot classifiers. Our code is available at\nhttps://github.com/gtzheng/FewSTAB.\n","authors":["Guangtao Zheng","Wenqian Ye","Aidong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02882v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2312.14249v2","updated":"2024-09-04T17:04:40Z","published":"2023-12-21T19:06:34Z","title":"GenoCraft: A Comprehensive, User-Friendly Web-Based Platform for\n High-Throughput Omics Data Analysis and Visualization","summary":" The surge in high-throughput omics data has reshaped the landscape of\nbiological research, underlining the need for powerful, user-friendly data\nanalysis and interpretation tools. This paper presents GenoCraft, a web-based\ncomprehensive software solution designed to handle the entire pipeline of omics\ndata processing. 
GenoCraft offers a unified platform featuring advanced\nbioinformatics tools, covering all aspects of omics data analysis. It\nencompasses a range of functionalities, such as normalization, quality control,\ndifferential analysis, network analysis, pathway analysis, and diverse\nvisualization techniques. This software makes state-of-the-art omics data\nanalysis more accessible to a wider range of users. With GenoCraft, researchers\nand data scientists have access to an array of cutting-edge bioinformatics\ntools under a user-friendly interface, making it a valuable resource for\nmanaging and analyzing large-scale omics data. The API with an interactive web\ninterface is publicly available at https://genocraft.stanford. edu/. We also\nrelease all the codes in https://github.com/futianfan/GenoCraft.\n","authors":["Yingzhou Lu","Minjie Shen","Ling Yue","Chenhao Li","Fan Meng","Xiao Wang","David Herrington","Yue Wang","Yue Zhao","Tianfan Fu","Capucine Van Rechem"],"pdf_url":"https://arxiv.org/pdf/2312.14249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02877v1","updated":"2024-09-04T17:01:02Z","published":"2024-09-04T17:01:02Z","title":"Configurable Foundation Models: Building LLMs from a Modular Perspective","summary":" Advancements in LLMs have recently unveiled challenges tied to computational\nefficiency and continual scalability due to their requirements of huge\nparameters, making the applications and evolution of these models on devices\nwith limited computation resources and scenarios requiring various abilities\nincreasingly cumbersome. Inspired by modularity within the human brain, there\nis a growing tendency to decompose LLMs into numerous functional modules,\nallowing for inference with part of modules and dynamic assembly of modules to\ntackle complex tasks, such as mixture-of-experts. To highlight the inherent\nefficiency and composability of the modular approach, we coin the term brick to\nrepresent each functional module, designating the modularized structure as\nconfigurable foundation models. In this paper, we offer a comprehensive\noverview and investigation of the construction, utilization, and limitation of\nconfigurable foundation models. We first formalize modules into emergent bricks\n- functional neuron partitions that emerge during the pre-training phase, and\ncustomized bricks - bricks constructed via additional post-training to improve\nthe capabilities and knowledge of LLMs. Based on diverse functional bricks, we\nfurther present four brick-oriented operations: retrieval and routing, merging,\nupdating, and growing. These operations allow for dynamic configuration of LLMs\nbased on instructions to handle complex tasks. To verify our perspective, we\nconduct an empirical analysis on widely-used LLMs. We find that the FFN layers\nfollow modular patterns with functional specialization of neurons and\nfunctional neuron partitions. Finally, we highlight several open issues and\ndirections for future research. 
Overall, this paper aims to offer a fresh\nmodular perspective on existing LLM research and inspire the future creation of\nmore efficient and scalable foundational models.\n","authors":["Chaojun Xiao","Zhengyan Zhang","Chenyang Song","Dazhi Jiang","Feng Yao","Xu Han","Xiaozhi Wang","Shuo Wang","Yufei Huang","Guanyu Lin","Yingfa Chen","Weilin Zhao","Yuge Tu","Zexuan Zhong","Ao Zhang","Chenglei Si","Khai Hao Moo","Chenyang Zhao","Huimin Chen","Yankai Lin","Zhiyuan Liu","Jingbo Shang","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2409.02877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17293v4","updated":"2024-09-04T16:59:27Z","published":"2023-12-28T13:59:43Z","title":"$μ$GUIDE: a framework for quantitative imaging via generalized\n uncertainty-driven inference using deep learning","summary":" This work proposes $\\mu$GUIDE: a general Bayesian framework to estimate\nposterior distributions of tissue microstructure parameters from any given\nbiophysical model or MRI signal representation, with exemplar demonstration in\ndiffusion-weighted MRI. Harnessing a new deep learning architecture for\nautomatic signal feature selection combined with simulation-based inference and\nefficient sampling of the posterior distributions, $\\mu$GUIDE bypasses the high\ncomputational and time cost of conventional Bayesian approaches and does not\nrely on acquisition constraints to define model-specific summary statistics.\nThe obtained posterior distributions allow to highlight degeneracies present in\nthe model definition and quantify the uncertainty and ambiguity of the\nestimated parameters.\n","authors":["Maëliss Jallais","Marco Palombo"],"pdf_url":"https://arxiv.org/pdf/2312.17293v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02871v1","updated":"2024-09-04T16:54:31Z","published":"2024-09-04T16:54:31Z","title":"Hybrid Imitation-Learning Motion Planner for Urban Driving","summary":" With the release of open source datasets such as nuPlan and Argoverse, the\nresearch around learning-based planners has spread a lot in the last years.\nExisting systems have shown excellent capabilities in imitating the human\ndriver behaviour, but they struggle to guarantee safe closed-loop driving.\nConversely, optimization-based planners offer greater security in short-term\nplanning scenarios. To confront this challenge, in this paper we propose a\nnovel hybrid motion planner that integrates both learning-based and\noptimization-based techniques. Initially, a multilayer perceptron (MLP)\ngenerates a human-like trajectory, which is then refined by an\noptimization-based component. This component not only minimizes tracking errors\nbut also computes a trajectory that is both kinematically feasible and\ncollision-free with obstacles and road boundaries. Our model effectively\nbalances safety and human-likeness, mitigating the trade-off inherent in these\nobjectives. We validate our approach through simulation experiments and further\ndemonstrate its efficacy by deploying it in real-world self-driving vehicles.\n","authors":["Cristian Gariboldi","Matteo Corno","Beng Jin"],"pdf_url":"https://arxiv.org/pdf/2409.02871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02869v1","updated":"2024-09-04T16:53:46Z","published":"2024-09-04T16:53:46Z","title":"Look Into the LITE in Deep Learning for Time Series Classification","summary":" Deep learning models have been shown to be a powerful solution for Time\nSeries Classification (TSC). 
State-of-the-art architectures, while producing\npromising results on the UCR and the UEA archives , present a high number of\ntrainable parameters. This can lead to long training with high CO2 emission,\npower consumption and possible increase in the number of FLoating-point\nOperation Per Second (FLOPS). In this paper, we present a new architecture for\nTSC, the Light Inception with boosTing tEchnique (LITE) with only 2.34% of the\nnumber of parameters of the state-of-the-art InceptionTime model, while\npreserving performance. This architecture, with only 9, 814 trainable\nparameters due to the usage of DepthWise Separable Convolutions (DWSC), is\nboosted by three techniques: multiplexing, custom filters, and dilated\nconvolution. The LITE architecture, trained on the UCR, is 2.78 times faster\nthan InceptionTime and consumes 2.79 times less CO2 and power. To evaluate the\nperformance of the proposed architecture on multivariate time series data, we\nadapt LITE to handle multivariate time series, we call this version LITEMV. To\nbring theory into application, we also conducted experiments using LITEMV on\nmultivariate time series representing human rehabilitation movements, showing\nthat LITEMV not only is the most efficient model but also the best performing\nfor this application on the Kimore dataset, a skeleton based human\nrehabilitation exercises dataset. Moreover, to address the interpretability of\nLITEMV, we present a study using Class Activation Maps to understand the\nclassification decision taken by the model during evaluation.\n","authors":["Ali Ismail-Fawaz","Maxime Devanne","Stefano Berretti","Jonathan Weber","Germain Forestier"],"pdf_url":"https://arxiv.org/pdf/2409.02869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08705v3","updated":"2024-09-04T16:44:57Z","published":"2023-08-16T23:42:03Z","title":"Partially Observable Multi-Agent Reinforcement Learning with Information\n Sharing","summary":" We study provable multi-agent reinforcement learning (RL) in the general\nframework of partially observable stochastic games (POSGs). To circumvent the\nknown hardness results and the use of computationally intractable oracles, we\nadvocate leveraging the potential \\emph{information-sharing} among agents, a\ncommon practice in empirical multi-agent RL, and a standard model for\nmulti-agent control systems with communications. We first establish several\ncomputational complexity results to justify the necessity of\ninformation-sharing, as well as the observability assumption that has enabled\nquasi-efficient single-agent RL with partial observations, for efficiently\nsolving POSGs. {Inspired by the inefficiency of planning in the ground-truth\nmodel,} we then propose to further \\emph{approximate} the shared common\ninformation to construct an {approximate model} of the POSG, in which planning\nan approximate \\emph{equilibrium} (in terms of solving the original POSG) can\nbe quasi-efficient, i.e., of quasi-polynomial-time, under the aforementioned\nassumptions. Furthermore, we develop a partially observable multi-agent RL\nalgorithm that is \\emph{both} statistically and computationally\nquasi-efficient. {Finally, beyond equilibrium learning, we extend our\nalgorithmic framework to finding the \\emph{team-optimal solution} in\ncooperative POSGs, i.e., decentralized partially observable Markov decision\nprocesses, a much more challenging goal. 
We establish concrete computational\nand sample complexities under several common structural assumptions of the\nmodel.} We hope our study could open up the possibilities of leveraging and\neven designing different \\emph{information structures}, a well-studied notion\nin control theory, for developing both sample- and computation-efficient\npartially observable multi-agent RL.\n","authors":["Xiangyu Liu","Kaiqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08705v3.pdf","comment":"Journal extension of the conference version at ICML 2023. Changed to\n the more general reward function form, added new results for learning in\n Dec-POMDPs, and streamlined proof outlines"},{"id":"http://arxiv.org/abs/2409.02856v1","updated":"2024-09-04T16:29:25Z","published":"2024-09-04T16:29:25Z","title":"Building a Scalable, Effective, and Steerable Search and Ranking\n Platform","summary":" Modern e-commerce platforms offer vast product selections, making it\ndifficult for customers to find items that they like and that are relevant to\ntheir current session intent. This is why it is key for e-commerce platforms to\nhave near real-time scalable and adaptable personalized ranking and search\nsystems. While numerous methods exist in the scientific literature for building\nsuch systems, many are unsuitable for large-scale industrial use due to\ncomplexity and performance limitations. Consequently, industrial ranking\nsystems often resort to computationally efficient yet simplistic retrieval or\ncandidate generation approaches, which overlook near real-time and\nheterogeneous customer signals, which results in a less personalized and\nrelevant experience. Moreover, related customer experiences are served by\ncompletely different systems, which increases complexity, maintenance, and\ninconsistent experiences.\n In this paper, we present a personalized, adaptable near real-time ranking\nplatform that is reusable across various use cases, such as browsing and\nsearch, and that is able to cater to millions of items and customers under\nheavy load (thousands of requests per second). We employ transformer-based\nmodels through different ranking layers which can learn complex behavior\npatterns directly from customer action sequences while being able to\nincorporate temporal (e.g. in-session) and contextual information. We validate\nour system through a series of comprehensive offline and online real-world\nexperiments at a large online e-commerce platform, and we demonstrate its\nsuperiority when compared to existing systems, both in terms of customer\nexperience as well as in net revenue. 
Finally, we share the lessons learned\nfrom building a comprehensive, modern ranking platform for use in a large-scale\ne-commerce environment.\n","authors":["Marjan Celikik","Jacek Wasilewski","Ana Peleteiro Ramallo","Alexey Kurennoy","Evgeny Labzin","Danilo Ascione","Tural Gurbanov","Géraud Le Falher","Andrii Dzhoha","Ian Harris"],"pdf_url":"https://arxiv.org/pdf/2409.02856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01433v2","updated":"2024-09-04T16:28:36Z","published":"2024-09-02T19:20:26Z","title":"Domain Decomposition-based coupling of Operator Inference reduced order\n models via the Schwarz alternating method","summary":" This paper presents and evaluates an approach for coupling together\nsubdomain-local reduced order models (ROMs) constructed via non-intrusive\noperator inference (OpInf) with each other and with subdomain-local full order\nmodels (FOMs), following a domain decomposition of the spatial geometry on\nwhich a given partial differential equation (PDE) is posed. Joining\nsubdomain-local models is accomplished using the overlapping Schwarz\nalternating method, a minimally-intrusive multiscale coupling technique that\nworks by transforming a monolithic problem into a sequence of subdomain-local\nproblems, which communicate through transmission boundary conditions imposed on\nthe subdomain interfaces. After formulating the overlapping Schwarz alternating\nmethod for OpInf ROMs, termed OpInf-Schwarz, we evaluate the method's accuracy\nand efficiency on several test cases involving the heat equation in two spatial\ndimensions. We demonstrate that the method is capable of coupling together\narbitrary combinations of OpInf ROMs and FOMs, and that speed-ups over a\nmonolithic FOM are possible when performing OpInf ROM coupling.\n","authors":["Ian Moore","Christopher Wentland","Anthony Gruber","Irina Tezaur"],"pdf_url":"https://arxiv.org/pdf/2409.01433v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02850v1","updated":"2024-09-04T16:20:57Z","published":"2024-09-04T16:20:57Z","title":"Oops, I Sampled it Again: Reinterpreting Confidence Intervals in\n Few-Shot Learning","summary":" The predominant method for computing confidence intervals (CI) in few-shot\nlearning (FSL) is based on sampling the tasks with replacement, i.e.\\ allowing\nthe same samples to appear in multiple tasks. This makes the CI misleading in\nthat it takes into account the randomness of the sampler but not the data\nitself. To quantify the extent of this problem, we conduct a comparative\nanalysis between CIs computed with and without replacement. These reveal a\nnotable underestimation by the predominant method. This observation calls for a\nreevaluation of how we interpret confidence intervals and the resulting\nconclusions in FSL comparative studies. Our research demonstrates that the use\nof paired tests can partially address this issue. Additionally, we explore\nmethods to further reduce the (size of the) CI by strategically sampling tasks\nof a specific size. 
We also introduce a new optimized benchmark, which can be\naccessed at https://github.com/RafLaf/FSL-benchmark-again\n","authors":["Raphael Lafargue","Luke Smith","Franck Vermet","Mathias Löwe","Ian Reid","Vincent Gripon","Jack Valmadre"],"pdf_url":"https://arxiv.org/pdf/2409.02850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02842v1","updated":"2024-09-04T16:14:14Z","published":"2024-09-04T16:14:14Z","title":"SNNAX -- Spiking Neural Networks in JAX","summary":" Spiking Neural Networks (SNNs) simulators are essential tools to prototype\nbiologically inspired models and neuromorphic hardware architectures and\npredict their performance. For such a tool, ease of use and flexibility are\ncritical, but so is simulation speed especially given the complexity inherent\nto simulating SNN. Here, we present SNNAX, a JAX-based framework for simulating\nand training such models with PyTorch-like intuitiveness and JAX-like execution\nspeed. SNNAX models are easily extended and customized to fit the desired model\nspecifications and target neuromorphic hardware. Additionally, SNNAX offers key\nfeatures for optimizing the training and deployment of SNNs such as flexible\nautomatic differentiation and just-in-time compilation. We evaluate and compare\nSNNAX to other commonly used machine learning (ML) frameworks used for\nprogramming SNNs. We provide key performance metrics, best practices,\ndocumented examples for simulating SNNs in SNNAX, and implement several\nbenchmarks used in the literature.\n","authors":["Jamie Lohoff","Jan Finkbeiner","Emre Neftci"],"pdf_url":"https://arxiv.org/pdf/2409.02842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08763v4","updated":"2024-09-04T16:13:18Z","published":"2024-03-13T17:58:57Z","title":"Simple and Scalable Strategies to Continually Pre-train Large Language\n Models","summary":" Large language models (LLMs) are routinely pre-trained on billions of tokens,\nonly to start the process over again once new data becomes available. A much\nmore efficient solution is to continually pre-train these models, saving\nsignificant compute compared to re-training. However, the distribution shift\ninduced by new data typically results in degraded performance on previous data\nor poor adaptation to the new data. In this work, we show that a simple and\nscalable combination of learning rate (LR) re-warming, LR re-decaying, and\nreplay of previous data is sufficient to match the performance of fully\nre-training from scratch on all available data, as measured by the final loss\nand the average score on several language model (LM) evaluation benchmarks.\nSpecifically, we show this for a weak but realistic distribution shift between\ntwo commonly used LLM pre-training datasets (English$\\rightarrow$English) and a\nstronger distribution shift (English$\\rightarrow$German) at the $405$M\nparameter model scale with large dataset sizes (hundreds of billions of\ntokens). Selecting the weak but realistic shift for larger-scale experiments,\nwe also find that our continual learning strategies match the re-training\nbaseline for a 10B parameter LLM. 
Our results demonstrate that LLMs can be\nsuccessfully updated via simple and scalable continual learning strategies,\nmatching the re-training baseline using only a fraction of the compute.\nFinally, inspired by previous work, we propose alternatives to the cosine\nlearning rate schedule that help circumvent forgetting induced by LR re-warming\nand that are not bound to a fixed token budget.\n","authors":["Adam Ibrahim","Benjamin Thérien","Kshitij Gupta","Mats L. Richter","Quentin Anthony","Timothée Lesort","Eugene Belilovsky","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2403.08763v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02836v1","updated":"2024-09-04T16:02:30Z","published":"2024-09-04T16:02:30Z","title":"Exploring Sentiment Dynamics and Predictive Behaviors in Cryptocurrency\n Discussions by Few-Shot Learning with Large Language Models","summary":" This study performs analysis of Predictive statements, Hope speech, and\nRegret Detection behaviors within cryptocurrency-related discussions,\nleveraging advanced natural language processing techniques. We introduce a\nnovel classification scheme named \"Prediction statements,\" categorizing\ncomments into Predictive Incremental, Predictive Decremental, Predictive\nNeutral, or Non-Predictive categories. Employing GPT-4o, a cutting-edge large\nlanguage model, we explore sentiment dynamics across five prominent\ncryptocurrencies: Cardano, Binance, Matic, Fantom, and Ripple. Our analysis\nreveals distinct patterns in predictive sentiments, with Matic demonstrating a\nnotably higher propensity for optimistic predictions. Additionally, we\ninvestigate hope and regret sentiments, uncovering nuanced interplay between\nthese emotions and predictive behaviors. Despite encountering limitations\nrelated to data volume and resource availability, our study reports valuable\ndiscoveries concerning investor behavior and sentiment trends within the\ncryptocurrency market, informing strategic decision-making and future research\nendeavors.\n","authors":["Moein Shahiki Tash","Zahra Ahani","Mohim Tash","Olga Kolesnikova","Grigori Sidorov"],"pdf_url":"https://arxiv.org/pdf/2409.02836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20407v3","updated":"2024-09-04T15:57:00Z","published":"2024-05-30T18:25:19Z","title":"Convolutional L2LFlows: Generating Accurate Showers in Highly Granular\n Calorimeters Using Convolutional Normalizing Flows","summary":" In the quest to build generative surrogate models as computationally\nefficient alternatives to rule-based simulations, the quality of the generated\nsamples remains a crucial frontier. So far, normalizing flows have been among\nthe models with the best fidelity. However, as the latent space in such models\nis required to have the same dimensionality as the data space, scaling up\nnormalizing flows to high dimensional datasets is not straightforward. The\nprior L2LFlows approach successfully used a series of separate normalizing\nflows and sequence of conditioning steps to circumvent this problem. In this\nwork, we extend L2LFlows to simulate showers with a 9-times larger profile in\nthe lateral direction. 
To achieve this, we introduce convolutional layers and\nU-Net-type connections, move from masked autoregressive flows to coupling\nlayers, and demonstrate the successful modelling of showers in the ILD\nElectromagnetic Calorimeter as well as Dataset 3 from the public CaloChallenge\ndataset.\n","authors":["Thorsten Buss","Frank Gaede","Gregor Kasieczka","Claudius Krause","David Shih"],"pdf_url":"https://arxiv.org/pdf/2405.20407v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00717v2","updated":"2024-09-04T15:50:40Z","published":"2024-09-01T13:14:41Z","title":"Multi-Agent Reinforcement Learning from Human Feedback: Data Coverage\n and Algorithmic Techniques","summary":" We initiate the study of Multi-Agent Reinforcement Learning from Human\nFeedback (MARLHF), exploring both theoretical foundations and empirical\nvalidations. We define the task as identifying Nash equilibrium from a\npreference-only offline dataset in general-sum games, a problem marked by the\nchallenge of sparse feedback signals. Our theory establishes the upper\ncomplexity bounds for Nash Equilibrium in effective MARLHF, demonstrating that\nsingle-policy coverage is inadequate and highlighting the importance of\nunilateral dataset coverage. These theoretical insights are verified through\ncomprehensive experiments. To enhance the practical performance, we further\nintroduce two algorithmic techniques. (1) We propose a Mean Squared Error (MSE)\nregularization along the time axis to achieve a more uniform reward\ndistribution and improve reward learning outcomes. (2) We utilize imitation\nlearning to approximate the reference policy, ensuring stability and\neffectiveness in training. Our findings underscore the multifaceted approach\nrequired for MARLHF, paving the way for effective preference-based multi-agent\nsystems.\n","authors":["Natalia Zhang","Xinqi Wang","Qiwen Cui","Runlong Zhou","Sham M. Kakade","Simon S. Du"],"pdf_url":"https://arxiv.org/pdf/2409.00717v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04346v2","updated":"2024-09-04T15:48:40Z","published":"2024-05-07T14:23:22Z","title":"Revisiting Character-level Adversarial Attacks for Language Models","summary":" Adversarial attacks in Natural Language Processing apply perturbations in the\ncharacter or token levels. Token-level attacks, gaining prominence for their\nuse of gradient-based methods, are susceptible to altering sentence semantics,\nleading to invalid adversarial examples. While character-level attacks easily\nmaintain semantics, they have received less attention as they cannot easily\nadopt popular gradient-based methods, and are thought to be easy to defend.\nChallenging these beliefs, we introduce Charmer, an efficient query-based\nadversarial attack capable of achieving high attack success rate (ASR) while\ngenerating highly similar adversarial examples. Our method successfully targets\nboth small (BERT) and large (Llama 2) models. Specifically, on BERT with SST-2,\nCharmer improves the ASR in 4.84% points and the USE similarity in 8% points\nwith respect to the previous art. Our implementation is available in\nhttps://github.com/LIONS-EPFL/Charmer.\n","authors":["Elias Abad Rocamora","Yongtao Wu","Fanghui Liu","Grigorios G. 
Chrysos","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2405.04346v2.pdf","comment":"Accepted in ICML 2024"},{"id":"http://arxiv.org/abs/2409.02817v1","updated":"2024-09-04T15:35:18Z","published":"2024-09-04T15:35:18Z","title":"Obsidian: Cooperative State-Space Exploration for Performant Inference\n on Secure ML Accelerators","summary":" Trusted execution environments (TEEs) for machine learning accelerators are\nindispensable in secure and efficient ML inference. Optimizing workloads\nthrough state-space exploration for the accelerator architectures improves\nperformance and energy consumption. However, such explorations are expensive\nand slow due to the large search space. Current research has to use fast\nanalytical models that forego critical hardware details and cross-layer\nopportunities unique to the hardware security primitives. While cycle-accurate\nmodels can theoretically reach better designs, their high runtime cost\nrestricts them to a smaller state space.\n We present Obsidian, an optimization framework for finding the optimal\nmapping from ML kernels to a secure ML accelerator. Obsidian addresses the\nabove challenge by exploring the state space using analytical and\ncycle-accurate models cooperatively. The two main exploration components\ninclude: (1) A secure accelerator analytical model, that includes the effect of\nsecure hardware while traversing the large mapping state space and produce the\nbest m model mappings; (2) A compiler profiling step on a cycle-accurate model,\nthat captures runtime bottlenecks to further improve execution runtime, energy\nand resource utilization and find the optimal model mapping.\n We compare our results to a baseline secure accelerator, comprising of the\nstate-of-the-art security schemes obtained from guardnn [ 33 ] and sesame [11].\nThe analytical model reduces the inference latency by 20.5% for a cloud and\n8.4% for an edge deployment with an energy improvement of 24% and 19%\nrespectively. The cycle-accurate model, further reduces the latency by 9.1% for\na cloud and 12.2% for an edge with an energy improvement of 13.8% and 13.1%.\n","authors":["Sarbartha Banerjee","Shijia Wei","Prakash Ramrakhyani","Mohit Tiwari"],"pdf_url":"https://arxiv.org/pdf/2409.02817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02802v1","updated":"2024-09-04T15:22:08Z","published":"2024-09-04T15:22:08Z","title":"Boosting Certificate Robustness for Time Series Classification with\n Efficient Self-Ensemble","summary":" Recently, the issue of adversarial robustness in the time series domain has\ngarnered significant attention. However, the available defense mechanisms\nremain limited, with adversarial training being the predominant approach,\nthough it does not provide theoretical guarantees. Randomized Smoothing has\nemerged as a standout method due to its ability to certify a provable lower\nbound on robustness radius under $\\ell_p$-ball attacks. Recognizing its\nsuccess, research in the time series domain has started focusing on these\naspects. However, existing research predominantly focuses on time series\nforecasting, or under the non-$\\ell_p$ robustness in statistic feature\naugmentation for time series classification~(TSC). Our review found that\nRandomized Smoothing performs modestly in TSC, struggling to provide effective\nassurances on datasets with poor robustness. 
Therefore, we propose a\nself-ensemble method to enhance the lower bound of the probability confidence\nof predicted labels by reducing the variance of classification margins, thereby\ncertifying a larger radius. This approach also addresses the computational\noverhead issue of Deep Ensemble~(DE) while remaining competitive and, in some\ncases, outperforming it in terms of robustness. Both theoretical analysis and\nexperimental results validate the effectiveness of our method, demonstrating\nsuperior performance in robustness testing compared to baseline approaches.\n","authors":["Chang Dong","Zhengyang Li","Liangwei Zheng","Weitong Chen","Wei Emma Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02802v1.pdf","comment":"6 figures, 4 tables, 10 pages"},{"id":"http://arxiv.org/abs/2405.01704v2","updated":"2024-09-04T15:16:46Z","published":"2024-05-02T20:03:13Z","title":"Privacy-aware Berrut Approximated Coded Computing for Federated Learning","summary":" Federated Learning (FL) is an interesting strategy that enables the\ncollaborative training of an AI model among different data owners without\nrevealing their private datasets. Even so, FL has some privacy vulnerabilities\nthat have been tried to be overcome by applying some techniques like\nDifferential Privacy (DP), Homomorphic Encryption (HE), or Secure Multi-Party\nComputation (SMPC). However, these techniques have some important drawbacks\nthat might narrow their range of application: problems to work with non-linear\nfunctions and to operate large matrix multiplications and high communication\nand computational costs to manage semi-honest nodes. In this context, we\npropose a solution to guarantee privacy in FL schemes that simultaneously\nsolves the previously mentioned problems. Our proposal is based on the Berrut\nApproximated Coded Computing, a technique from the Coded Distributed Computing\nparadigm, adapted to a Secret Sharing configuration, to provide input privacy\nto FL in a scalable way. It can be applied for computing non-linear functions\nand treats the special case of distributed matrix multiplication, a key\nprimitive at the core of many automated learning tasks. Because of these\ncharacteristics, it could be applied in a wide range of FL scenarios, since it\nis independent of the machine learning models or aggregation algorithms used in\nthe FL scheme. We provide analysis of the achieved privacy and complexity of\nour solution and, due to the extensive numerical results performed, a good\ntrade-off between privacy and precision can be observed.\n","authors":["Xavier Martínez Luaña","Rebeca P. Díaz Redondo","Manuel Fernández Veiga"],"pdf_url":"https://arxiv.org/pdf/2405.01704v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15950v2","updated":"2024-09-04T15:08:49Z","published":"2024-05-24T21:34:16Z","title":"A Systematic Bias of Machine Learning Regression Models and Its\n Correction: an Application to Imaging-based Brain Age Prediction","summary":" Machine learning models for continuous outcomes often yield systematically\nbiased predictions, particularly for values that largely deviate from the mean.\nSpecifically, predictions for large-valued outcomes tend to be negatively\nbiased (underestimating actual values), while those for small-valued outcomes\nare positively biased (overestimating actual values). We refer to this linear\ncentral tendency warped bias as the \"systematic bias of machine learning\nregression\". 
In this paper, we first demonstrate that this systematic\nprediction bias persists across various machine learning regression models, and\nthen delve into its theoretical underpinnings. To address this issue, we\npropose a general constrained optimization approach designed to correct this\nbias and develop computationally efficient implementation algorithms.\nSimulation results indicate that our correction method effectively eliminates\nthe bias from the predicted outcomes. We apply the proposed approach to the\nprediction of brain age using neuroimaging data. In comparison to competing\nmachine learning regression models, our method effectively addresses the\nlongstanding issue of \"systematic bias of machine learning regression\" in\nneuroimaging-based brain age calculation, yielding unbiased predictions of\nbrain age.\n","authors":["Hwiyoung Lee","Shuo Chen"],"pdf_url":"https://arxiv.org/pdf/2405.15950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02792v1","updated":"2024-09-04T15:06:44Z","published":"2024-09-04T15:06:44Z","title":"UnLearning from Experience to Avoid Spurious Correlations","summary":" While deep neural networks can achieve state-of-the-art performance in many\ntasks, these models are more fragile than they appear. They are prone to\nlearning spurious correlations in their training data, leading to surprising\nfailure cases. In this paper, we propose a new approach that addresses the\nissue of spurious correlations: UnLearning from Experience (ULE). Our method is\nbased on using two classification models trained in parallel: student and\nteacher models. Both models receive the same batches of training data. The\nstudent model is trained with no constraints and pursues the spurious\ncorrelations in the data. The teacher model is trained to solve the same\nclassification problem while avoiding the mistakes of the student model. As\ntraining is done in parallel, the better the student model learns the spurious\ncorrelations, the more robust the teacher model becomes. The teacher model uses\nthe gradient of the student's output with respect to its input to unlearn\nmistakes made by the student. We show that our method is effective on the\nWaterbirds, CelebA, Spawrious and UrbanCars datasets.\n","authors":["Jeff Mitchell","Jesús Martínez del Rincón","Niall McLaughlin"],"pdf_url":"https://arxiv.org/pdf/2409.02792v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2409.02778v1","updated":"2024-09-04T14:56:28Z","published":"2024-09-04T14:56:28Z","title":"Regularized Multi-output Gaussian Convolution Process with Domain\n Adaptation","summary":" Multi-output Gaussian process (MGP) has been attracting increasing attention\nas a transfer learning method to model multiple outputs. Despite its high\nflexibility and generality, MGP still faces two critical challenges when\napplied to transfer learning. The first one is negative transfer, which occurs\nwhen there exists no shared information among the outputs. The second challenge\nis the input domain inconsistency, which is commonly studied in transfer\nlearning yet not explored in MGP. In this paper, we propose a regularized MGP\nmodeling framework with domain adaptation to overcome these challenges. More\nspecifically, a sparse covariance matrix of MGP is proposed by using\nconvolution process, where penalization terms are added to adaptively select\nthe most informative outputs for knowledge transfer. 
To deal with the domain\ninconsistency, a domain adaptation method is proposed by marginalizing\ninconsistent features and expanding missing features to align the input domains\namong different outputs. Statistical properties of the proposed method are\nprovided to guarantee the performance practically and asymptotically. The\nproposed framework outperforms state-of-the-art benchmarks in comprehensive\nsimulation studies and one real case study of a ceramic manufacturing process.\nThe results demonstrate the effectiveness of our method in dealing with both\nthe negative transfer and the domain inconsistency.\n","authors":["Wang Xinming","Wang Chao","Song Xuan","Kirby Levi","Wu Jianguo"],"pdf_url":"https://arxiv.org/pdf/2409.02778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08381v2","updated":"2024-09-04T14:52:59Z","published":"2024-08-15T18:54:31Z","title":"Pre-processing and Compression: Understanding Hidden Representation\n Refinement Across Imaging Domains via Intrinsic Dimension","summary":" In recent years, there has been interest in how geometric properties such as\nintrinsic dimension (ID) of a neural network's hidden representations change\nthrough its layers, and how such properties are predictive of important model\nbehavior such as generalization ability. However, evidence has begun to emerge\nthat such behavior can change significantly depending on the domain of the\nnetwork's training data, such as natural versus medical images. Here, we\nfurther this inquiry by exploring how the ID of a network's learned\nrepresentations changes through its layers, in essence, characterizing how the\nnetwork successively refines the information content of input data to be used\nfor predictions. Analyzing eleven natural and medical image datasets across six\nnetwork architectures, we find that how ID changes through the network differs\nnoticeably between natural and medical image models. Specifically, medical\nimage models peak in representation ID earlier in the network, implying a\ndifference in the image features and their abstractness that are typically used\nfor downstream tasks in these domains. Additionally, we discover a strong\ncorrelation of this peak representation ID with the ID of the data in its input\nspace, implying that the intrinsic information content of a model's learned\nrepresentations is guided by that of the data it was trained on. Overall, our\nfindings emphasize notable discrepancies in network behavior between natural\nand non-natural imaging domains regarding hidden representation information\ncontent, and provide further insights into how a network's learned features are\nshaped by its training data.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2408.08381v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02772v1","updated":"2024-09-04T14:51:36Z","published":"2024-09-04T14:51:36Z","title":"Unifying Causal Representation Learning with the Invariance Principle","summary":" Causal representation learning aims at recovering latent causal variables\nfrom high-dimensional observations to solve causal downstream tasks, such as\npredicting the effect of new interventions or more robust classification. A\nplethora of methods have been developed, each tackling carefully crafted\nproblem settings that lead to different types of identifiability. The folklore\nis that these different settings are important, as they are often linked to\ndifferent rungs of Pearl's causal hierarchy, although not all neatly fit. 
Our\nmain contribution is to show that many existing causal representation learning\napproaches methodologically align the representation to known data symmetries.\nIdentification of the variables is guided by equivalence classes across\ndifferent data pockets that are not necessarily causal. This result suggests\nimportant implications, allowing us to unify many existing approaches in a\nsingle method that can mix and match different assumptions, including\nnon-causal ones, based on the invariances relevant to our application. It also\nsignificantly benefits applicability, which we demonstrate by improving\ntreatment effect estimation on real-world high-dimensional ecological data.\nOverall, this paper clarifies the role of causality assumptions in the\ndiscovery of causal variables and shifts the focus to preserving data\nsymmetries.\n","authors":["Dingling Yao","Dario Rancati","Riccardo Cadei","Marco Fumero","Francesco Locatello"],"pdf_url":"https://arxiv.org/pdf/2409.02772v1.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2407.13703v3","updated":"2024-09-04T14:41:26Z","published":"2024-06-26T08:59:49Z","title":"Energy-Efficient Channel Decoding for Wireless Federated Learning:\n Convergence Analysis and Adaptive Design","summary":" One of the most critical challenges for deploying distributed learning\nsolutions, such as federated learning (FL), in wireless networks is the limited\nbattery capacity of mobile clients. While it is a common belief that the major\nenergy consumption of mobile clients comes from the uplink data transmission,\nthis paper presents a novel finding, namely channel decoding also contributes\nsignificantly to the overall energy consumption of mobile clients in FL.\nMotivated by this new observation, we propose an energy-efficient adaptive\nchannel decoding scheme that leverages the intrinsic robustness of FL to model\nerrors. In particular, the robustness is exploited to reduce the energy\nconsumption of channel decoders at mobile clients by adaptively adjusting the\nnumber of decoding iterations. We theoretically prove that wireless FL with\ncommunication errors can converge at the same rate as the case with error-free\ncommunication provided the bit error rate (BER) is properly constrained. An\nadaptive channel decoding scheme is then proposed to improve the energy\nefficiency of wireless FL systems. Experimental results demonstrate that the\nproposed method maintains the same learning accuracy while reducing the channel\ndecoding energy consumption by ~20% when compared to an existing approach.\n","authors":["Linping Qu","Yuyi Mao","Shenghui Song","Chi-Ying Tsui"],"pdf_url":"https://arxiv.org/pdf/2407.13703v3.pdf","comment":"This paper has been accepted by the IEEE TWC. Copyright may be\n transferred without notice, after which this version may no longer be\n accessible"},{"id":"http://arxiv.org/abs/2409.00105v2","updated":"2024-09-04T14:40:14Z","published":"2024-08-27T14:40:16Z","title":"Negation Blindness in Large Language Models: Unveiling the NO Syndrome\n in Image Generation","summary":" Foundational Large Language Models (LLMs) have changed the way we perceive\ntechnology. They have been shown to excel in tasks ranging from poem writing\nand coding to essay generation and puzzle solving. With the incorporation of\nimage generation capability, they have become more comprehensive and versatile\nAI tools. At the same time, researchers are striving to identify the\nlimitations of these tools to improve them further. 
Currently identified flaws\ninclude hallucination, biases, and bypassing restricted commands to generate\nharmful content. In the present work, we have identified a fundamental\nlimitation related to the image generation ability of LLMs, and termed it The\nNO Syndrome. This negation blindness refers to LLMs inability to correctly\ncomprehend NO related natural language prompts to generate the desired images.\nInterestingly, all tested LLMs including GPT-4, Gemini, and Copilot were found\nto be suffering from this syndrome. To demonstrate the generalization of this\nlimitation, we carried out simulation experiments and conducted entropy-based\nand benchmark statistical analysis tests on various LLMs in multiple languages,\nincluding English, Hindi, and French. We conclude that the NO syndrome is a\nsignificant flaw in current LLMs that needs to be addressed. A related finding\nof this study showed a consistent discrepancy between image and textual\nresponses as a result of this NO syndrome. We posit that the introduction of a\nnegation context-aware reinforcement learning based feedback loop between the\nLLMs textual response and generated image could help ensure the generated text\nis based on both the LLMs correct contextual understanding of the negation\nquery and the generated visual output.\n","authors":["Mohammad Nadeem","Shahab Saquib Sohail","Erik Cambria","Björn W. Schuller","Amir Hussain"],"pdf_url":"https://arxiv.org/pdf/2409.00105v2.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.02747v1","updated":"2024-09-04T14:26:58Z","published":"2024-09-04T14:26:58Z","title":"Tractable Offline Learning of Regular Decision Processes","summary":" This work studies offline Reinforcement Learning (RL) in a class of\nnon-Markovian environments called Regular Decision Processes (RDPs). In RDPs,\nthe unknown dependency of future observations and rewards from the past\ninteractions can be captured by some hidden finite-state automaton. For this\nreason, many RDP algorithms first reconstruct this unknown dependency using\nautomata learning techniques. In this paper, we show that it is possible to\novercome two strong limitations of previous offline RL algorithms for RDPs,\nnotably RegORL. This can be accomplished via the introduction of two original\ntechniques: the development of a new pseudometric based on formal languages,\nwhich removes a problematic dependency on\n$L_\\infty^\\mathsf{p}$-distinguishability parameters, and the adoption of\nCount-Min-Sketch (CMS), instead of naive counting. The former reduces the\nnumber of samples required in environments that are characterized by a low\ncomplexity in language-theoretic terms. The latter alleviates the memory\nrequirements for long planning horizons. We derive the PAC sample complexity\nbounds associated to each of these techniques, and we validate the approach\nexperimentally.\n","authors":["Ahana Deb","Roberto Cipollone","Anders Jonsson","Alessandro Ronca","Mohammad Sadegh Talebi"],"pdf_url":"https://arxiv.org/pdf/2409.02747v1.pdf","comment":"To appear in EWRL 2024"},{"id":"http://arxiv.org/abs/2408.16945v3","updated":"2024-09-04T14:25:47Z","published":"2024-08-29T23:51:51Z","title":"Different Victims, Same Layout: Email Visual Similarity Detection for\n Enhanced Email Protection","summary":" In the pursuit of an effective spam detection system, the focus has often\nbeen on identifying known spam patterns either through rule-based detection\nsystems or machine learning (ML) solutions that rely on keywords. 
However, both\nsystems are susceptible to evasion techniques and zero-day attacks that can be\nachieved at low cost. Therefore, an email that bypassed the defense system once\ncan do it again in the following days, even though rules are updated or the ML\nmodels are retrained. The recurrence of failures to detect emails that exhibit\nlayout similarities to previously undetected spam is concerning for customers\nand can erode their trust in a company. Our observations show that threat\nactors reuse email kits extensively and can bypass detection with little\neffort, for example, by making changes to the content of emails. In this work,\nwe propose an email visual similarity detection approach, named Pisco, to\nimprove the detection capabilities of an email threat defense system. We apply\nour proof of concept to some real-world samples received from different\nsources. Our results show that email kits are being reused extensively and\nvisually similar emails are sent to our customers at various time intervals.\nTherefore, this method could be very helpful in situations where detection\nengines that rely on textual features and keywords are bypassed, an occurrence\nour observations show happens frequently.\n","authors":["Sachin Shukla","Omid Mirzaei"],"pdf_url":"https://arxiv.org/pdf/2408.16945v3.pdf","comment":"To be published in the proceedings of the ACM Conference on Computer\n and Communications Security (ACM CCS 2024)"},{"id":"http://arxiv.org/abs/2409.02740v1","updated":"2024-09-04T14:21:00Z","published":"2024-09-04T14:21:00Z","title":"Convolutional Neural Networks for Automated Cellular Automaton\n Classification","summary":" The emergent dynamics in spacetime diagrams of cellular automata (CAs) is\noften organised by means of a number of behavioural classes. Whilst\nclassification of elementary CAs is feasible and well-studied, non-elementary\nCAs are generally too diverse and numerous to exhaustively classify manually.\nIn this chapter we treat the spacetime diagram as a digital image, and\nimplement simple computer vision techniques to perform an automated\nclassification of elementary cellular automata into the five Li-Packard\nclasses. In particular, we present a supervised learning task to a\nconvolutional neural network, in such a way that it may be generalised to\nnon-elementary CAs. If we want to do so, we must divert the algorithm's focus\naway from the underlying 'microscopic' local updates. We first show that\npreviously developed deep learning approaches have in fact been trained to\nidentify the local update rule, rather than directly focus on the mesoscopic\npatterns that are associated with the particular behavioural classes. By means\nof a well-argued neural network design, as well as a number of data\naugmentation techniques, we then present a convolutional neural network that\nperforms nearly perfectly at identifying the behavioural class, without\nnecessarily first identifying the underlying microscopic dynamics.\n","authors":["Michiel Rollier","Aisling J. Daly","Jan M. 
Baetens"],"pdf_url":"https://arxiv.org/pdf/2409.02740v1.pdf","comment":"19 pages, 12 figures, book chapter"},{"id":"http://arxiv.org/abs/2402.02438v2","updated":"2024-09-04T14:14:17Z","published":"2024-02-04T10:27:42Z","title":"Fast and interpretable Support Vector Classification based on the\n truncated ANOVA decomposition","summary":" Support Vector Machines (SVMs) are an important tool for performing\nclassification on scattered data, where one usually has to deal with many data\npoints in high-dimensional spaces. We propose solving SVMs in primal form using\nfeature maps based on trigonometric functions or wavelets. In small dimensional\nsettings the Fast Fourier Transform (FFT) and related methods are a powerful\ntool in order to deal with the considered basis functions. For growing\ndimensions the classical FFT-based methods become inefficient due to the curse\nof dimensionality. Therefore, we restrict ourselves to multivariate basis\nfunctions, each of which only depends on a small number of dimensions. This is\nmotivated by the well-known sparsity of effects and recent results regarding\nthe reconstruction of functions from scattered data in terms of truncated\nanalysis of variance (ANOVA) decompositions, which makes the resulting model\neven interpretable in terms of importance of the features as well as their\ncouplings. The usage of small superposition dimensions has the consequence that\nthe computational effort no longer grows exponentially but only polynomially\nwith respect to the dimension. In order to enforce sparsity regarding the basis\ncoefficients, we use the frequently applied $\\ell_2$-norm and, in addition,\n$\\ell_1$-norm regularization. The found classifying function, which is the\nlinear combination of basis functions, and its variance can then be analyzed in\nterms of the classical ANOVA decomposition of functions. Based on numerical\nexamples we show that we are able to recover the signum of a function that\nperfectly fits our model assumptions. Furthermore, we perform classification on\ndifferent artificial and real-world data sets. We obtain better results with\n$\\ell_1$-norm regularization, both in terms of accuracy and clarity of\ninterpretability.\n","authors":["Kseniya Akhalaya","Franziska Nestler","Daniel Potts"],"pdf_url":"https://arxiv.org/pdf/2402.02438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.12965v2","updated":"2024-09-04T14:10:43Z","published":"2024-05-21T17:45:36Z","title":"The future of cosmological likelihood-based inference: accelerated\n high-dimensional parameter estimation and model comparison","summary":" We advocate for a new paradigm of cosmological likelihood-based inference,\nleveraging recent developments in machine learning and its underlying\ntechnology, to accelerate Bayesian inference in high-dimensional settings.\nSpecifically, we combine (i) emulation, where a machine learning model is\ntrained to mimic cosmological observables, e.g. CosmoPower-JAX; (ii)\ndifferentiable and probabilistic programming, e.g. JAX and NumPyro,\nrespectively; (iii) scalable Markov chain Monte Carlo (MCMC) sampling\ntechniques that exploit gradients, e.g. Hamiltonian Monte Carlo; and (iv)\ndecoupled and scalable Bayesian model selection techniques that compute the\nBayesian evidence purely from posterior samples, e.g. the learned harmonic mean\nimplemented in harmonic. 
This paradigm allows us to carry out a complete\nBayesian analysis, including both parameter estimation and model selection, in\na fraction of the time of traditional approaches. First, we demonstrate the\napplication of this paradigm on a simulated cosmic shear analysis for a Stage\nIV survey in 37- and 39-dimensional parameter spaces, comparing $\\Lambda$CDM\nand a dynamical dark energy model ($w_0w_a$CDM). We recover posterior contours\nand evidence estimates that are in excellent agreement with those computed by\nthe traditional nested sampling approach while reducing the computational cost\nfrom 8 months on 48 CPU cores to 2 days on 12 GPUs. Second, we consider a joint\nanalysis between three simulated next-generation surveys, each performing a\n3x2pt analysis, resulting in 157- and 159-dimensional parameter spaces.\nStandard nested sampling techniques are simply unlikely to be feasible in this\nhigh-dimensional setting, requiring a projected 12 years of compute time on 48\nCPU cores; on the other hand, the proposed approach only requires 8 days of\ncompute time on 24 GPUs. All packages used in our analyses are publicly\navailable.\n","authors":["Davide Piras","Alicja Polanska","Alessio Spurio Mancini","Matthew A. Price","Jason D. McEwen"],"pdf_url":"https://arxiv.org/pdf/2405.12965v2.pdf","comment":"14 pages, 6 figures. Accepted for publication in the Open Journal of\n Astrophysics. Codes available at\n https://github.com/alessiospuriomancini/cosmopower,\n https://github.com/dpiras/cosmopower-jax,\n https://github.com/astro-informatics/harmonic/"},{"id":"http://arxiv.org/abs/2409.02730v1","updated":"2024-09-04T14:03:08Z","published":"2024-09-04T14:03:08Z","title":"Complete and Efficient Covariants for 3D Point Configurations with\n Application to Learning Molecular Quantum Properties","summary":" When modeling physical properties of molecules with machine learning, it is\ndesirable to incorporate $SO(3)$-covariance. While such models based on low\nbody order features are not complete, we formulate and prove general\ncompleteness properties for higher order methods, and show that $6k-5$ of these\nfeatures are enough for up to $k$ atoms. We also find that the Clebsch--Gordan\noperations commonly used in these methods can be replaced by matrix\nmultiplications without sacrificing completeness, lowering the scaling from\n$O(l^6)$ to $O(l^3)$ in the degree of the features. We apply this to quantum\nchemistry, but the proposed methods are generally applicable for problems\ninvolving 3D point configurations.\n","authors":["Hartmut Maennel","Oliver T. Unke","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2409.02730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02728v1","updated":"2024-09-04T14:01:56Z","published":"2024-09-04T14:01:56Z","title":"Task-Oriented Communication for Graph Data: A Graph Information\n Bottleneck Approach","summary":" Graph data, essential in fields like knowledge representation and social\nnetworks, often involves large networks with many nodes and edges. Transmitting\nthese graphs can be highly inefficient due to their size and redundancy for\nspecific tasks. This paper introduces a method to extract a smaller,\ntask-focused subgraph that maintains key information while reducing\ncommunication overhead. Our approach utilizes graph neural networks (GNNs) and\nthe graph information bottleneck (GIB) principle to create a compact,\ninformative, and robust graph representation suitable for transmission. 
The\nchallenge lies in the irregular structure of graph data, making GIB\noptimization complex. We address this by deriving a tractable variational upper\nbound for the objective function. Additionally, we propose the VQ-GIB\nmechanism, integrating vector quantization (VQ) to convert subgraph\nrepresentations into a discrete codebook sequence, compatible with existing\ndigital communication systems. Our experiments show that this GIB-based method\nsignificantly lowers communication costs while preserving essential\ntask-related information. The approach demonstrates robust performance across\nvarious communication channels, suitable for both continuous and discrete\nsystems.\n","authors":["Shujing Li","Yanhu Wang","Shuaishuai Guo","Chenyuan Feng"],"pdf_url":"https://arxiv.org/pdf/2409.02728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02712v1","updated":"2024-09-04T13:49:45Z","published":"2024-09-04T13:49:45Z","title":"A Data Selection Approach for Enhancing Low Resource Machine Translation\n Using Cross-Lingual Sentence Representations","summary":" Machine translation in low-resource language pairs faces significant\nchallenges due to the scarcity of parallel corpora and linguistic resources.\nThis study focuses on the case of English-Marathi language pairs, where\nexisting datasets are notably noisy, impeding the performance of machine\ntranslation models. To mitigate the impact of data quality issues, we propose a\ndata filtering approach based on cross-lingual sentence representations. Our\nmethodology leverages a multilingual SBERT model to filter out problematic\ntranslations in the training data. Specifically, we employ an IndicSBERT\nsimilarity model to assess the semantic equivalence between original and\ntranslated sentences, allowing us to retain linguistically correct translations\nwhile discarding instances with substantial deviations. The results demonstrate\na significant improvement in translation quality over the baseline\npost-filtering with IndicSBERT. This illustrates how cross-lingual sentence\nrepresentations can reduce errors in machine translation scenarios with limited\nresources. By integrating multilingual sentence BERT models into the\ntranslation pipeline, this research contributes to advancing machine\ntranslation techniques in low-resource environments. The proposed method not\nonly addresses the challenges in English-Marathi language pairs but also\nprovides a valuable framework for enhancing translation quality in other\nlow-resource language translation tasks.\n","authors":["Nidhi Kowtal","Tejas Deshpande","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2409.02712v1.pdf","comment":"Accepted at I2CT 2024"},{"id":"http://arxiv.org/abs/2409.02708v1","updated":"2024-09-04T13:44:22Z","published":"2024-09-04T13:44:22Z","title":"Few-shot Multi-Task Learning of Linear Invariant Features with Meta\n Subspace Pursuit","summary":" Data scarcity poses a serious threat to modern machine learning and\nartificial intelligence, as their practical success typically relies on the\navailability of big datasets. One effective strategy to mitigate the issue of\ninsufficient data is to first harness information from other data sources\npossessing certain similarities in the study design stage, and then employ the\nmulti-task or meta learning framework in the analysis stage. 
In this paper, we\nfocus on multi-task (or multi-source) linear models whose coefficients across\ntasks share an invariant low-rank component, a popular structural assumption\nconsidered in the recent multi-task or meta learning literature. Under this\nassumption, we propose a new algorithm, called Meta Subspace Pursuit\n(abbreviated as Meta-SP), that provably learns this invariant subspace shared\nby different tasks. Under this stylized setup for multi-task or meta learning,\nwe establish both the algorithmic and statistical guarantees of the proposed\nmethod. Extensive numerical experiments are conducted, comparing Meta-SP\nagainst several competing methods, including popular, off-the-shelf\nmodel-agnostic meta learning algorithms such as ANIL. These experiments\ndemonstrate that Meta-SP achieves superior performance over the competing\nmethods in various aspects.\n","authors":["Chaozhi Zhang","Lin Liu","Xiaoqun Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02697v1","updated":"2024-09-04T13:33:38Z","published":"2024-09-04T13:33:38Z","title":"Decision Transformer for Enhancing Neural Local Search on the Job Shop\n Scheduling Problem","summary":" The job shop scheduling problem (JSSP) and its solution algorithms have been\nof enduring interest in both academia and industry for decades. In recent\nyears, machine learning (ML) is playing an increasingly important role in\nadvancing existing and building new heuristic solutions for the JSSP, aiming to\nfind better solutions in shorter computation times. In this paper we build on\ntop of a state-of-the-art deep reinforcement learning (DRL) agent, called\nNeural Local Search (NLS), which can efficiently and effectively control a\nlarge local neighborhood search on the JSSP. In particular, we develop a method\nfor training the decision transformer (DT) algorithm on search trajectories\ntaken by a trained NLS agent to further improve upon the learned\ndecision-making sequences. Our experiments show that the DT successfully learns\nlocal search strategies that are different and, in many cases, more effective\nthan those of the NLS agent itself. In terms of the tradeoff between solution\nquality and acceptable computational time needed for the search, the DT is\nparticularly superior in application scenarios where longer computational times\nare acceptable. In this case, it makes up for the longer inference times\nrequired per search step, which are caused by the larger neural network\narchitecture, through better quality decisions per step. Thereby, the DT\nachieves state-of-the-art results for solving the JSSP with ML-enhanced search.\n","authors":["Constantin Waubert de Puiseau","Fabian Wolz","Merlin Montag","Jannik Peters","Hasan Tercan","Tobias Meisen"],"pdf_url":"https://arxiv.org/pdf/2409.02697v1.pdf","comment":"currently under review for IEEE Transactions on Cybernetics"},{"id":"http://arxiv.org/abs/2402.10502v2","updated":"2024-09-04T13:31:31Z","published":"2024-02-16T08:21:43Z","title":"A possible late-time transition of $M_B$ inferred via neural networks","summary":" The strengthening of tensions in the cosmological parameters has led to a\nreconsideration of fundamental aspects of standard cosmology. The tension in\nthe Hubble constant can also be viewed as a tension between local and early\nUniverse constraints on the absolute magnitude $M_B$ of Type Ia supernova. In\nthis work, we reconsider the possibility of a variation of this parameter in a\nmodel-independent way. 
We employ neural networks to agnostically constrain the\nvalue of the absolute magnitude as well as assess the impact and statistical\nsignificance of a variation in $M_B$ with redshift from the Pantheon+\ncompilation, together with a thorough analysis of the neural network\narchitecture. We find an indication for a possible transition redshift at the\n$z\\approx 1$ region.\n","authors":["Purba Mukherjee","Konstantinos F. Dialektopoulos","Jackson Levi Said","Jurgen Mifsud"],"pdf_url":"https://arxiv.org/pdf/2402.10502v2.pdf","comment":"13 pages, 9 sets of figures, 2 tables. To appear in JCAP"},{"id":"http://arxiv.org/abs/2408.16122v2","updated":"2024-09-04T13:28:34Z","published":"2024-08-28T20:22:09Z","title":"Variational Mode Decomposition and Linear Embeddings are What You Need\n For Time-Series Forecasting","summary":" Time-series forecasting often faces challenges due to data volatility, which\ncan lead to inaccurate predictions. Variational Mode Decomposition (VMD) has\nemerged as a promising technique to mitigate volatility by decomposing data\ninto distinct modes, thereby enhancing forecast accuracy. In this study, we\nintegrate VMD with linear models to develop a robust forecasting framework. Our\napproach is evaluated on 13 diverse datasets, including ETTm2, WindTurbine, M4,\nand 10 air quality datasets from various Southeast Asian cities. The\neffectiveness of the VMD strategy is assessed by comparing Root Mean Squared\nError (RMSE) values from models utilizing VMD against those without it.\nAdditionally, we benchmark linear-based models against well-known neural\nnetwork architectures such as LSTM, Bidirectional LSTM, and RNN. The results\ndemonstrate a significant reduction in RMSE across nearly all models following\nVMD application. Notably, the Linear + VMD model achieved the lowest average\nRMSE in univariate forecasting at 0.619. In multivariate forecasting, the\nDLinear + VMD model consistently outperformed others, attaining the lowest RMSE\nacross all datasets with an average of 0.019. These findings underscore the\neffectiveness of combining VMD with linear models for superior time-series\nforecasting.\n","authors":["Hafizh Raihan Kurnia Putra","Novanto Yudistira","Tirana Noor Fatyanosa"],"pdf_url":"https://arxiv.org/pdf/2408.16122v2.pdf","comment":"For associated repository, see\n https://github.com/Espalemit/VMD-With-LTSF-Linear.git"},{"id":"http://arxiv.org/abs/2409.02686v1","updated":"2024-09-04T13:17:09Z","published":"2024-09-04T13:17:09Z","title":"Deconfounded Causality-aware Parameter-Efficient Fine-Tuning for\n Problem-Solving Improvement of LLMs","summary":" Large Language Models (LLMs) have demonstrated remarkable efficiency in\ntackling various tasks based on human instructions, but recent studies reveal\nthat these models often fail to achieve satisfactory results on questions\ninvolving reasoning, such as mathematics or physics questions. This phenomenon\nis usually attributed to the uncertainty regarding whether these models could\ngenuinely comprehend the knowledge embedded in the text or merely learn to\nreplicate the token distribution without a true understanding of the content.\nIn this paper, we delve into this problem and aim to enhance the reasoning\ncapabilities of LLMs. First, we investigate if the model has genuine reasoning\ncapabilities by visualizing the text generation process at the attention and\nrepresentation level. 
Then, we formulate the reasoning process of LLMs into a\ncausal framework, which provides a formal explanation of the problems we\nobserve in the visualization. Finally, building upon this causal framework, we\npropose Deconfounded Causal Adaptation (DCA), a novel parameter-efficient\nfine-tuning (PEFT) method to enhance the model's reasoning capabilities by\nencouraging the model to extract the general problem-solving skills and apply\nthese skills to different questions. Experiments show that our method\noutperforms the baseline consistently across multiple benchmarks, and with only\n1.2M tunable parameters, we achieve better or comparable results to other\nfine-tuning methods. This demonstrates the effectiveness and efficiency of our\nmethod in improving the overall accuracy and reliability of LLMs.\n","authors":["Ruoyu Wang","Xiaoxuan Li","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.02686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02684v1","updated":"2024-09-04T13:16:20Z","published":"2024-09-04T13:16:20Z","title":"Neural timescales from a computational perspective","summary":" Timescales of neural activity are diverse across and within brain areas, and\nexperimental observations suggest that neural timescales reflect information in\ndynamic environments. However, these observations do not specify how neural\ntimescales are shaped, nor whether particular timescales are necessary for\nneural computations and brain function. Here, we take a complementary\nperspective and synthesize three directions where computational methods can\ndistill the broad set of empirical observations into quantitative and testable\ntheories: We review (i) how data analysis methods allow us to capture different\ntimescales of neural dynamics across different recording modalities, (ii) how\ncomputational models provide a mechanistic explanation for the emergence of\ndiverse timescales, and (iii) how task-optimized models in machine learning\nuncover the functional relevance of neural timescales. This integrative\ncomputational approach, combined with empirical findings, would provide a more\nholistic understanding of how neural timescales capture the relationship\nbetween brain structure, dynamics, and behavior.\n","authors":["Roxana Zeraati","Anna Levina","Jakob H. Macke","Richard Gao"],"pdf_url":"https://arxiv.org/pdf/2409.02684v1.pdf","comment":"18 pages, 4 figures, 2 boxes"},{"id":"http://arxiv.org/abs/2409.02681v1","updated":"2024-09-04T13:11:59Z","published":"2024-09-04T13:11:59Z","title":"Neural Networks with LSTM and GRU in Modeling Active Fires in the Amazon","summary":" This study presents a comprehensive methodology for modeling and forecasting\nthe historical time series of fire spots detected by the AQUA_M-T satellite in\nthe Amazon, Brazil. The approach utilizes a mixed Recurrent Neural Network\n(RNN) model, combining Long Short-Term Memory (LSTM) and Gated Recurrent Unit\n(GRU) architectures to predict monthly accumulations of daily detected fire\nspots. A summary of the data revealed a consistent seasonality over time, with\nannual maximum and minimum fire spot values tending to repeat at the same\nperiods each year. The primary objective is to verify whether the forecasts\ncapture this inherent seasonality through rigorous statistical analysis. 
The\nmethodology involved careful data preparation, model configuration, and\ntraining using cross-validation with two seeds, ensuring that the data\ngeneralizes well to the test and validation sets, and confirming the\nconvergence of the model parameters. The results indicate that the mixed LSTM\nand GRU model offers improved accuracy in forecasting 12 months ahead,\ndemonstrating its effectiveness in capturing complex temporal patterns and\nmodeling the observed time series. This research significantly contributes to\nthe application of deep learning techniques in environmental monitoring,\nspecifically in fire spot forecasting. In addition to improving forecast\naccuracy, the proposed approach highlights the potential for adaptation to\nother time series forecasting challenges, opening new avenues for research and\ndevelopment in machine learning and natural phenomenon prediction. Keywords:\nTime Series Forecasting, Recurrent Neural Networks, Deep Learning.\n","authors":["Ramon Tavares"],"pdf_url":"https://arxiv.org/pdf/2409.02681v1.pdf","comment":"16 pages, in Portuguese language, 24 figures"},{"id":"http://arxiv.org/abs/2212.05782v2","updated":"2024-09-04T13:06:58Z","published":"2022-12-12T09:09:39Z","title":"GT-CausIn: a novel causal-based insight for traffic prediction","summary":" Traffic forecasting is an important application of spatiotemporal series\nprediction. Among different methods, graph neural networks have achieved so far\nthe most promising results, learning relations between graph nodes then becomes\na crucial task. However, improvement space is very limited when these relations\nare learned in a node-to-node manner. The challenge stems from (1) obscure\ntemporal dependencies between different stations, (2) difficulties in defining\nvariables beyond the node level, and (3) no ready-made method to validate the\nlearned relations. To confront these challenges, we define legitimate traffic\ncausal variables to discover the causal relation inside the traffic network,\nwhich is carefully checked with statistic tools and case analysis. We then\npresent a novel model named Graph Spatial-Temporal Network Based on Causal\nInsight (GT-CausIn), where prior learned causal information is integrated with\ngraph diffusion layers and temporal convolutional network (TCN) layers.\nExperiments are carried out on two real-world traffic datasets: PEMS-BAY and\nMETR-LA, which show that GT-CausIn significantly outperforms the\nstate-of-the-art models on mid-term and long-term prediction.\n","authors":["Ting Gao","Rodrigo Kappes Marques","Lei Yu"],"pdf_url":"https://arxiv.org/pdf/2212.05782v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02672v1","updated":"2024-09-04T13:00:59Z","published":"2024-09-04T13:00:59Z","title":"Independence Constrained Disentangled Representation Learning from\n Epistemological Perspective","summary":" Disentangled Representation Learning aims to improve the explainability of\ndeep learning methods by training a data encoder that identifies semantically\nmeaningful latent variables in the data generation process. Nevertheless, there\nis no consensus regarding a universally accepted definition for the objective\nof disentangled representation learning. In particular, there is a considerable\namount of discourse regarding whether should the latent variables be mutually\nindependent or not. 
In this paper, we first investigate these arguments on the\ninterrelationships between latent variables by establishing a conceptual bridge\nbetween Epistemology and Disentangled Representation Learning. Then, inspired\nby these interdisciplinary concepts, we introduce a two-level latent space\nframework to provide a general solution to the prior arguments on this issue.\nFinally, we propose a novel method for disentangled representation learning by\nemploying an integration of mutual information constraint and independence\nconstraint within the Generative Adversarial Network (GAN) framework.\nExperimental results demonstrate that our proposed method consistently\noutperforms baseline approaches in both quantitative and qualitative\nevaluations. The method exhibits strong performance across multiple commonly\nused metrics and demonstrates a great capability in disentangling various\nsemantic factors, leading to an improved quality of controllable generation,\nwhich consequently benefits the explainability of the algorithm.\n","authors":["Ruoyu Wang","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.02672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01821v2","updated":"2024-09-04T12:58:11Z","published":"2024-09-03T12:03:45Z","title":"When Does Visual Prompting Outperform Linear Probing for Vision-Language\n Models? A Likelihood Perspective","summary":" Adapting pre-trained models to new tasks can exhibit varying effectiveness\nacross datasets. Visual prompting, a state-of-the-art parameter-efficient\ntransfer learning method, can significantly improve the performance of\nout-of-distribution tasks. On the other hand, linear probing, a standard\ntransfer learning method, can sometimes become the best approach. We propose a\nlog-likelihood ratio (LLR) approach to analyze the comparative benefits of\nvisual prompting and linear probing. By employing the LLR score alongside\nresource-efficient visual prompts approximations, our cost-effective measure\nattains up to a 100-fold reduction in run time compared to full training, while\nachieving prediction accuracies up to 91%. The source code is available at\nhttps://github.com/IBM/VP-LLR.\n","authors":["Hsi-Ai Tsao","Lei Hsiung","Pin-Yu Chen","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2409.01821v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02669v1","updated":"2024-09-04T12:53:26Z","published":"2024-09-04T12:53:26Z","title":"Causality-Aware Transformer Networks for Robotic Navigation","summary":" Recent advances in machine learning algorithms have garnered growing interest\nin developing versatile Embodied AI systems. However, current research in this\ndomain reveals opportunities for improvement. First, the direct adoption of\nRNNs and Transformers often overlooks the specific differences between Embodied\nAI and traditional sequential data modelling, potentially limiting its\nperformance in Embodied AI tasks. Second, the reliance on task-specific\nconfigurations, such as pre-trained modules and dataset-specific logic,\ncompromises the generalizability of these methods. We address these constraints\nby initially exploring the unique differences between Embodied AI tasks and\nother sequential data tasks through the lens of Causality, presenting a causal\nframework to elucidate the inadequacies of conventional sequential methods for\nEmbodied AI. 
By leveraging this causal perspective, we propose Causality-Aware\nTransformer (CAT) Networks for Navigation, featuring a Causal Understanding\nModule to enhance the models's Environmental Understanding capability.\nMeanwhile, our method is devoid of task-specific inductive biases and can be\ntrained in an End-to-End manner, which enhances the method's generalizability\nacross various contexts. Empirical evaluations demonstrate that our methodology\nconsistently surpasses benchmark performances across a spectrum of settings,\ntasks and simulation environments. Extensive ablation studies reveal that the\nperformance gains can be attributed to the Causal Understanding Module, which\ndemonstrates effectiveness and efficiency in both Reinforcement Learning and\nSupervised Learning settings.\n","authors":["Ruoyu Wang","Yao Liu","Yuanjiang Cao","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2409.02669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02668v1","updated":"2024-09-04T12:51:41Z","published":"2024-09-04T12:51:41Z","title":"Introduction to Machine Learning","summary":" This book introduces the mathematical foundations and techniques that lead to\nthe development and analysis of many of the algorithms that are used in machine\nlearning. It starts with an introductory chapter that describes notation used\nthroughout the book and serve at a reminder of basic concepts in calculus,\nlinear algebra and probability and also introduces some measure theoretic\nterminology, which can be used as a reading guide for the sections that use\nthese tools. The introductory chapters also provide background material on\nmatrix analysis and optimization. The latter chapter provides theoretical\nsupport to many algorithms that are used in the book, including stochastic\ngradient descent, proximal methods, etc. After discussing basic concepts for\nstatistical prediction, the book includes an introduction to reproducing kernel\ntheory and Hilbert space techniques, which are used in many places, before\naddressing the description of various algorithms for supervised statistical\nlearning, including linear methods, support vector machines, decision trees,\nboosting, or neural networks. The subject then switches to generative methods,\nstarting with a chapter that presents sampling methods and an introduction to\nthe theory of Markov chains. The following chapter describe the theory of\ngraphical models, an introduction to variational methods for models with latent\nvariables, and to deep-learning based generative models. The next chapters\nfocus on unsupervised learning methods, for clustering, factor analysis and\nmanifold learning. The final chapter of the book is theory-oriented and\ndiscusses concentration inequalities and generalization bounds.\n","authors":["Laurent Younes"],"pdf_url":"https://arxiv.org/pdf/2409.02668v1.pdf","comment":"textbook"},{"id":"http://arxiv.org/abs/2312.02491v2","updated":"2024-09-04T12:43:48Z","published":"2023-12-05T04:43:23Z","title":"Pseudo Replay-based Class Continual Learning for Online New Category\n Anomaly Detection in Additive Manufacturing","summary":" The incorporation of advanced sensors and machine learning techniques has\nenabled modern manufacturing enterprises to perform data-driven\nclassification-based anomaly detection based on the sensor data collected in\nmanufacturing processes. 
However, one critical challenge is that newly\npresented defect category may manifest as the manufacturing process continues,\nresulting in monitoring performance deterioration of previously trained machine\nlearning models. Hence, there is an increasing need for empowering machine\nlearning models to learn continually. Among all continual learning methods,\nmemory-based continual learning has the best performance but faces the\nconstraints of data storage capacity. To address this issue, this paper\ndevelops a novel pseudo replay-based continual learning framework by\nintegrating class incremental learning and oversampling-based data generation.\nWithout storing all the data, the developed framework could generate\nhigh-quality data representing previous classes to train machine learning model\nincrementally when new category anomaly occurs. In addition, it could even\nenhance the monitoring performance since it also effectively improves the data\nquality. The effectiveness of the proposed framework is validated in three\ncases studies, which leverages supervised classification problem for anomaly\ndetection. The experimental results show that the developed method is very\npromising in detecting novel anomaly while maintaining a good performance on\nthe previous task and brings up more flexibility in model architecture.\n","authors":["Yuxuan Li","Tianxin Xie","Chenang Liu","Zhangyue Shi"],"pdf_url":"https://arxiv.org/pdf/2312.02491v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10064v2","updated":"2024-09-04T12:28:52Z","published":"2024-01-22T12:17:27Z","title":"Navigating the Maize: Cyclic and conditional computational graphs for\n molecular simulation","summary":" Many computational chemistry and molecular simulation workflows can be\nexpressed as graphs. This abstraction is useful to modularize and potentially\nreuse existing components, as well as provide parallelization and ease\nreproducibility. Existing tools represent the computation as a directed acyclic\ngraph (DAG), thus allowing efficient execution by parallelization of concurrent\nbranches. These systems can, however, generally not express cyclic and\nconditional workflows. We therefore developed Maize, a workflow manager for\ncyclic and conditional graphs based on the principles of flow-based\nprogramming. By running each node of the graph concurrently in separate\nprocesses and allowing communication at any time through dedicated inter-node\nchannels, arbitrary graph structures can be executed. We demonstrate the\neffectiveness of the tool on a dynamic active learning task in computational\ndrug design, involving the use of a small molecule generative model and an\nassociated scoring system, and on a reactivity prediction pipeline using\nquantum-chemistry and semiempirical approaches.\n","authors":["Thomas Löhr","Michele Assante","Michael Dodds","Lili Cao","Mikhail Kabeshov","Jon-Paul Janet","Marco Klähn","Ola Engkvist"],"pdf_url":"https://arxiv.org/pdf/2402.10064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00980v2","updated":"2024-09-04T12:25:28Z","published":"2024-09-02T06:52:01Z","title":"DNN-GDITD: Out-of-distribution detection via Deep Neural Network based\n Gaussian Descriptor for Imbalanced Tabular Data","summary":" Classification tasks present challenges due to class imbalances and evolving\ndata distributions. Addressing these issues requires a robust method to handle\nimbalances while effectively detecting out-of-distribution (OOD) samples not\nencountered during training. 
This study introduces a novel OOD detection\nalgorithm designed for tabular datasets, titled Deep Neural Network-based\nGaussian Descriptor for Imbalanced Tabular Data (DNN-GDITD). The DNN-GDITD\nalgorithm can be placed on top of any DNN to facilitate better classification\nof imbalanced data and OOD detection using spherical decision boundaries. Using\na combination of Push, Score-based, and focal losses, DNN-GDITD assigns\nconfidence scores to test data points, categorizing them as known classes or as\nan OOD sample. Extensive experimentation on tabular datasets demonstrates the\neffectiveness of DNN-GDITD compared to three OOD algorithms. Evaluation\nencompasses imbalanced and balanced scenarios on diverse tabular datasets,\nincluding a synthetic financial dispute dataset and publicly available tabular\ndatasets like Gas Sensor, Drive Diagnosis, and MNIST, showcasing DNN-GDITD's\nversatility.\n","authors":["Priyanka Chudasama","Anil Surisetty","Aakarsh Malhotra","Alok Singh"],"pdf_url":"https://arxiv.org/pdf/2409.00980v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2409.02647v1","updated":"2024-09-04T12:23:47Z","published":"2024-09-04T12:23:47Z","title":"Learning-Based Error Detection System for Advanced Vehicle Instrument\n Cluster Rendering","summary":" The automotive industry is currently expanding digital display options with\nevery new model that comes onto the market. This entails not just an expansion\nin dimensions, resolution, and customization choices, but also the capability\nto employ novel display effects like overlays while assembling the content of\nthe display cluster. Unfortunately, this raises the need for appropriate\nmonitoring systems that can detect rendering errors and apply appropriate\ncountermeasures when required. Classical solutions such as Cyclic Redundancy\nChecks (CRC) will soon be no longer viable as any sort of alpha blending,\nwarping of scaling of content can cause unwanted CRC violations. Therefore, we\npropose a novel monitoring approach to verify correctness of displayed content\nusing telltales (e.g. warning signs) as example. It uses a learning-based\napproach to separate \"good\" telltales, i.e. those that a human driver will\nunderstand correctly, and \"corrupted\" telltales, i.e. those that will not be\nvisible or perceived correctly. As a result, it possesses inherent resilience\nagainst individual pixel errors and implicitly supports changing backgrounds,\noverlay or scaling effects. This is underlined by our experimental study where\nall \"corrupted\" test patterns were correctly classified, while no false alarms\nwere triggered.\n","authors":["Cornelius Bürkle","Fabian Oboril","Kay-Ulrich Scholl"],"pdf_url":"https://arxiv.org/pdf/2409.02647v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2409.02644v1","updated":"2024-09-04T12:20:27Z","published":"2024-09-04T12:20:27Z","title":"Conformal Prediction in Dynamic Biological Systems","summary":" Uncertainty quantification (UQ) is the process of systematically determining\nand characterizing the degree of confidence in computational model predictions.\nIn the context of systems biology, especially with dynamic models, UQ is\ncrucial because it addresses the challenges posed by nonlinearity and parameter\nsensitivity, allowing us to properly understand and extrapolate the behavior of\ncomplex biological systems. Here, we focus on dynamic models represented by\ndeterministic nonlinear ordinary differential equations. 
Many current UQ\napproaches in this field rely on Bayesian statistical methods. While powerful,\nthese methods often require strong prior specifications and make parametric\nassumptions that may not always hold in biological systems. Additionally, these\nmethods face challenges in domains where sample sizes are limited, and\nstatistical inference becomes constrained, with computational speed being a\nbottleneck in large models of biological systems. As an alternative, we propose\nthe use of conformal inference methods, introducing two novel algorithms that,\nin some instances, offer non-asymptotic guarantees, enhancing robustness and\nscalability across various applications. We demonstrate the efficacy of our\nproposed algorithms through several scenarios, highlighting their advantages\nover traditional Bayesian approaches. The proposed methods show promising\nresults for diverse biological data structures and scenarios, offering a\ngeneral framework to quantify uncertainty for dynamic models of biological\nsystems.The software for the methodology and the reproduction of the results is\navailable at https://zenodo.org/doi/10.5281/zenodo.13644870.\n","authors":["Alberto Portela","Julio R. Banga","Marcos Matabuena"],"pdf_url":"https://arxiv.org/pdf/2409.02644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00180v4","updated":"2024-09-04T11:56:13Z","published":"2023-03-01T02:14:20Z","title":"MMA-MRNNet: Harnessing Multiple Models of Affect and Dynamic Masked RNN\n for Precise Facial Expression Intensity Estimation","summary":" This paper presents MMA-MRNNet, a novel deep learning architecture for\ndynamic multi-output Facial Expression Intensity Estimation (FEIE) from video\ndata. Traditional approaches to this task often rely on complex 3-D CNNs, which\nrequire extensive pre-training and assume that facial expressions are uniformly\ndistributed across all frames of a video. These methods struggle to handle\nvideos of varying lengths, often resorting to ad-hoc strategies that either\ndiscard valuable information or introduce bias. MMA-MRNNet addresses these\nchallenges through a two-stage process. First, the Multiple Models of Affect\n(MMA) extractor component is a Multi-Task Learning CNN that concurrently\nestimates valence-arousal, recognizes basic facial expressions, and detects\naction units in each frame. These representations are then processed by a\nMasked RNN component, which captures temporal dependencies and dynamically\nupdates weights according to the true length of the input video, ensuring that\nonly the most relevant features are used for the final prediction. The proposed\nunimodal non-ensemble learning MMA-MRNNet was evaluated on the Hume-Reaction\ndataset and demonstrated significantly superior performance, surpassing\nstate-of-the-art methods by a wide margin, regardless of whether they were\nunimodal, multimodal, or ensemble approaches. Finally, we demonstrated the\neffectiveness of the MMA component of our proposed method across multiple\nin-the-wild datasets, where it consistently outperformed all state-of-the-art\nmethods across various metrics.\n","authors":["Dimitrios Kollias","Andreas Psaroudakis","Anastasios Arsenos","Paraskevi Theofilou","Chunchang Shao","Guanyu Hu","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2303.00180v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00208v3","updated":"2024-09-04T11:48:04Z","published":"2023-11-01T00:38:26Z","title":"What Formal Languages Can Transformers Express? 
A Survey","summary":" As transformers have gained prominence in natural language processing, some\nresearchers have investigated theoretically what problems they can and cannot\nsolve, by treating problems as formal languages. Exploring such questions can\nhelp clarify the power of transformers relative to other models of computation,\ntheir fundamental capabilities and limits, and the impact of architectural\nchoices. Work in this subarea has made considerable progress in recent years.\nHere, we undertake a comprehensive survey of this work, documenting the diverse\nassumptions that underlie different results and providing a unified framework\nfor harmonizing seemingly contradictory findings.\n","authors":["Lena Strobl","William Merrill","Gail Weiss","David Chiang","Dana Angluin"],"pdf_url":"https://arxiv.org/pdf/2311.00208v3.pdf","comment":"One minor correction in {\\S}5.1"},{"id":"http://arxiv.org/abs/2307.13565v4","updated":"2024-09-04T11:47:12Z","published":"2023-07-25T15:17:31Z","title":"Decision-Focused Learning: Foundations, State of the Art, Benchmark and\n Future Opportunities","summary":" Decision-focused learning (DFL) is an emerging paradigm that integrates\nmachine learning (ML) and constrained optimization to enhance decision quality\nby training ML models in an end-to-end system. This approach shows significant\npotential to revolutionize combinatorial decision-making in real-world\napplications that operate under uncertainty, where estimating unknown\nparameters within decision models is a major challenge. This paper presents a\ncomprehensive review of DFL, providing an in-depth analysis of both\ngradient-based and gradient-free techniques used to combine ML and constrained\noptimization. It evaluates the strengths and limitations of these techniques\nand includes an extensive empirical evaluation of eleven methods across seven\nproblems. The survey also offers insights into recent advancements and future\nresearch directions in DFL.\n Code and benchmark: https://github.com/PredOpt/predopt-benchmarks\n","authors":["Jayanta Mandi","James Kotary","Senne Berden","Maxime Mulamba","Victor Bucarey","Tias Guns","Ferdinando Fioretto"],"pdf_url":"https://arxiv.org/pdf/2307.13565v4.pdf","comment":"Experimental Survey and Benchmarking"},{"id":"http://arxiv.org/abs/2409.02629v1","updated":"2024-09-04T11:47:00Z","published":"2024-09-04T11:47:00Z","title":"AdvSecureNet: A Python Toolkit for Adversarial Machine Learning","summary":" Machine learning models are vulnerable to adversarial attacks. Several tools\nhave been developed to research these vulnerabilities, but they often lack\ncomprehensive features and flexibility. We introduce AdvSecureNet, a PyTorch\nbased toolkit for adversarial machine learning that is the first to natively\nsupport multi-GPU setups for attacks, defenses, and evaluation. It is the first\ntoolkit that supports both CLI and API interfaces and external YAML\nconfiguration files to enhance versatility and reproducibility. The toolkit\nincludes multiple attacks, defenses and evaluation metrics. Rigiorous software\nengineering practices are followed to ensure high code quality and\nmaintainability. 
The project is available as an open-source project on GitHub\nat https://github.com/melihcatal/advsecurenet and installable via PyPI.\n","authors":["Melih Catal","Manuel Günther"],"pdf_url":"https://arxiv.org/pdf/2409.02629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02628v1","updated":"2024-09-04T11:45:55Z","published":"2024-09-04T11:45:55Z","title":"(Implicit) Ensembles of Ensembles: Epistemic Uncertainty Collapse in\n Large Models","summary":" Epistemic uncertainty is crucial for safety-critical applications and\nout-of-distribution detection tasks. Yet, we uncover a paradoxical phenomenon\nin deep learning models: an epistemic uncertainty collapse as model complexity\nincreases, challenging the assumption that larger models invariably offer\nbetter uncertainty quantification. We propose that this stems from implicit\nensembling within large models. To support this hypothesis, we demonstrate\nepistemic uncertainty collapse empirically across various architectures, from\nexplicit ensembles of ensembles and simple MLPs to state-of-the-art vision\nmodels, including ResNets and Vision Transformers -- for the latter, we examine\nimplicit ensemble extraction and decompose larger models into diverse\nsub-models, recovering epistemic uncertainty. We provide theoretical\njustification for these phenomena and explore their implications for\nuncertainty estimation.\n","authors":["Andreas Kirsch"],"pdf_url":"https://arxiv.org/pdf/2409.02628v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2404.07569v2","updated":"2024-09-04T11:34:33Z","published":"2024-04-11T08:57:48Z","title":"Can Vehicle Motion Planning Generalize to Realistic Long-tail Scenarios?","summary":" Real-world autonomous driving systems must make safe decisions in the face of\nrare and diverse traffic scenarios. Current state-of-the-art planners are\nmostly evaluated on real-world datasets like nuScenes (open-loop) or nuPlan\n(closed-loop). In particular, nuPlan seems to be an expressive evaluation\nmethod since it is based on real-world data and closed-loop, yet it mostly\ncovers basic driving scenarios. This makes it difficult to judge a planner's\ncapabilities to generalize to rarely-seen situations. Therefore, we propose a\nnovel closed-loop benchmark interPlan containing several edge cases and\nchallenging driving scenarios. We assess existing state-of-the-art planners on\nour benchmark and show that neither rule-based nor learning-based planners can\nsafely navigate the interPlan scenarios. A recently evolving direction is the\nusage of foundation models like large language models (LLM) to handle\ngeneralization. We evaluate an LLM-only planner and introduce a novel hybrid\nplanner that combines an LLM-based behavior planner with a rule-based motion\nplanner that achieves state-of-the-art performance on our benchmark.\n","authors":["Marcel Hallgarten","Julian Zapata","Martin Stoll","Katrin Renz","Andreas Zell"],"pdf_url":"https://arxiv.org/pdf/2404.07569v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16582v2","updated":"2024-09-04T11:14:18Z","published":"2024-03-25T09:49:42Z","title":"In the Search for Optimal Multi-view Learning Models for Crop\n Classification with Global Remote Sensing Data","summary":" Studying and analyzing cropland is a difficult task due to its dynamic and\nheterogeneous growth behavior. Usually, diverse data sources can be collected\nfor its estimation. 
Although deep learning models have proven to excel in the\ncrop classification task, they face substantial challenges when dealing with\nmultiple inputs, named Multi-View Learning (MVL). The methods used in the MVL\nscenario can be structured based on the encoder architecture, the fusion\nstrategy, and the optimization technique. The literature has primarily focused\non using specific encoder architectures for local regions, lacking a deeper\nexploration of other components in the MVL methodology. In contrast, we\ninvestigate the simultaneous selection of the fusion strategy and encoder\narchitecture, assessing global-scale cropland and crop-type classifications. We\nuse a range of five fusion strategies (Input, Feature, Decision, Ensemble,\nHybrid) and five temporal encoders (LSTM, GRU, TempCNN, TAE, L-TAE) as possible\nconfigurations in the MVL method. We use the CropHarvest dataset for\nvalidation, which provides optical, radar, weather time series, and topographic\ninformation as input data. We found that in scenarios with a limited number of\nlabeled samples, a unique configuration is insufficient for all the cases.\nInstead, a specialized combination should be meticulously sought, including an\nencoder and fusion strategy. To streamline this search process, we suggest\nidentifying the optimal encoder architecture tailored for a particular fusion\nstrategy, and then determining the most suitable fusion strategy for the\nclassification task. We provide a methodological framework for researchers\nexploring crop classification through an MVL methodology.\n","authors":["Francisco Mena","Diego Arenas","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2403.16582v2.pdf","comment":"submitted to journal"},{"id":"http://arxiv.org/abs/2407.15512v2","updated":"2024-09-04T11:01:47Z","published":"2024-07-22T09:58:29Z","title":"Increasing the Robustness of Model Predictions to Missing Sensors in\n Earth Observation","summary":" Multi-sensor ML models for EO aim to enhance prediction accuracy by\nintegrating data from various sources. However, the presence of missing data\nposes a significant challenge, particularly in non-persistent sensors that can\nbe affected by external factors. Existing literature has explored strategies\nlike temporal dropout and sensor-invariant models to address the generalization\nto missing data issues. Inspired by these works, we study two novel methods\ntailored for multi-sensor scenarios, namely Input Sensor Dropout (ISensD) and\nEnsemble Sensor Invariant (ESensI). Through experimentation on three\nmulti-sensor temporal EO datasets, we demonstrate that these methods\neffectively increase the robustness of model predictions to missing sensors.\nParticularly, we focus on how the predictive performance of models drops when\nsensors are missing at different levels. We observe that ensemble multi-sensor\nmodels are the most robust to the lack of sensors. In addition, the sensor\ndropout component in ISensD shows promising robustness results.\n","authors":["Francisco Mena","Diego Arenas","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2407.15512v2.pdf","comment":"Accepted at the MACLEAN workshop in the ECML/PKDD 2024"},{"id":"http://arxiv.org/abs/2401.15113v3","updated":"2024-09-04T10:59:10Z","published":"2024-01-25T20:41:17Z","title":"Scalable Glacier Mapping using Deep Learning and Open Earth Observation\n Data Matches the Accuracy of Manual Delineation","summary":" Accurate global glacier mapping is critical for understanding climate change\nimpacts. 
Despite its importance, automated glacier mapping at a global scale\nremains largely unexplored. Here we address this gap and propose\nGlacier-VisionTransformer-U-Net (GlaViTU), a convolutional-transformer deep\nlearning model, and five strategies for multitemporal global-scale glacier\nmapping using open satellite imagery. Assessing the spatial, temporal and\ncross-sensor generalisation shows that our best strategy achieves intersection\nover union >0.85 on previously unobserved images in most cases, which drops to\n>0.75 for debris-rich areas such as High-Mountain Asia and increases to >0.90\nfor regions dominated by clean ice. A comparative validation against human\nexpert uncertainties in terms of area and distance deviations underscores\nGlaViTU performance, approaching or matching expert-level delineation. Adding\nsynthetic aperture radar data, namely, backscatter and interferometric\ncoherence, increases the accuracy in all regions where available. The\ncalibrated confidence for glacier extents is reported making the predictions\nmore reliable and interpretable. We also release a benchmark dataset that\ncovers 9% of glaciers worldwide. Our results support efforts towards automated\nmultitemporal and global glacier mapping.\n","authors":["Konstantin A. Maslov","Claudio Persello","Thomas Schellenberger","Alfred Stein"],"pdf_url":"https://arxiv.org/pdf/2401.15113v3.pdf","comment":"after major revision, expanded validation"},{"id":"http://arxiv.org/abs/2409.01137v2","updated":"2024-09-04T10:58:57Z","published":"2024-09-02T10:19:31Z","title":"Smart E-commerce Recommendations with Semantic AI","summary":" In e-commerce, web mining for page recommendations is widely used but often\nfails to meet user needs. To address this, we propose a novel solution\ncombining semantic web mining with BP neural networks. We process user search\nlogs to extract five key features: content priority, time spent, user feedback,\nrecommendation semantics, and input deviation. These features are then fed into\na BP neural network to classify and prioritize web pages. The prioritized pages\nare recommended to users. Using book sales pages for testing, our results\ndemonstrate that this solution can quickly and accurately identify the pages\nusers need. Our approach ensures that recommendations are more relevant and\ntailored to individual preferences, enhancing the online shopping experience.\nBy leveraging advanced semantic analysis and neural network techniques, we\nbridge the gap between user expectations and actual recommendations. This\ninnovative method not only improves accuracy but also speeds up the\nrecommendation process, making it a valuable tool for e-commerce platforms\naiming to boost user satisfaction and engagement. Additionally, our system\nability to handle large datasets and provide real-time recommendations makes it\na scalable and efficient solution for modern e-commerce challenges.\n","authors":["M. Badouch","M. Boutaounte"],"pdf_url":"https://arxiv.org/pdf/2409.01137v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2409.00125v2","updated":"2024-09-04T10:48:52Z","published":"2024-08-28T22:02:42Z","title":"A Hybrid Framework for Spatial Interpolation: Merging Data-driven with\n Domain Knowledge","summary":" Estimating spatially distributed information through the interpolation of\nscattered observation datasets often overlooks the critical role of domain\nknowledge in understanding spatial dependencies. 
Additionally, the features of\nthese data sets are typically limited to the spatial coordinates of the\nscattered observation locations. In this paper, we propose a hybrid framework\nthat integrates data-driven spatial dependency feature extraction with\nrule-assisted spatial dependency function mapping to augment domain knowledge.\nWe demonstrate the superior performance of our framework in two comparative\napplication scenarios, highlighting its ability to capture more localized\nspatial features in the reconstructed distribution fields. Furthermore, we\nunderscore its potential to enhance nonlinear estimation capabilities through\nthe application of transformed fuzzy rules and to quantify the inherent\nuncertainties associated with the observation data sets. Our framework\nintroduces an innovative approach to spatial information estimation by\nsynergistically combining observational data with rule-assisted domain\nknowledge.\n","authors":["Cong Zhang","Shuyi Du","Hongqing Song","Yuhe Wang"],"pdf_url":"https://arxiv.org/pdf/2409.00125v2.pdf","comment":"21 pages, 13 figures; typos corrected, references updated"},{"id":"http://arxiv.org/abs/2409.02604v1","updated":"2024-09-04T10:37:44Z","published":"2024-09-04T10:37:44Z","title":"Hypothesizing Missing Causal Variables with LLMs","summary":" Scientific discovery is a catalyst for human intellectual advances, driven by\nthe cycle of hypothesis generation, experimental design, data evaluation, and\niterative assumption refinement. This process, while crucial, is expensive and\nheavily dependent on the domain knowledge of scientists to generate hypotheses\nand navigate the scientific cycle. Central to this is causality, the ability to\nestablish the relationship between the cause and the effect. Motivated by the\nscientific discovery process, in this work, we formulate a novel task where the\ninput is a partial causal graph with missing variables, and the output is a\nhypothesis about the missing variables to complete the partial graph. We design\na benchmark with varying difficulty levels and knowledge assumptions about the\ncausal graph. With the growing interest in using Large Language Models (LLMs)\nto assist in scientific discovery, we benchmark open-source and closed models\non our testbed. We show the strong ability of LLMs to hypothesize the mediation\nvariables between a cause and its effect. In contrast, they underperform in\nhypothesizing the cause and effect variables themselves. We also observe\nsurprising results where some of the open-source models outperform the closed\nGPT-4 model.\n","authors":["Ivaxi Sheth","Sahar Abdelnabi","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2409.02604v1.pdf","comment":"Code - https://github.com/ivaxi0s/hypothesizing-causal-variable-llm"},{"id":"http://arxiv.org/abs/2409.02599v1","updated":"2024-09-04T10:30:11Z","published":"2024-09-04T10:30:11Z","title":"A Fashion Item Recommendation Model in Hyperbolic Space","summary":" In this work, we propose a fashion item recommendation model that\nincorporates hyperbolic geometry into user and item representations. Using\nhyperbolic space, our model aims to capture implicit hierarchies among items\nbased on their visual data and users' purchase history. During training, we\napply a multi-task learning framework that considers both hyperbolic and\nEuclidean distances in the loss function. 
Our experiments on three data sets\nshow that our model performs better than previous models trained in Euclidean\nspace only, confirming the effectiveness of our model. Our ablation studies\nshow that multi-task learning plays a key role, and removing the Euclidean loss\nsubstantially deteriorates the model performance.\n","authors":["Ryotaro Shimizu","Yu Wang","Masanari Kimura","Yuki Hirakawa","Takashi Wada","Yuki Saito","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2409.02599v1.pdf","comment":"This work was presented at the CVFAD Workshop at CVPR 2024"},{"id":"http://arxiv.org/abs/2409.02596v1","updated":"2024-09-04T10:27:07Z","published":"2024-09-04T10:27:07Z","title":"An Analysis of Linear Complexity Attention Substitutes with BEST-RQ","summary":" Self-Supervised Learning (SSL) has proven to be effective in various domains,\nincluding speech processing. However, SSL is computationally and memory\nexpensive. This is in part due the quadratic complexity of multi-head\nself-attention (MHSA). Alternatives for MHSA have been proposed and used in the\nspeech domain, but have yet to be investigated properly in an SSL setting. In\nthis work, we study the effects of replacing MHSA with recent state-of-the-art\nalternatives that have linear complexity, namely, HyperMixing, Fastformer,\nSummaryMixing, and Mamba. We evaluate these methods by looking at the speed,\nthe amount of VRAM consumed, and the performance on the SSL MP3S benchmark.\nResults show that these linear alternatives maintain competitive performance\ncompared to MHSA while, on average, decreasing VRAM consumption by around 20%\nto 60% and increasing speed from 7% to 65% for input sequences ranging from 20\nto 80 seconds.\n","authors":["Ryan Whetten","Titouan Parcollet","Adel Moumen","Marco Dinarelli","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2409.02596v1.pdf","comment":"Accepted in the IEEE Soken Language Technology Workshop 2024"},{"id":"http://arxiv.org/abs/2405.04296v2","updated":"2024-09-04T10:23:04Z","published":"2024-05-07T13:11:37Z","title":"Open Implementation and Study of BEST-RQ for Speech Processing","summary":" Self-Supervised Learning (SSL) has proven to be useful in various speech\ntasks. However, these methods are generally very demanding in terms of data,\nmemory, and computational resources. BERT-based Speech pre-Training with\nRandom-projection Quantizer (BEST-RQ), is an SSL method that has shown great\nperformance on Automatic Speech Recognition (ASR) while being simpler than\nother SSL methods, such as wav2vec 2.0. Despite BEST-RQ's great performance,\ndetails are lacking in the original paper, such as the amount of GPU/TPU hours\nused in pre-training, and there is no official easy-to-use open-source\nimplementation. Furthermore, BEST-RQ has not been evaluated on other downstream\ntasks aside from ASR and speech translation. In this work, we describe a\nre-implementation of a Random-projection quantizer and perform a preliminary\nstudy with a comparison to wav2vec 2.0 on four downstream tasks. We discuss the\ndetails and differences of our implementation. 
We show that a random projection\nquantizer can achieve similar downstream performance as wav2vec 2.0 while\ndecreasing training time by over a factor of two.\n","authors":["Ryan Whetten","Titouan Parcollet","Marco Dinarelli","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2405.04296v2.pdf","comment":"Accepted in IEEE ICASSP 2024 workshop on Self-supervision in Audio,\n Speech and Beyond (SASB 2024)"},{"id":"http://arxiv.org/abs/2302.13696v5","updated":"2024-09-04T10:21:32Z","published":"2023-02-27T11:55:24Z","title":"Moderate Adaptive Linear Units (MoLU)","summary":" We propose a new high-performance activation function, Moderate Adaptive\nLinear Units (MoLU), for the deep neural network. The MoLU is a simple,\nbeautiful and powerful activation function that can be a good main activation\nfunction among hundreds of activation functions. Because the MoLU is made up of\nthe elementary functions, not only it is a diffeomorphism (i.e. analytic over\nwhole domains), but also it reduces the training time.\n","authors":["Hankyul Koh","Joon-hyuk Ko","Wonho Jhe"],"pdf_url":"https://arxiv.org/pdf/2302.13696v5.pdf","comment":"4 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.01227v2","updated":"2024-09-04T10:20:59Z","published":"2024-09-02T13:02:51Z","title":"Prompt Compression with Context-Aware Sentence Encoding for Fast and\n Improved LLM Inference","summary":" Large language models (LLMs) have triggered a new stream of research focusing\non compressing the context length to reduce the computational cost while\nensuring the retention of helpful information for LLMs to answer the given\nquestion. Token-based removal methods are one of the most prominent approaches\nin this direction, but risk losing the semantics of the context caused by\nintermediate token removal, especially under high compression ratios, while\nalso facing challenges in computational efficiency. In this work, we propose\ncontext-aware prompt compression (CPC), a sentence-level prompt compression\ntechnique where its key innovation is a novel context-aware sentence encoder\nthat provides a relevance score for each sentence for a given question. To\ntrain this encoder, we generate a new dataset consisting of questions,\npositives, and negative pairs where positives are sentences relevant to the\nquestion, while negatives are irrelevant context sentences. We train the\nencoder in a contrastive setup to learn context-aware sentence representations.\nOur method considerably outperforms prior works on prompt compression on\nbenchmark datasets and is up to 10.93x faster at inference compared to the best\ntoken-level compression method. We also find better improvement for shorter\nlength constraints in most benchmarks, showing the effectiveness of our\nproposed solution in the compression of relevant information in a shorter\ncontext. Finally, we release the code and the dataset for quick reproducibility\nand further development: https://github.com/Workday/cpc.\n","authors":["Barys Liskavets","Maxim Ushakov","Shuvendu Roy","Mark Klibanov","Ali Etemad","Shane Luke"],"pdf_url":"https://arxiv.org/pdf/2409.01227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15421v2","updated":"2024-09-04T10:17:22Z","published":"2024-08-27T21:54:26Z","title":"Simultaneous Training of First- and Second-Order Optimizers in\n Population-Based Reinforcement Learning","summary":" The tuning of hyperparameters in reinforcement learning (RL) is critical, as\nthese parameters significantly impact an agent's performance and learning\nefficiency. 
Dynamic adjustment of hyperparameters during the training process\ncan significantly enhance both the performance and stability of learning.\nPopulation-based training (PBT) provides a method to achieve this by\ncontinuously tuning hyperparameters throughout the training. This ongoing\nadjustment enables models to adapt to different learning stages, resulting in\nfaster convergence and overall improved performance. In this paper, we propose\nan enhancement to PBT by simultaneously utilizing both first- and second-order\noptimizers within a single population. We conducted a series of experiments\nusing the TD3 algorithm across various MuJoCo environments. Our results, for\nthe first time, empirically demonstrate the potential of incorporating\nsecond-order optimizers within PBT-based RL. Specifically, the combination of\nthe K-FAC optimizer with Adam led to up to a 10% improvement in overall\nperformance compared to PBT using only Adam. Additionally, in environments\nwhere Adam occasionally fails, such as the Swimmer environment, the mixed\npopulation with K-FAC exhibited more reliable learning outcomes, offering a\nsignificant advantage in training stability without a substantial increase in\ncomputational time.\n","authors":["Felix Pfeiffer","Shahram Eivazi"],"pdf_url":"https://arxiv.org/pdf/2408.15421v2.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.02588v1","updated":"2024-09-04T10:14:17Z","published":"2024-09-04T10:14:17Z","title":"Multiview Random Vector Functional Link Network for Predicting\n DNA-Binding Proteins","summary":" The identification of DNA-binding proteins (DBPs) is a critical task due to\ntheir significant impact on various biological activities. Understanding the\nmechanisms underlying protein-DNA interactions is essential for elucidating\nvarious life activities. In recent years, machine learning-based models have\nbeen prominently utilized for DBP prediction. In this paper, to predict DBPs,\nwe propose a novel framework termed a multiview random vector functional link\n(MvRVFL) network, which fuses neural network architecture with multiview\nlearning. The proposed MvRVFL model combines the benefits of late and early\nfusion, allowing for distinct regularization parameters across different views\nwhile leveraging a closed-form solution to determine unknown parameters\nefficiently. The primal objective function incorporates a coupling term aimed\nat minimizing a composite of errors stemming from all views. From each of the\nthree protein views of the DBP datasets, we extract five features. These\nfeatures are then fused together by incorporating a hidden feature during the\nmodel training process. The performance of the proposed MvRVFL model on the DBP\ndataset surpasses that of baseline models, demonstrating its superior\neffectiveness. Furthermore, we extend our assessment to the UCI, KEEL, AwA, and\nCorel5k datasets, to establish the practicality of the proposed models. The\nconsistency error bound, the generalization error bound, and empirical\nfindings, coupled with rigorous statistical analyses, confirm the superior\ngeneralization capabilities of the MvRVFL model compared to the baseline\nmodels.\n","authors":["A. Quadir","M. Sajid","M. 
Tanveer"],"pdf_url":"https://arxiv.org/pdf/2409.02588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02584v1","updated":"2024-09-04T10:06:42Z","published":"2024-09-04T10:06:42Z","title":"BMI Prediction from Handwritten English Characters Using a Convolutional\n Neural Network","summary":" A person's Body Mass Index, or BMI, is the most widely used parameter for\nassessing their health. BMI is a crucial predictor of potential diseases that\nmay arise at higher body fat levels because it is correlated with body fat.\nConversely, a community's or an individual's nutritional status can be\ndetermined using the BMI. Although deep learning models are used in several\nstudies to estimate BMI from face photos and other data, no previous research\nestablished a clear connection between deep learning techniques for handwriting\nanalysis and BMI prediction. This article addresses this research gap with a\ndeep learning approach to estimating BMI from handwritten characters by\ndeveloping a convolutional neural network (CNN). A dataset containing samples\nfrom 48 people in lowercase English scripts is successfully captured for the\nBMI prediction task. The proposed CNN-based approach reports a commendable\naccuracy of 99.92%. Performance comparison with other popular CNN architectures\nreveals that AlexNet and InceptionV3 achieve the second and third-best\nperformance, with the accuracy of 99.69% and 99.53%, respectively.\n","authors":["N. T. Diba","N. Akter","S. A. H. Chowdhury","J. E. Giti"],"pdf_url":"https://arxiv.org/pdf/2409.02584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04985v6","updated":"2024-09-04T10:04:52Z","published":"2023-12-08T11:47:35Z","title":"SparQ Attention: Bandwidth-Efficient LLM Inference","summary":" The computational difficulties of large language model (LLM) inference remain\na significant obstacle to their widespread deployment. The need for many\napplications to support long input sequences and process them in large batches\ntypically causes token-generation to be bottlenecked by data transfer. For this\nreason, we introduce SparQ Attention, a technique for increasing the inference\nthroughput of LLMs by utilising memory bandwidth more efficiently within the\nattention layers, through selective fetching of the cached history. Our\nproposed technique can be applied directly to off-the-shelf LLMs during\ninference, without requiring any modification to the pre-training setup or\nadditional fine-tuning. We show that SparQ Attention brings up to 8x savings in\nattention data transfers without substantial drops in accuracy, by evaluating\nLlama 2 and 3, Mistral, Gemma and Pythia models on a wide range of downstream\ntasks.\n","authors":["Luka Ribar","Ivan Chelombiev","Luke Hudlass-Galley","Charlie Blake","Carlo Luschi","Douglas Orr"],"pdf_url":"https://arxiv.org/pdf/2312.04985v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02572v1","updated":"2024-09-04T09:46:33Z","published":"2024-09-04T09:46:33Z","title":"Advancing Cyber Incident Timeline Analysis Through Rule Based AI and\n Large Language Models","summary":" Timeline Analysis (TA) is a key part of Timeline Forensics (TF) in Digital\nForensics (DF), focusing primarily on examining and analysing temporal digital\nartefacts such as timestamps, derived from event logs, file metadata, and other\nrelated data to correlate events resulting from cyber incidents and reconstruct\ntheir chronological timeline. 
Traditional tools often struggle to efficiently\nprocess the vast volume and variety of data acquired during DF investigations\nand Incident Response (IR) processes. This paper presents a novel framework,\nGenDFIR, that combines Rule-Based Artificial Intelligence (R-BAI) algorithms\nwith Large Language Models (LLMs) to advance and automate the TA process. Our\napproach consists of two main stages (1) We use R-BAI to identify and select\nanomalous digital artefacts based on predefined rules. (2) The selected\nartefacts are then converted into embeddings for processing by an LLM with the\nhelp of a Retrieval-Augmented Generation (RAG) agent. The LLM consequently\nleverages its capabilities to perform automated TA on the artefacts and predict\npotential incident scenarios. To validate our framework, we evaluate GenDFIR\nperformance, efficiency, and reliability using various metrics across synthetic\ncyber incident simulation scenarios. This paper presents a proof of concept,\nwhere the findings demonstrate the significant potential of integrating R-BAI\nand LLMs for TA. This novel approach highlights the power of Generative AI\n(GenAI), specifically LLMs, and opens new avenues for advanced threat detection\nand incident reconstruction, representing a significant step forward in the\nfield.\n","authors":["Fatma Yasmine Loumachi","Mohamed Chahine Ghanem"],"pdf_url":"https://arxiv.org/pdf/2409.02572v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2012.15079v2","updated":"2024-09-04T09:44:38Z","published":"2020-12-30T08:31:31Z","title":"Enhancing Sindhi Word Segmentation using Subword Representation Learning\n and Position-aware Self-attention","summary":" Sindhi word segmentation is a challenging task due to space omission and\ninsertion issues. The Sindhi language itself adds to this complexity. It's\ncursive and consists of characters with inherent joining and non-joining\nproperties, independent of word boundaries. Existing Sindhi word segmentation\nmethods rely on designing and combining hand-crafted features. However, these\nmethods have limitations, such as difficulty handling out-of-vocabulary words,\nlimited robustness for other languages, and inefficiency with large amounts of\nnoisy or raw text. Neural network-based models, in contrast, can automatically\ncapture word boundary information without requiring prior knowledge. In this\npaper, we propose a Subword-Guided Neural Word Segmenter (SGNWS) that addresses\nword segmentation as a sequence labeling task. The SGNWS model incorporates\nsubword representation learning through a bidirectional long short-term memory\nencoder, position-aware self-attention, and a conditional random field. Our\nempirical results demonstrate that the SGNWS model achieves state-of-the-art\nperformance in Sindhi word segmentation on six datasets.\n","authors":["Wazir Ali","Jay Kumar","Saifullah Tumrani","Redhwan Nour","Adeeb Noor","Zenglin Xu"],"pdf_url":"https://arxiv.org/pdf/2012.15079v2.pdf","comment":"Journal Paper, 14 pages"},{"id":"http://arxiv.org/abs/2409.02555v1","updated":"2024-09-04T09:21:13Z","published":"2024-09-04T09:21:13Z","title":"Low-Resolution Object Recognition with Cross-Resolution Relational\n Contrastive Distillation","summary":" Recognizing objects in low-resolution images is a challenging task due to the\nlack of informative details. 
Recent studies have shown that knowledge\ndistillation approaches can effectively transfer knowledge from a\nhigh-resolution teacher model to a low-resolution student model by aligning\ncross-resolution representations. However, these approaches still face\nlimitations in adapting to the situation where the recognized objects exhibit\nsignificant representation discrepancies between training and testing images.\nIn this study, we propose a cross-resolution relational contrastive\ndistillation approach to facilitate low-resolution object recognition. Our\napproach enables the student model to mimic the behavior of a well-trained\nteacher model which delivers high accuracy in identifying high-resolution\nobjects. To extract sufficient knowledge, the student learning is supervised\nwith contrastive relational distillation loss, which preserves the similarities\nin various relational structures in contrastive representation space. In this\nmanner, the capability of recovering missing details of familiar low-resolution\nobjects can be effectively enhanced, leading to a better knowledge transfer.\nExtensive experiments on low-resolution object classification and\nlow-resolution face recognition clearly demonstrate the effectiveness and\nadaptability of our approach.\n","authors":["Kangkai Zhang","Shiming Ge","Ruixin Shi","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2409.02555v1.pdf","comment":"This paper is accepted by IEEE Transactions on Circuits and Systems\n for Video Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2409.02530v1","updated":"2024-09-04T08:44:36Z","published":"2024-09-04T08:44:36Z","title":"Understanding eGFR Trajectories and Kidney Function Decline via Large\n Multimodal Models","summary":" The estimated Glomerular Filtration Rate (eGFR) is an essential indicator of\nkidney function in clinical practice. Although traditional equations and\nMachine Learning (ML) models using clinical and laboratory data can estimate\neGFR, accurately predicting future eGFR levels remains a significant challenge\nfor nephrologists and ML researchers. Recent advances demonstrate that Large\nLanguage Models (LLMs) and Large Multimodal Models (LMMs) can serve as robust\nfoundation models for diverse applications. This study investigates the\npotential of LMMs to predict future eGFR levels with a dataset consisting of\nlaboratory and clinical values from 50 patients. By integrating various\nprompting techniques and ensembles of LMMs, our findings suggest that these\nmodels, when combined with precise prompts and visual representations of eGFR\ntrajectories, offer predictive performance comparable to existing ML models.\nThis research extends the application of foundation models and suggests avenues\nfor future studies to harness these models in addressing complex medical\nforecasting challenges.\n","authors":["Chih-Yuan Li","Jun-Ting Wu","Chan Hsu","Ming-Yen Lin","Yihuang Kang"],"pdf_url":"https://arxiv.org/pdf/2409.02530v1.pdf","comment":"This preprint version includes corrections of typographical errors\n related to numerical values in Table 2, which were present in the version\n published at the BDH workshop in MIPR 2024. These corrections do not affect\n the overall conclusions of the study"},{"id":"http://arxiv.org/abs/2409.02529v1","updated":"2024-09-04T08:42:42Z","published":"2024-09-04T08:42:42Z","title":"Sample what you cant compress","summary":" For learned image representations, basic autoencoders often produce blurry\nresults. 
Reconstruction quality can be improved by incorporating additional\npenalties such as adversarial (GAN) and perceptual losses. Arguably, these\napproaches lack a principled interpretation. Concurrently, in generative\nsettings diffusion has demonstrated a remarkable ability to create crisp, high\nquality results and has solid theoretical underpinnings (from variational\ninference to direct study as the Fisher Divergence). Our work combines\nautoencoder representation learning with diffusion and is, to our knowledge,\nthe first to demonstrate the efficacy of jointly learning a continuous encoder\nand decoder under a diffusion-based loss. We demonstrate that this approach\nyields better reconstruction quality as compared to GAN-based autoencoders\nwhile being easier to tune. We also show that the resulting representation is\neasier to model with a latent diffusion model as compared to the representation\nobtained from a state-of-the-art GAN-based loss. Since our decoder is\nstochastic, it can generate details not encoded in the otherwise deterministic\nlatent representation; we therefore name our approach \"Sample what you can't\ncompress\", or SWYCC for short.\n","authors":["Vighnesh Birodkar","Gabriel Barcik","James Lyon","Sergey Ioffe","David Minnen","Joshua V. Dillon"],"pdf_url":"https://arxiv.org/pdf/2409.02529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02517v1","updated":"2024-09-04T08:25:54Z","published":"2024-09-04T08:25:54Z","title":"Training Universal Vocoders with Feature Smoothing-Based Augmentation\n Methods for High-Quality TTS Systems","summary":" While universal vocoders have achieved proficient waveform generation across\ndiverse voices, their integration into text-to-speech (TTS) tasks often results\nin degraded synthetic quality. To address this challenge, we present a novel\naugmentation technique for training universal vocoders. Our training scheme\nrandomly applies linear smoothing filters to input acoustic features,\nfacilitating vocoder generalization across a wide range of smoothings. It\nsignificantly mitigates the training-inference mismatch, enhancing the\nnaturalness of synthetic output even when the acoustic model produces overly\nsmoothed features. Notably, our method is applicable to any vocoder without\nrequiring architectural modifications or dependencies on specific acoustic\nmodels. The experimental results validate the superiority of our vocoder over\nconventional methods, achieving 11.99% and 12.05% improvements in mean opinion\nscores when integrated with Tacotron 2 and FastSpeech 2 TTS acoustic models,\nrespectively.\n","authors":["Jeongmin Liu","Eunwoo Song"],"pdf_url":"https://arxiv.org/pdf/2409.02517v1.pdf","comment":"4 pages, 4 figures, for demo samples, see\n https://sytronik.github.io/demos/voc_smth_aug/"},{"id":"http://arxiv.org/abs/2409.02512v1","updated":"2024-09-04T08:21:47Z","published":"2024-09-04T08:21:47Z","title":"Continual Diffuser (CoD): Mastering Continual Offline Reinforcement\n Learning with Experience Rehearsal","summary":" Artificial neural networks, especially recent diffusion-based models, have\nshown remarkable superiority in gaming, control, and QA systems, where the\ntraining tasks' datasets are usually static. However, in real-world\napplications, such as robotic control of reinforcement learning (RL), the tasks\nare changing, and new tasks arise in a sequential order. 
This situation poses\nthe new challenge of plasticity-stability trade-off for training an agent who\ncan adapt to task changes and retain acquired knowledge. In view of this, we\npropose a rehearsal-based continual diffusion model, called Continual Diffuser\n(CoD), to endow the diffuser with the capabilities of quick adaptation\n(plasticity) and lasting retention (stability). Specifically, we first\nconstruct an offline benchmark that contains 90 tasks from multiple domains.\nThen, we train the CoD on each task with sequential modeling and conditional\ngeneration for making decisions. Next, we preserve a small portion of previous\ndatasets as the rehearsal buffer and replay it to retain the acquired\nknowledge. Extensive experiments on a series of tasks show CoD can achieve a\npromising plasticity-stability trade-off and outperform existing\ndiffusion-based methods and other representative baselines on most tasks.\n","authors":["Jifeng Hu","Li Shen","Sili Huang","Zhejian Yang","Hechang Chen","Lichao Sun","Yi Chang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2409.02512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10093v2","updated":"2024-09-04T08:20:40Z","published":"2024-06-14T14:49:12Z","title":"BiKC: Keypose-Conditioned Consistency Policy for Bimanual Robotic\n Manipulation","summary":" Bimanual manipulation tasks typically involve multiple stages which require\nefficient interactions between two arms, posing step-wise and stage-wise\nchallenges for imitation learning systems. Specifically, failure and delay of\none step will broadcast through time, hinder success and efficiency of each\nsub-stage task, and thereby overall task performance. Although recent works\nhave made strides in addressing certain challenges, few approaches explicitly\nconsider the multi-stage nature of bimanual tasks while simultaneously\nemphasizing the importance of inference speed. In this paper, we introduce a\nnovel keypose-conditioned consistency policy tailored for bimanual\nmanipulation. It is a hierarchical imitation learning framework that consists\nof a high-level keypose predictor and a low-level trajectory generator. The\npredicted keyposes provide guidance for trajectory generation and also mark the\ncompletion of one sub-stage task. The trajectory generator is designed as a\nconsistency model trained from scratch without distillation, which generates\naction sequences conditioning on current observations and predicted keyposes\nwith fast inference speed. Simulated and real-world experimental results\ndemonstrate that the proposed approach surpasses baseline methods in terms of\nsuccess rate and operational efficiency. Codes are available at\nhttps://github.com/ManUtdMoon/BiKC.\n","authors":["Dongjie Yu","Hang Xu","Yizhou Chen","Yi Ren","Jia Pan"],"pdf_url":"https://arxiv.org/pdf/2406.10093v2.pdf","comment":"Accepted by The 16th International Workshop on the Algorithmic\n Foundations of Robotics (WAFR 2024)"},{"id":"http://arxiv.org/abs/2405.11449v3","updated":"2024-09-04T08:03:27Z","published":"2024-05-19T04:58:53Z","title":"NetMamba: Efficient Network Traffic Classification via Pre-training\n Unidirectional Mamba","summary":" Network traffic classification is a crucial research area aiming to enhance\nservice quality, streamline network management, and bolster cybersecurity. To\naddress the growing complexity of transmission encryption techniques, various\nmachine learning and deep learning methods have been proposed. However,\nexisting approaches face two main challenges. 
Firstly, they struggle with model\ninefficiency due to the quadratic complexity of the widely used Transformer\narchitecture. Secondly, they suffer from inadequate traffic representation\nbecause of discarding important byte information while retaining unwanted\nbiases. To address these challenges, we propose NetMamba, an efficient\nlinear-time state space model equipped with a comprehensive traffic\nrepresentation scheme. We adopt a specially selected and improved\nunidirectional Mamba architecture for the networking field, instead of the\nTransformer, to address efficiency issues. In addition, we design a traffic\nrepresentation scheme to extract valid information from massive traffic data\nwhile removing biased information. Evaluation experiments on six public\ndatasets encompassing three main classification tasks showcase NetMamba's\nsuperior classification performance compared to state-of-the-art baselines. It\nachieves an accuracy rate of nearly 99% (some over 99%) in all tasks.\nAdditionally, NetMamba demonstrates excellent efficiency, improving inference\nspeed by up to 60 times while maintaining comparably low memory usage.\nFurthermore, NetMamba exhibits superior few-shot learning abilities, achieving\nbetter classification performance with fewer labeled data. To the best of our\nknowledge, NetMamba is the first model to tailor the Mamba architecture for\nnetworking.\n","authors":["Tongze Wang","Xiaohui Xie","Wenduo Wang","Chuyi Wang","Youjian Zhao","Yong Cui"],"pdf_url":"https://arxiv.org/pdf/2405.11449v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02495v1","updated":"2024-09-04T07:46:28Z","published":"2024-09-04T07:46:28Z","title":"CoAst: Validation-Free Contribution Assessment for Federated Learning\n based on Cross-Round Valuation","summary":" In the federated learning (FL) process, since the data held by each\nparticipant is different, it is necessary to figure out which participant has a\nhigher contribution to the model performance. Effective contribution assessment\ncan help motivate data owners to participate in the FL training. Research works\nin this field can be divided into two directions based on whether a validation\ndataset is required. Validation-based methods need to use representative\nvalidation data to measure the model accuracy, which is difficult to obtain in\npractical FL scenarios. Existing validation-free methods assess the\ncontribution based on the parameters and gradients of local models and the\nglobal model in a single training round, which is easily compromised by the\nstochasticity of model training. In this work, we propose CoAst, a practical\nmethod to assess the FL participants' contribution without access to any\nvalidation data. The core idea of CoAst involves two aspects: one is to only\ncount the most important part of model parameters through a weights\nquantization, and the other is a cross-round valuation based on the similarity\nbetween the current local parameters and the global parameter updates in\nseveral subsequent communication rounds. 
Extensive experiments show that CoAst\nhas comparable assessment reliability to existing validation-based methods and\noutperforms existing validation-free methods.\n","authors":["Hao Wu","Likun Zhang","Shucheng Li","Fengyuan Xu","Sheng Zhong"],"pdf_url":"https://arxiv.org/pdf/2409.02495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02492v1","updated":"2024-09-04T07:35:12Z","published":"2024-09-04T07:35:12Z","title":"Reliable Deep Diffusion Tensor Estimation: Rethinking the Power of\n Data-Driven Optimization Routine","summary":" Diffusion tensor imaging (DTI) holds significant importance in clinical\ndiagnosis and neuroscience research. However, conventional model-based fitting\nmethods often suffer from sensitivity to noise, leading to decreased accuracy\nin estimating DTI parameters. While traditional data-driven deep learning\nmethods have shown potential in terms of accuracy and efficiency, their limited\ngeneralization to out-of-training-distribution data impedes their broader\napplication due to the diverse scan protocols used across centers, scanners,\nand studies. This work aims to tackle these challenges and promote the use of\nDTI by introducing a data-driven optimization-based method termed DoDTI. DoDTI\ncombines the weighted linear least squares fitting algorithm and regularization\nby denoising technique. The former fits DW images from diverse acquisition\nsettings into diffusion tensor field, while the latter applies a deep\nlearning-based denoiser to regularize the diffusion tensor field instead of the\nDW images, which is free from the limitation of fixed-channel assignment of the\nnetwork. The optimization object is solved using the alternating direction\nmethod of multipliers and then unrolled to construct a deep neural network,\nleveraging a data-driven strategy to learn network parameters. Extensive\nvalidation experiments are conducted utilizing both internally simulated\ndatasets and externally obtained in-vivo datasets. The results, encompassing\nboth qualitative and quantitative analyses, showcase that the proposed method\nattains state-of-the-art performance in DTI parameter estimation. Notably, it\ndemonstrates superior generalization, accuracy, and efficiency, rendering it\nhighly reliable for widespread application in the field.\n","authors":["Jialong Li","Zhicheng Zhang","Yunwei Chen","Qiqi Lu","Ye Wu","Xiaoming Liu","QianJin Feng","Yanqiu Feng","Xinyuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11293v2","updated":"2024-09-04T07:27:39Z","published":"2023-11-19T10:43:43Z","title":"From Categories to Classifiers: Name-Only Continual Learning by\n Exploring the Web","summary":" Continual Learning (CL) often relies on the availability of extensive\nannotated datasets, an assumption that is unrealistically time-consuming and\ncostly in practice. We explore a novel paradigm termed name-only continual\nlearning where time and cost constraints prohibit manual annotation. In this\nscenario, learners adapt to new category shifts using only category names\nwithout the luxury of annotated training data. Our proposed solution leverages\nthe expansive and ever-evolving internet to query and download uncurated\nwebly-supervised data for image classification. We investigate the reliability\nof our web data and find them comparable, and in some cases superior, to\nmanually annotated datasets. 
Additionally, we show that by harnessing the web,\nwe can create support sets that surpass state-of-the-art name-only\nclassification approaches, which build their support sets using generative models or image\nretrieval from LAION-5B, achieving up to a 25% boost in accuracy. When applied\nacross varied continual learning contexts, our method consistently exhibits a\nsmall performance gap in comparison to models trained on manually annotated\ndatasets. We present EvoTrends, a class-incremental dataset made from the web\nto capture real-world trends, created in just minutes. Overall, this paper\nunderscores the potential of using uncurated webly-supervised data to mitigate\nthe challenges associated with manual data labeling in continual learning.\n","authors":["Ameya Prabhu","Hasan Abed Al Kader Hammoud","Ser-Nam Lim","Bernard Ghanem","Philip H. S. Torr","Adel Bibi"],"pdf_url":"https://arxiv.org/pdf/2311.11293v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02485v1","updated":"2024-09-04T07:23:12Z","published":"2024-09-04T07:23:12Z","title":"Adversarial Attacks on Machine Learning-Aided Visualizations","summary":" Research in ML4VIS investigates how to use machine learning (ML) techniques\nto generate visualizations, and the field is rapidly growing with high societal\nimpact. However, as with any computational pipeline that employs ML processes,\nML4VIS approaches are susceptible to a range of ML-specific adversarial\nattacks. These attacks can manipulate visualization generations, causing\nanalysts to be tricked and their judgments to be impaired. Due to a lack of\nsynthesis from both visualization and ML perspectives, this security aspect is\nlargely overlooked by the current ML4VIS literature. To bridge this gap, we\ninvestigate the potential vulnerabilities of ML-aided visualizations from\nadversarial attacks using a holistic lens of both visualization and ML\nperspectives. We first identify the attack surface (i.e., attack entry points)\nthat is unique in ML-aided visualizations. We then exemplify five different\nadversarial attacks. These examples highlight the range of possible attacks\nwhen considering the attack surface and multiple different adversary\ncapabilities. Our results show that adversaries can induce various attacks,\nsuch as creating arbitrary and deceptive visualizations, by systematically\nidentifying input attributes that are influential in ML inferences. Based on\nour observations of the attack surface characteristics and the attack examples,\nwe underline the importance of comprehensive studies of security issues and\ndefense mechanisms as a call of urgency for the ML4VIS community.\n","authors":["Takanori Fujiwara","Kostiantyn Kucher","Junpeng Wang","Rafael M. Martins","Andreas Kerren","Anders Ynnerman"],"pdf_url":"https://arxiv.org/pdf/2409.02485v1.pdf","comment":"This is the author's version of the article that has been accepted by\n the Journal of Visualization"},{"id":"http://arxiv.org/abs/2409.02482v1","updated":"2024-09-04T07:18:26Z","published":"2024-09-04T07:18:26Z","title":"Volumetric Surfaces: Representing Fuzzy Geometries with Multiple Meshes","summary":" High-quality real-time view synthesis methods are based on volume rendering,\nsplatting, or surface rendering. While surface-based methods generally are the\nfastest, they cannot faithfully model fuzzy geometry like hair. In turn,\nalpha-blending techniques excel at representing fuzzy materials but require an\nunbounded number of samples per ray (P1). 
Further overheads are induced by\nempty space skipping in volume rendering (P2) and sorting input primitives in\nsplatting (P3). These problems are exacerbated on low-performance graphics\nhardware, e.g. on mobile devices. We present a novel representation for\nreal-time view synthesis where the (P1) number of sampling locations is small\nand bounded, (P2) sampling locations are efficiently found via rasterization,\nand (P3) rendering is sorting-free. We achieve this by representing objects as\nsemi-transparent multi-layer meshes, rendered in fixed layer order from\noutermost to innermost. We model mesh layers as SDF shells with optimal spacing\nlearned during training. After baking, we fit UV textures to the corresponding\nmeshes. We show that our method can represent challenging fuzzy objects while\nachieving higher frame rates than volume-based and splatting-based methods on\nlow-end and mobile devices.\n","authors":["Stefano Esposito","Anpei Chen","Christian Reiser","Samuel Rota Bulò","Lorenzo Porzi","Katja Schwarz","Christian Richardt","Michael Zollhöfer","Peter Kontschieder","Andreas Geiger"],"pdf_url":"https://arxiv.org/pdf/2409.02482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02471v1","updated":"2024-09-04T06:43:17Z","published":"2024-09-04T06:43:17Z","title":"Demographic parity in regression and classification within the\n unawareness framework","summary":" This paper explores the theoretical foundations of fair regression under the\nconstraint of demographic parity within the unawareness framework, where\ndisparate treatment is prohibited, extending existing results where such\ntreatment is permitted. Specifically, we aim to characterize the optimal fair\nregression function when minimizing the quadratic loss. Our results reveal that\nthis function is given by the solution to a barycenter problem with optimal\ntransport costs. Additionally, we study the connection between optimal fair\ncost-sensitive classification, and optimal fair regression. We demonstrate that\nnestedness of the decision sets of the classifiers is both necessary and\nsufficient to establish a form of equivalence between classification and\nregression. Under this nestedness assumption, the optimal classifiers can be\nderived by applying thresholds to the optimal fair regression function;\nconversely, the optimal fair regression function is characterized by the family\nof cost-sensitive classifiers.\n","authors":["Vincent Divol","Solenne Gaucher"],"pdf_url":"https://arxiv.org/pdf/2409.02471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11008v2","updated":"2024-09-04T06:41:37Z","published":"2024-05-17T11:09:33Z","title":"A Systematic Review on Sleep Stage Classification and Sleep Disorder\n Detection Using Artificial Intelligence","summary":" Sleep is vital for people's physical and mental health, and sound sleep can\nhelp them focus on daily activities. Therefore, a sleep study that includes\nsleep patterns and sleep disorders is crucial to enhancing our knowledge about\nindividuals' health status. This study aims to provide a comprehensive,\nsystematic review of the recent literature to analyze the different approaches\nand their outcomes in sleep studies, which includes works on \"sleep stages\nclassification\" and \"sleep disorder detection\" using AI. In this review, 183\narticles were initially selected from different journals, among which 80\nrecords were enlisted for explicit review, ranging from 2016 to 2023. 
Brain\nwaves were the most commonly employed body parameters for sleep staging and\ndisorder studies (almost 29% of the research used brain activity signals\nexclusively, and 77% combined them with other signals). The convolutional neural\nnetwork (CNN), the most widely used of the 34 distinct artificial intelligence\nmodels, comprised 27%. The other models included the long short-term memory\n(LSTM), support vector machine (SVM), random forest (RF), and recurrent neural\nnetwork (RNN), which accounted for 11%, 6%, 6%, and 5%, respectively. For\nperformance metrics, accuracy was the most widely used, appearing in 83.75% of the\ncases, followed by the F1 score (45%), Kappa (36.25%), sensitivity (31.25%), and\nspecificity (30%), along with other metrics. This article would\nhelp physicians and researchers get the gist of AI's contribution to sleep\nstudies and the feasibility of their intended work.\n","authors":["Tayab Uddin Wara","Ababil Hossain Fahad","Adri Shankar Das","Md. Mehedi Hasan Shawon"],"pdf_url":"https://arxiv.org/pdf/2405.11008v2.pdf","comment":"39 pages, 11 Figures, 8 Tables"},{"id":"http://arxiv.org/abs/2404.10155v3","updated":"2024-09-04T06:10:47Z","published":"2024-04-15T22:02:58Z","title":"The Fault in our Stars: Quality Assessment of Code Generation Benchmarks","summary":" Large Language Models (LLMs) are gaining popularity among software engineers.\nA crucial aspect of developing effective code generation LLMs is to evaluate\nthese models using a robust benchmark. Evaluation benchmarks with quality\nissues can provide a false sense of performance. In this work, we conduct the\nfirst-of-its-kind study of the quality of prompts within benchmarks used to\ncompare the performance of different code generation models. To conduct this\nstudy, we analyzed 3,566 prompts from 9 code generation benchmarks to identify\nquality issues in them. We also investigated whether fixing the identified\nquality issues in the benchmarks' prompts affects a model's performance. We\nalso studied memorization issues of the evaluation dataset, which can put into\nquestion a benchmark's trustworthiness. We found that code generation\nevaluation benchmarks mainly focused on Python and coding exercises and had\nvery limited contextual dependencies to challenge the model. These datasets and\nthe developers' prompts suffer from quality issues like spelling and\ngrammatical errors, unclear sentences that fail to express developers' intent, and\nimproper documentation style. Fixing all these issues in the benchmarks can\nlead to better performance for Python code generation, but no significant\nimprovement was observed for Java code generation. We also found evidence that\nGPT-3.5-Turbo and CodeGen-2.5 models may have data contamination issues.\n","authors":["Mohammed Latif Siddiq","Simantika Dristi","Joy Saha","Joanna C. S. Santos"],"pdf_url":"https://arxiv.org/pdf/2404.10155v3.pdf","comment":"Accepted at the 24th IEEE International Conference on Source Code\n Analysis and Manipulation (SCAM 2024) Research Track"},{"id":"http://arxiv.org/abs/2305.18420v2","updated":"2024-09-04T05:03:06Z","published":"2023-05-28T19:40:46Z","title":"Sample Complexity of Variance-reduced Distributionally Robust Q-learning","summary":" Dynamic decision-making under distributional shifts is of fundamental\ninterest in theory and applications of reinforcement learning: The distribution\nof the environment in which the data is collected can differ from that of the\nenvironment in which the model is deployed. 
This paper presents two novel\nmodel-free algorithms, namely the distributionally robust Q-learning and its\nvariance-reduced counterpart, that can effectively learn a robust policy\ndespite distributional shifts. These algorithms are designed to efficiently\napproximate the $q$-function of an infinite-horizon $\\gamma$-discounted robust\nMarkov decision process with Kullback-Leibler ambiguity set to an entry-wise\n$\\epsilon$-degree of precision. Further, the variance-reduced distributionally\nrobust Q-learning combines the synchronous Q-learning with variance-reduction\ntechniques to enhance its performance. Consequently, we establish that it\nattains a minimax sample complexity upper bound of $\\tilde\nO(|\\mathbf{S}||\\mathbf{A}|(1-\\gamma)^{-4}\\epsilon^{-2})$, where $\\mathbf{S}$\nand $\\mathbf{A}$ denote the state and action spaces. This is the first\ncomplexity result that is independent of the ambiguity size $\\delta$, thereby\nproviding new complexity theoretic insights. Additionally, a series of\nnumerical experiments confirm the theoretical findings and the efficiency of\nthe algorithms in handling distributional shifts.\n","authors":["Shengbo Wang","Nian Si","Jose Blanchet","Zhengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.18420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02446v1","updated":"2024-09-04T04:56:41Z","published":"2024-09-04T04:56:41Z","title":"ForeCal: Random Forest-based Calibration for DNNs","summary":" Deep neural network(DNN) based classifiers do extremely well in\ndiscriminating between observations, resulting in higher ROC AUC and accuracy\nmetrics, but their outputs are often miscalibrated with respect to true event\nlikelihoods. Post-hoc calibration algorithms are often used to calibrate the\noutputs of these classifiers. Methods like Isotonic regression, Platt scaling,\nand Temperature scaling have been shown to be effective in some cases but are\nlimited by their parametric assumptions and/or their inability to capture\ncomplex non-linear relationships. We propose ForeCal - a novel post-hoc\ncalibration algorithm based on Random forests. ForeCal exploits two unique\nproperties of Random forests: the ability to enforce weak monotonicity and\nrange-preservation. It is more powerful in achieving calibration than current\nstate-of-the-art methods, is non-parametric, and can incorporate exogenous\ninformation as features to learn a better calibration function. Through\nexperiments on 43 diverse datasets from the UCI ML repository, we show that\nForeCal outperforms existing methods in terms of Expected Calibration\nError(ECE) with minimal impact on the discriminative power of the base DNN as\nmeasured by AUC.\n","authors":["Dhruv Nigam"],"pdf_url":"https://arxiv.org/pdf/2409.02446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02431v1","updated":"2024-09-04T04:18:25Z","published":"2024-09-04T04:18:25Z","title":"Adversarial Learning for Neural PDE Solvers with Sparse Data","summary":" Neural network solvers for partial differential equations (PDEs) have made\nsignificant progress, yet they continue to face challenges related to data\nscarcity and model robustness. Traditional data augmentation methods, which\nleverage symmetry or invariance, impose strong assumptions on physical systems\nthat often do not hold in dynamic and complex real-world applications. To\naddress this research gap, this study introduces a universal learning strategy\nfor neural network PDEs, named Systematic Model Augmentation for Robust\nTraining (SMART). 
By focusing on challenging and improving the model's\nweaknesses, SMART reduces generalization error during training under\ndata-scarce conditions, leading to significant improvements in prediction\naccuracy across various PDE scenarios. The effectiveness of the proposed method\nis demonstrated through both theoretical analysis and extensive\nexperimentation. The code will be available.\n","authors":["Yunpeng Gong","Yongjie Hou","Zhenzhong Wang","Zexin Lin","Min Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.02431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02430v1","updated":"2024-09-04T04:17:57Z","published":"2024-09-04T04:17:57Z","title":"Transfer-based Adversarial Poisoning Attacks for Online (MIMO-)Deep\n Receviers","summary":" Recently, the design of wireless receivers using deep neural networks (DNNs),\nknown as deep receivers, has attracted extensive attention for ensuring\nreliable communication in complex channel environments. To adapt quickly to\ndynamic channels, online learning has been adopted to update the weights of\ndeep receivers with over-the-air data (e.g., pilots). However, the fragility of\nneural models and the openness of wireless channels expose these systems to\nmalicious attacks. To this end, understanding these attack methods is essential\nfor robust receiver design. In this paper, we propose a transfer-based\nadversarial poisoning attack method for online receivers. Without knowledge of\nthe attack target, adversarial perturbations are injected into the pilots,\npoisoning the online deep receiver and impairing its ability to adapt to\ndynamic channels and nonlinear effects. In particular, our attack method\ntargets Deep Soft Interference Cancellation (DeepSIC)[1] using online\nmeta-learning. As a classical model-driven deep receiver, DeepSIC incorporates\nwireless domain knowledge into its architecture. This integration allows it to\nadapt efficiently to time-varying channels with only a small number of pilots,\nachieving optimal performance in a multi-input and multi-output (MIMO)\nscenario. The deep receiver in this scenario has a number of applications in the\nfield of wireless communication, which motivates our study of the attack\nmethods targeting it. Specifically, we demonstrate the effectiveness of our\nattack in simulations on synthetic linear, synthetic nonlinear, static, and\nCOST 2100 channels. Simulation results indicate that the proposed poisoning\nattack significantly reduces the performance of online receivers in rapidly\nchanging scenarios.\n","authors":["Kunze Wu","Weiheng Jiang","Dusit Niyato","Yinghuan Li","Chuang Luo"],"pdf_url":"https://arxiv.org/pdf/2409.02430v1.pdf","comment":"15 pages, 14 figures"},{"id":"http://arxiv.org/abs/2409.02428v1","updated":"2024-09-04T04:15:14Z","published":"2024-09-04T04:15:14Z","title":"Large Language Models as Efficient Reward Function Searchers for\n Custom-Environment Multi-Objective Reinforcement Learning","summary":" Leveraging large language models (LLMs) for designing reward functions\ndemonstrates significant potential. However, achieving effective design and\nimprovement of reward functions in reinforcement learning (RL) tasks with\ncomplex custom environments and multiple requirements presents considerable\nchallenges. In this paper, we enable LLMs to be effective white-box searchers,\nhighlighting their advanced semantic understanding capabilities. Specifically,\nwe generate reward components for each explicit user requirement and employ the\nreward critic to identify the correct code form. 
Then, LLMs assign weights to\nthe reward components to balance their values and iteratively search and\noptimize these weights based on the context provided by the training log\nanalyzer, while adaptively determining the search step size. We applied the\nframework to an underwater information collection RL task without direct human\nfeedback or reward examples (zero-shot). The reward critic successfully corrects\nthe reward code with only one round of feedback for each requirement, effectively\npreventing irreparable errors that can occur when reward function feedback is\nprovided in aggregate. The effective initialization of weights enables the\nacquisition of different reward functions within the Pareto solution set\nwithout weight search. Even in the case where a weight is 100 times off, fewer\nthan four iterations are needed to obtain solutions that meet user\nrequirements. The framework also works well with most prompts utilizing GPT-3.5\nTurbo, since it does not require advanced numerical understanding or\ncalculation.\n","authors":["Guanwen Xie","Jingzehua Xu","Yiyuan Yang","Shuai Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02426v1","updated":"2024-09-04T04:14:02Z","published":"2024-09-04T04:14:02Z","title":"Diffusion Models Learn Low-Dimensional Distributions via Subspace\n Clustering","summary":" Recent empirical studies have demonstrated that diffusion models can\neffectively learn the image distribution and generate new samples. Remarkably,\nthese models can achieve this even with a small number of training samples\ndespite a large image dimension, circumventing the curse of dimensionality. In\nthis work, we provide theoretical insights into this phenomenon by leveraging\nkey empirical observations: (i) the low intrinsic dimensionality of image data,\n(ii) a union-of-manifolds structure of image data, and (iii) the low-rank\nproperty of the denoising autoencoder in trained diffusion models. These\nobservations motivate us to assume the underlying data distribution of image\ndata as a mixture of low-rank Gaussians and to parameterize the denoising\nautoencoder as a low-rank model according to the score function of the assumed\ndistribution. With these setups, we rigorously show that optimizing the\ntraining loss of diffusion models is equivalent to solving the canonical\nsubspace clustering problem over the training samples. Based on this\nequivalence, we further show that the minimal number of samples required to\nlearn the underlying distribution scales linearly with the intrinsic dimensions\nunder the above data and model assumptions. This insight sheds light on why\ndiffusion models can break the curse of dimensionality and exhibit the phase\ntransition in learning distributions. Moreover, we empirically establish a\ncorrespondence between the subspaces and the semantic representations of image\ndata, facilitating image editing. 
We validate these results with corroborated\nexperimental results on both simulated distributions and image datasets.\n","authors":["Peng Wang","Huijie Zhang","Zekai Zhang","Siyi Chen","Yi Ma","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2409.02426v1.pdf","comment":"39 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.02425v1","updated":"2024-09-04T04:12:22Z","published":"2024-09-04T04:12:22Z","title":"Deep Adaptive Interest Network: Personalized Recommendation with\n Context-Aware Learning","summary":" In personalized recommendation systems, accurately capturing users' evolving\ninterests and combining them with contextual information is a critical research\narea. This paper proposes a novel model called the Deep Adaptive Interest\nNetwork (DAIN), which dynamically models users' interests while incorporating\ncontext-aware learning mechanisms to achieve precise and adaptive personalized\nrecommendations. DAIN leverages deep learning techniques to build an adaptive\ninterest network structure that can capture users' interest changes in\nreal-time while further optimizing recommendation results by integrating\ncontextual information. Experiments conducted on several public datasets\ndemonstrate that DAIN excels in both recommendation performance and\ncomputational efficiency. This research not only provides a new solution for\npersonalized recommendation systems but also offers fresh insights into the\napplication of context-aware learning in recommendation systems.\n","authors":["Shuaishuai Huang","Haowei Yang","You Yao","Xueting Lin","Yuming Tu"],"pdf_url":"https://arxiv.org/pdf/2409.02425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08627v3","updated":"2024-09-04T03:53:59Z","published":"2023-12-08T04:27:11Z","title":"Predicting and Interpreting Energy Barriers of Metallic Glasses with\n Graph Neural Networks","summary":" Metallic Glasses (MGs) are widely used materials that are stronger than steel\nwhile being shapeable as plastic. While understanding the structure-property\nrelationship of MGs remains a challenge in materials science, studying their\nenergy barriers (EBs) as an intermediary step shows promise. In this work, we\nutilize Graph Neural Networks (GNNs) to model MGs and study EBs. We contribute\na new dataset for EB prediction and a novel Symmetrized GNN (SymGNN) model that\nis E(3)-invariant in expectation. SymGNN handles invariance by aggregating over\northogonal transformations of the graph structure. When applied to EB\nprediction, SymGNN are more accurate than molecular dynamics (MD)\nlocal-sampling methods and other machine-learning models. Compared to precise\nMD simulations, SymGNN reduces the inference time on new MGs from roughly 41\ndays to less than one second. We apply explanation algorithms to reveal the\nrelationship between structures and EBs. The structures that we identify\nthrough explanations match the medium-range order (MRO) hypothesis and possess\nunique topological properties. Our work enables effective prediction and\ninterpretation of MG EBs, bolstering material science research.\n","authors":["Haoyu Li","Shichang Zhang","Longwen Tang","Mathieu Bauchy","Yizhou Sun"],"pdf_url":"https://arxiv.org/pdf/2401.08627v3.pdf","comment":"ICML 2024. 
Code available at https://github.com/haoyuli02/SymGNN"},{"id":"http://arxiv.org/abs/2409.02416v1","updated":"2024-09-04T03:41:44Z","published":"2024-09-04T03:41:44Z","title":"Relative-Translation Invariant Wasserstein Distance","summary":" We introduce a new family of distances, relative-translation invariant\nWasserstein distances ($RW_p$), for measuring the similarity of two probability\ndistributions under distribution shift. Generalizing it from the classical\noptimal transport model, we show that $RW_p$ distances are also real distance\nmetrics defined on the quotient set $\\mathcal{P}_p(\\mathbb{R}^n)/\\sim$ and\ninvariant to distribution translations. When $p=2$, the $RW_2$ distance enjoys\nmore exciting properties, including decomposability of the optimal transport\nmodel, translation-invariance of the $RW_2$ distance, and a Pythagorean\nrelationship between $RW_2$ and the classical quadratic Wasserstein distance\n($W_2$). Based on these properties, we show that a distribution shift, measured\nby $W_2$ distance, can be explained in the bias-variance perspective. In\naddition, we propose a variant of the Sinkhorn algorithm, named $RW_2$ Sinkhorn\nalgorithm, for efficiently calculating $RW_2$ distance, coupling solutions, as\nwell as $W_2$ distance. We also provide the analysis of numerical stability and\ntime complexity for the proposed algorithm. Finally, we validate the $RW_2$\ndistance metric and the algorithm performance with three experiments. We\nconduct one numerical validation for the $RW_2$ Sinkhorn algorithm and show two\nreal-world applications demonstrating the effectiveness of using $RW_2$ under\ndistribution shift: digits recognition and similar thunderstorm detection. The\nexperimental results report that our proposed algorithm significantly improves\nthe computational efficiency of Sinkhorn in certain practical applications, and\nthe $RW_2$ distance is robust to distribution translations compared with\nbaselines.\n","authors":["Binshuai Wang","Qiwei Di","Ming Yin","Mengdi Wang","Quanquan Gu","Peng Wei"],"pdf_url":"https://arxiv.org/pdf/2409.02416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02413v1","updated":"2024-09-04T03:39:23Z","published":"2024-09-04T03:39:23Z","title":"Abstractive Text Summarization: State of the Art, Challenges, and\n Improvements","summary":" Specifically focusing on the landscape of abstractive text summarization, as\nopposed to extractive techniques, this survey presents a comprehensive\noverview, delving into state-of-the-art techniques, prevailing challenges, and\nprospective research directions. We categorize the techniques into traditional\nsequence-to-sequence models, pre-trained large language models, reinforcement\nlearning, hierarchical methods, and multi-modal summarization. Unlike prior\nworks that did not examine complexities, scalability and comparisons of\ntechniques in detail, this review takes a comprehensive approach encompassing\nstate-of-the-art methods, challenges, solutions, comparisons, limitations and\ncharts out future improvements - providing researchers an extensive overview to\nadvance abstractive summarization research. We provide vital comparison tables\nacross techniques categorized - offering insights into model complexity,\nscalability and appropriate applications. The paper highlights challenges such\nas inadequate meaning representation, factual consistency, controllable text\nsummarization, cross-lingual summarization, and evaluation metrics, among\nothers. 
Solutions leveraging knowledge incorporation and other innovative\nstrategies are proposed to address these challenges. The paper concludes by\nhighlighting emerging research areas like factual inconsistency,\ndomain-specific, cross-lingual, multilingual, and long-document summarization,\nas well as handling noisy data. Our objective is to provide researchers and\npractitioners with a structured overview of the domain, enabling them to better\nunderstand the current landscape and identify potential areas for further\nresearch and improvement.\n","authors":["Hassan Shakil","Ahmad Farooq","Jugal Kalita"],"pdf_url":"https://arxiv.org/pdf/2409.02413v1.pdf","comment":"9 Tables, 7 Figures"},{"id":"http://arxiv.org/abs/2104.12678v6","updated":"2024-09-04T03:27:11Z","published":"2021-04-26T16:11:47Z","title":"Semi-Decentralized Federated Edge Learning for Fast Convergence on\n Non-IID Data","summary":" Federated edge learning (FEEL) has emerged as an effective approach to reduce\nthe large communication latency in Cloud-based machine learning solutions,\nwhile preserving data privacy. Unfortunately, the learning performance of FEEL\nmay be compromised due to limited training data in a single edge cluster. In\nthis paper, we investigate a novel framework of FEEL, namely semi-decentralized\nfederated edge learning (SD-FEEL). By allowing model aggregation across\ndifferent edge clusters, SD-FEEL enjoys the benefit of FEEL in reducing the\ntraining latency, while improving the learning performance by accessing richer\ntraining data from multiple edge clusters. A training algorithm for SD-FEEL\nwith three main procedures in each round is presented, including local model\nupdates, intra-cluster and inter-cluster model aggregations, which is proved to\nconverge on non-independent and identically distributed (non-IID) data. We also\ncharacterize the interplay between the network topology of the edge servers and\nthe communication overhead of inter-cluster model aggregation on the training\nperformance. Experimental results corroborate our analysis and demonstrate the\neffectiveness of SD-FEEL in achieving faster convergence than traditional\nfederated learning architectures. Besides, guidelines on choosing critical\nhyper-parameters of the training algorithm are also provided.\n","authors":["Yuchang Sun","Jiawei Shao","Yuyi Mao","Jessie Hui Wang","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2104.12678v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00908v2","updated":"2024-09-04T03:26:58Z","published":"2024-09-02T02:40:42Z","title":"EnsLoss: Stochastic Calibrated Loss Ensembles for Preventing Overfitting\n in Classification","summary":" Empirical risk minimization (ERM) with a computationally feasible surrogate\nloss is a widely accepted approach for classification. Notably, the convexity\nand calibration (CC) properties of a loss function ensure consistency of ERM in\nmaximizing accuracy, thereby offering a wide range of options for surrogate\nlosses. In this article, we propose a novel ensemble method, namely EnsLoss,\nwhich extends the ensemble learning concept to combine loss functions within\nthe ERM framework. A key feature of our method is the consideration of\npreserving the \"legitimacy\" of the combined losses, i.e., ensuring the CC\nproperties. Specifically, we first transform the CC conditions of losses into\nloss-derivatives, thereby bypassing the need for explicit loss functions and\ndirectly generating calibrated loss-derivatives. 
Therefore, inspired by\nDropout, EnsLoss enables loss ensembles through one training process with\ndoubly stochastic gradient descent (i.e., random batch samples and random\ncalibrated loss-derivatives). We theoretically establish the statistical\nconsistency of our approach and provide insights into its benefits. The\nnumerical effectiveness of EnsLoss compared to fixed loss methods is\ndemonstrated through experiments on a broad range of 14 OpenML tabular datasets\nand 46 image datasets with various deep learning architectures. The Python\nrepository and source code are available on GitHub at\nhttps://github.com/statmlben/ensloss.\n","authors":["Ben Dai"],"pdf_url":"https://arxiv.org/pdf/2409.00908v2.pdf","comment":"31 pages; 4 figures"},{"id":"http://arxiv.org/abs/2408.08998v2","updated":"2024-09-04T03:26:09Z","published":"2024-08-16T20:00:08Z","title":"A Confidence Interval for the $\\ell_2$ Expected Calibration Error","summary":" Recent advances in machine learning have significantly improved prediction\naccuracy in various applications. However, ensuring the calibration of\nprobabilistic predictions remains a significant challenge. Despite efforts to\nenhance model calibration, the rigorous statistical evaluation of model\ncalibration remains less explored. In this work, we develop confidence\nintervals for the $\\ell_2$ Expected Calibration Error (ECE). We consider\ntop-1-to-$k$ calibration, which includes both the popular notion of confidence\ncalibration and full calibration. For a debiased estimator of the ECE,\nwe show asymptotic normality, but with different convergence rates and\nasymptotic variances for calibrated and miscalibrated models. We develop\nmethods to construct asymptotically valid confidence intervals for the ECE,\naccounting for this behavior as well as non-negativity. Our theoretical\nfindings are supported through extensive experiments, showing that our methods\nproduce valid confidence intervals with shorter lengths compared to those\nobtained by resampling-based methods.\n","authors":["Yan Sun","Pratik Chaudhari","Ian J. Barnett","Edgar Dobriban"],"pdf_url":"https://arxiv.org/pdf/2408.08998v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02410v1","updated":"2024-09-04T03:25:48Z","published":"2024-09-04T03:25:48Z","title":"Adaptive Class Emergence Training: Enhancing Neural Network Stability\n and Generalization through Progressive Target Evolution","summary":" Recent advancements in artificial intelligence, particularly deep neural\nnetworks, have pushed the boundaries of what is achievable in complex tasks.\nTraditional methods for training neural networks in classification problems\noften rely on static target outputs, such as one-hot encoded vectors, which can\nlead to unstable optimization and difficulties in handling non-linearities\nwithin data. In this paper, we propose a novel training methodology that\nprogressively evolves the target outputs from a null vector to one-hot encoded\nvectors throughout the training process. This gradual transition allows the\nnetwork to adapt more smoothly to the increasing complexity of the\nclassification task, maintaining an equilibrium state that reduces the risk of\noverfitting and enhances generalization. Our approach, inspired by concepts\nfrom structural equilibrium in finite element analysis, has been validated\nthrough extensive experiments on both synthetic and real-world datasets. 
The\nresults demonstrate that our method achieves faster convergence, improved\naccuracy, and better generalization, especially in scenarios with high data\ncomplexity and noise. This progressive training framework offers a robust\nalternative to classical methods, opening new perspectives for more efficient\nand stable neural network training.\n","authors":["Jaouad Dabounou"],"pdf_url":"https://arxiv.org/pdf/2409.02410v1.pdf","comment":"15 pages, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.00025v2","updated":"2024-09-04T03:11:02Z","published":"2024-08-16T19:22:02Z","title":"A Novel Approach to Classify Power Quality Signals Using Vision\n Transformers","summary":" With the rapid integration of electronically interfaced renewable energy\nresources and loads into smart grids, there is increasing interest in power\nquality disturbances (PQD) classification to enhance the security and\nefficiency of these grids. This paper introduces a new approach to PQD\nclassification based on the Vision Transformer (ViT) model. When a PQD occurs,\nthe proposed approach first converts the power quality signal into an image and\nthen utilizes a pre-trained ViT to accurately determine the class of the PQD.\nUnlike most previous works, which were limited to a few disturbance classes or\nsmall datasets, the proposed method is trained and tested on a large dataset\nwith 17 disturbance classes. Our experimental results show that the proposed\nViT-based approach achieves PQD classification precision and recall of 98.28%\nand 97.98%, respectively, outperforming recently proposed techniques applied to\nthe same dataset.\n","authors":["Ahmad Mohammad Saber","Alaa Selim","Mohamed M. Hammad","Amr Youssef","Deepa Kundur","Ehab El-Saadany"],"pdf_url":"https://arxiv.org/pdf/2409.00025v2.pdf","comment":"IECON 2024-50th Annual Conference of the IEEE Industrial Electronics\n Society, Chicago, U.S.A, 2024, pp. 1-6"},{"id":"http://arxiv.org/abs/2409.02404v1","updated":"2024-09-04T03:06:13Z","published":"2024-09-04T03:06:13Z","title":"Learning Privacy-Preserving Student Networks via\n Discriminative-Generative Distillation","summary":" While deep models have proved successful in learning rich knowledge from\nmassive well-annotated data, they may pose a privacy leakage risk in practical\ndeployment. It is necessary to find an effective trade-off between high utility\nand strong privacy. In this work, we propose a discriminative-generative\ndistillation approach to learn privacy-preserving deep models. Our key idea is\ntaking models as bridge to distill knowledge from private data and then\ntransfer it to learn a student network via two streams. First, discriminative\nstream trains a baseline classifier on private data and an ensemble of teachers\non multiple disjoint private subsets, respectively. Then, generative stream\ntakes the classifier as a fixed discriminator and trains a generator in a\ndata-free manner. After that, the generator is used to generate massive\nsynthetic data which are further applied to train a variational autoencoder\n(VAE). 
Among these synthetic data, a few of them are fed into the teacher\nensemble to query labels via differentially private aggregation, while most of\nthem are embedded to the trained VAE for reconstructing synthetic data.\nFinally, a semi-supervised student learning is performed to simultaneously\nhandle two tasks: knowledge transfer from the teachers with distillation on few\nprivately labeled synthetic data, and knowledge enhancement with tangent-normal\nadversarial regularization on many triples of reconstructed synthetic data. In\nthis way, our approach can control query cost over private data and mitigate\naccuracy degradation in a unified manner, leading to a privacy-preserving\nstudent model. Extensive experiments and analysis clearly show the\neffectiveness of the proposed approach.\n","authors":["Shiming Ge","Bochao Liu","Pengju Wang","Yong Li","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2409.02404v1.pdf","comment":"This paper is accepted by IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2406.06479v3","updated":"2024-09-04T02:57:47Z","published":"2024-06-10T17:20:13Z","title":"Graph-Based Bidirectional Transformer Decision Threshold Adjustment\n Algorithm for Class-Imbalanced Molecular Data","summary":" Data sets with imbalanced class sizes, where one class size is much smaller\nthan that of others, occur exceedingly often in many applications, including\nthose with biological foundations, such as disease diagnosis and drug\ndiscovery. Therefore, it is extremely important to be able to identify data\nelements of classes of various sizes, as a failure to do so can result in heavy\ncosts. Nonetheless, many data classification procedures do not perform well on\nimbalanced data sets as they often fail to detect elements belonging to\nunderrepresented classes. In this work, we propose the BTDT-MBO algorithm,\nincorporating Merriman-Bence-Osher (MBO) approaches and a bidirectional\ntransformer, as well as distance correlation and decision threshold\nadjustments, for data classification tasks on highly imbalanced molecular data\nsets, where the sizes of the classes vary greatly. The proposed technique not\nonly integrates adjustments in the classification threshold for the MBO\nalgorithm in order to help deal with the class imbalance, but also uses a\nbidirectional transformer procedure based on an attention mechanism for\nself-supervised learning. In addition, the model implements distance\ncorrelation as a weight function for the similarity graph-based framework on\nwhich the adjusted MBO algorithm operates. The proposed method is validated\nusing six molecular data sets and compared to other related techniques. The\ncomputational experiments show that the proposed technique is superior to\ncompeting approaches even in the case of a high class imbalance ratio.\n","authors":["Nicole Hayes","Ekaterina Merkurjev","Guo-Wei Wei"],"pdf_url":"https://arxiv.org/pdf/2406.06479v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11783v2","updated":"2024-09-04T02:45:12Z","published":"2023-03-19T08:19:10Z","title":"CCPL: Cross-modal Contrastive Protein Learning","summary":" Effective protein representation learning is crucial for predicting protein\nfunctions. Traditional methods often pretrain protein language models on large,\nunlabeled amino acid sequences, followed by finetuning on labeled data. While\neffective, these methods underutilize the potential of protein structures,\nwhich are vital for function determination. 
Common structural representation\ntechniques rely heavily on annotated data, limiting their generalizability.\nMoreover, structural pretraining methods, similar to natural language\npretraining, can distort actual protein structures. In this work, we introduce\na novel unsupervised protein structure representation pretraining method,\ncross-modal contrastive protein learning (CCPL). CCPL leverages a robust\nprotein language model and uses unsupervised contrastive alignment to enhance\nstructure learning, incorporating self-supervised structural constraints to\nmaintain intrinsic structural information. We evaluated our model across\nvarious benchmarks, demonstrating the framework's superiority.\n","authors":["Jiangbin Zheng","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2303.11783v2.pdf","comment":"Accepted to ICPR 2024"},{"id":"http://arxiv.org/abs/2409.02392v1","updated":"2024-09-04T02:41:04Z","published":"2024-09-04T02:41:04Z","title":"Building Math Agents with Multi-Turn Iterative Preference Learning","summary":" Recent studies have shown that large language models' (LLMs) mathematical\nproblem-solving capabilities can be enhanced by integrating external tools,\nsuch as code interpreters, and employing multi-turn Chain-of-Thought (CoT)\nreasoning. While current methods focus on synthetic data generation and\nSupervised Fine-Tuning (SFT), this paper studies the complementary direct\npreference learning approach to further improve model performance. However,\nexisting direct preference learning algorithms are originally designed for the\nsingle-turn chat task, and do not fully address the complexities of multi-turn\nreasoning and external tool integration required for tool-integrated\nmathematical reasoning tasks. To fill in this gap, we introduce a multi-turn\ndirect preference learning framework, tailored for this context, that leverages\nfeedback from code interpreters and optimizes trajectory-level preferences.\nThis framework includes multi-turn DPO and multi-turn KTO as specific\nimplementations. The effectiveness of our framework is validated through\ntraining of various language models using an augmented prompt set from the\nGSM8K and MATH datasets. Our results demonstrate substantial improvements: a\nsupervised fine-tuned Gemma-1.1-it-7B model's performance increased from 77.5%\nto 83.9% on GSM8K and from 46.1% to 51.2% on MATH. Similarly, a Gemma-2-it-9B\nmodel improved from 84.1% to 86.3% on GSM8K and from 51.0% to 54.5% on MATH.\n","authors":["Wei Xiong","Chengshuai Shi","Jiaming Shen","Aviv Rosenberg","Zhen Qin","Daniele Calandriello","Misha Khalman","Rishabh Joshi","Bilal Piot","Mohammad Saleh","Chi Jin","Tong Zhang","Tianqi Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02392v1.pdf","comment":"A multi-turn direct preference learning framework for tool-integrated\n reasoning tasks"},{"id":"http://arxiv.org/abs/2409.01128v2","updated":"2024-09-04T02:40:52Z","published":"2024-09-02T10:07:24Z","title":"Diffusion-Driven Data Replay: A Novel Approach to Combat Forgetting in\n Federated Class Continual Learning","summary":" Federated Class Continual Learning (FCCL) merges the challenges of\ndistributed client learning with the need for seamless adaptation to new\nclasses without forgetting old ones. The key challenge in FCCL is catastrophic\nforgetting, an issue that has been explored to some extent in Continual\nLearning (CL). However, due to privacy preservation requirements, some\nconventional methods, such as experience replay, are not directly applicable to\nFCCL. 
Existing FCCL methods mitigate forgetting by generating historical data\nthrough federated training of GANs or data-free knowledge distillation.\nHowever, these approaches often suffer from unstable training of generators or\nlow-quality generated data, limiting their guidance for the model. To address\nthis challenge, we propose a novel method of data replay based on diffusion\nmodels. Instead of training a diffusion model, we employ a pre-trained\nconditional diffusion model to reverse-engineer each class, searching the\ncorresponding input conditions for each class within the model's input space,\nsignificantly reducing computational resources and time consumption while\nensuring effective generation. Furthermore, we enhance the classifier's domain\ngeneralization ability on generated and real data through contrastive learning,\nindirectly improving the representational capability of generated data for real\ndata. Comprehensive experiments demonstrate that our method significantly\noutperforms existing baselines. Code is available at\nhttps://github.com/jinglin-liang/DDDR.\n","authors":["Jinglin Liang","Jin Zhong","Hanlin Gu","Zhongqi Lu","Xingxing Tang","Gang Dai","Shuangping Huang","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2409.01128v2.pdf","comment":"Accepted by ECCV 2024 Oral"},{"id":"http://arxiv.org/abs/2409.02388v1","updated":"2024-09-04T02:31:53Z","published":"2024-09-04T02:31:53Z","title":"Gaussian Rate-Distortion-Perception Coding and Entropy-Constrained\n Scalar Quantization","summary":" This paper investigates the best known bounds on the quadratic Gaussian\ndistortion-rate-perception function with limited common randomness for the\nKullback-Leibler divergence-based perception measure, as well as their\ncounterparts for the squared Wasserstein-2 distance-based perception measure,\nrecently established by Xie et al. These bounds are shown to be nondegenerate\nin the sense that they cannot be deduced from each other via a refined version\nof Talagrand's transportation inequality. On the other hand, an improved lower\nbound is established when the perception measure is given by the squared\nWasserstein-2 distance. In addition, it is revealed by exploiting the\nconnection between rate-distortion-perception coding and entropy-constrained\nscalar quantization that all the aforementioned bounds are generally not tight\nin the weak perception constraint regime.\n","authors":["Li Xie","Liangyan Li","Jun Chen","Lei Yu","Zhongshan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11055v2","updated":"2024-09-04T02:24:25Z","published":"2023-05-18T15:50:33Z","title":"Small noise analysis for Tikhonov and RKHS regularizations","summary":" Regularization plays a pivotal role in ill-posed machine learning and inverse\nproblems. However, the fundamental comparative analysis of various\nregularization norms remains open. We establish a small noise analysis\nframework to assess the effects of norms in Tikhonov and RKHS regularizations,\nin the context of ill-posed linear inverse problems with Gaussian noise. This\nframework studies the convergence rates of regularized estimators in the small\nnoise limit and reveals the potential instability of the conventional\nL2-regularizer. We solve such instability by proposing an innovative class of\nadaptive fractional RKHS regularizers, which covers the L2 Tikhonov and RKHS\nregularizations by adjusting the fractional smoothness parameter. 
A surprising\ninsight is that over-smoothing via these fractional RKHSs consistently yields\noptimal convergence rates, but the optimal hyper-parameter may decay too fast\nto be selected in practice.\n","authors":["Quanjun Lang","Fei Lu"],"pdf_url":"https://arxiv.org/pdf/2305.11055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01731v2","updated":"2024-09-04T02:23:10Z","published":"2024-09-03T09:14:21Z","title":"Stacked ensemble\\-based mutagenicity prediction model using multiple\n modalities with graph attention network","summary":" Mutagenicity is a concern due to its association with genetic mutations which\ncan result in a variety of negative consequences, including the development of\ncancer. Earlier identification of mutagenic compounds in the drug development\nprocess is therefore crucial for preventing the progression of unsafe\ncandidates and reducing development costs. While computational techniques,\nespecially machine learning models have become increasingly prevalent for this\nendpoint, they rely on a single modality. In this work, we introduce a novel\nstacked ensemble based mutagenicity prediction model which incorporate multiple\nmodalities such as simplified molecular input line entry system (SMILES) and\nmolecular graph. These modalities capture diverse information about molecules\nsuch as substructural, physicochemical, geometrical and topological. To derive\nsubstructural, geometrical and physicochemical information, we use SMILES,\nwhile topological information is extracted through a graph attention network\n(GAT) via molecular graph. Our model uses a stacked ensemble of machine\nlearning classifiers to make predictions using these multiple features. We\nemploy the explainable artificial intelligence (XAI) technique SHAP (Shapley\nAdditive Explanations) to determine the significance of each classifier and the\nmost relevant features in the prediction. We demonstrate that our method\nsurpasses SOTA methods on two standard datasets across various metrics.\nNotably, we achieve an area under the curve of 95.21\\% on the Hansen benchmark\ndataset, affirming the efficacy of our method in predicting mutagenicity. We\nbelieve that this research will captivate the interest of both clinicians and\ncomputational biologists engaged in translational research.\n","authors":["Tanya Liyaqat","Tanvir Ahmad","Mohammad Kashif","Chandni Saxena"],"pdf_url":"https://arxiv.org/pdf/2409.01731v2.pdf","comment":"Submitted to a journal"},{"id":"http://arxiv.org/abs/2406.09246v2","updated":"2024-09-04T02:14:57Z","published":"2024-06-13T15:46:55Z","title":"OpenVLA: An Open-Source Vision-Language-Action Model","summary":" Large policies pretrained on a combination of Internet-scale vision-language\ndata and diverse robot demonstrations have the potential to change how we teach\nrobots new skills: rather than training new behaviors from scratch, we can\nfine-tune such vision-language-action (VLA) models to obtain robust,\ngeneralizable policies for visuomotor control. 
Yet, widespread adoption of VLAs\nfor robotics has been challenging as 1) existing VLAs are largely closed and\ninaccessible to the public, and 2) prior work fails to explore methods for\nefficiently fine-tuning VLAs for new tasks, a key component for adoption.\nAddressing these challenges, we introduce OpenVLA, a 7B-parameter open-source\nVLA trained on a diverse collection of 970k real-world robot demonstrations.\nOpenVLA builds on a Llama 2 language model combined with a visual encoder that\nfuses pretrained features from DINOv2 and SigLIP. As a product of the added\ndata diversity and new model components, OpenVLA demonstrates strong results\nfor generalist manipulation, outperforming closed models such as RT-2-X (55B)\nby 16.5% in absolute task success rate across 29 tasks and multiple robot\nembodiments, with 7x fewer parameters. We further show that we can effectively\nfine-tune OpenVLA for new settings, with especially strong generalization\nresults in multi-task environments involving multiple objects and strong\nlanguage grounding abilities, and outperform expressive from-scratch imitation\nlearning methods such as Diffusion Policy by 20.4%. We also explore compute\nefficiency; as a separate contribution, we show that OpenVLA can be fine-tuned\non consumer GPUs via modern low-rank adaptation methods and served efficiently\nvia quantization without a hit to downstream success rate. Finally, we release\nmodel checkpoints, fine-tuning notebooks, and our PyTorch codebase with\nbuilt-in support for training VLAs at scale on Open X-Embodiment datasets.\n","authors":["Moo Jin Kim","Karl Pertsch","Siddharth Karamcheti","Ted Xiao","Ashwin Balakrishna","Suraj Nair","Rafael Rafailov","Ethan Foster","Grace Lam","Pannag Sanketi","Quan Vuong","Thomas Kollar","Benjamin Burchfiel","Russ Tedrake","Dorsa Sadigh","Sergey Levine","Percy Liang","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2406.09246v2.pdf","comment":"Website: https://openvla.github.io/"},{"id":"http://arxiv.org/abs/2404.04298v2","updated":"2024-09-04T02:00:58Z","published":"2024-04-04T20:27:37Z","title":"SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated\n Responses","summary":" Can LLMs consistently improve their previous outputs for better results? For\nthis to be true, LLMs would need to be better at discriminating among\npreviously-generated alternatives, than generating initial responses. We\nexplore the validity of this hypothesis in practice. We first formulate a\nunified framework that allows us to compare the generative and discriminative\ncapability of any model on any task. In our resulting experimental analysis of\nseveral open-source and industrial LLMs, we observe that models are not\nreliably better at discriminating among previously-generated alternatives than\ngenerating initial responses. This finding challenges the notion that LLMs may\nbe able to enhance their performance only through their own judgment.\n","authors":["Dongwei Jiang","Jingyu Zhang","Orion Weller","Nathaniel Weir","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2404.04298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02374v1","updated":"2024-09-04T01:47:01Z","published":"2024-09-04T01:47:01Z","title":"Exploring Low-Dimensional Subspaces in Diffusion Models for Controllable\n Image Editing","summary":" Recently, diffusion models have emerged as a powerful class of generative\nmodels. Despite their success, there is still limited understanding of their\nsemantic spaces. 
This makes it challenging to achieve precise and disentangled\nimage generation without additional training, especially in an unsupervised\nway. In this work, we improve the understanding of their semantic spaces from\nintriguing observations: among a certain range of noise levels, (1) the learned\nposterior mean predictor (PMP) in the diffusion model is locally linear, and\n(2) the singular vectors of its Jacobian lie in low-dimensional semantic\nsubspaces. We provide a solid theoretical basis to justify the linearity and\nlow-rankness in the PMP. These insights allow us to propose an unsupervised,\nsingle-step, training-free LOw-rank COntrollable image editing (LOCO Edit)\nmethod for precise local editing in diffusion models. LOCO Edit identified\nediting directions with nice properties: homogeneity, transferability,\ncomposability, and linearity. These properties of LOCO Edit benefit greatly\nfrom the low-dimensional semantic subspace. Our method can further be extended\nto unsupervised or text-supervised editing in various text-to-image diffusion\nmodels (T-LOCO Edit). Finally, extensive empirical experiments demonstrate the\neffectiveness and efficiency of LOCO Edit. The codes will be released at\nhttps://github.com/ChicyChen/LOCO-Edit.\n","authors":["Siyi Chen","Huijie Zhang","Minzhe Guo","Yifu Lu","Peng Wang","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2409.02374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02363v1","updated":"2024-09-04T01:18:55Z","published":"2024-09-04T01:18:55Z","title":"Optimal Neural Network Approximation for High-Dimensional Continuous\n Functions","summary":" Recently, the authors of Shen Yang Zhang (JMLR, 2022) developed a neural\nnetwork with width $36d(2d + 1)$ and depth $11$, which utilizes a special\nactivation function called the elementary universal activation function, to\nachieve the super approximation property for functions in $C([a,b]^d)$. That\nis, the constructed network only requires a fixed number of neurons to\napproximate a $d$-variate continuous function on a $d$-dimensional hypercube\nwith arbitrary accuracy. Their network uses $\\mathcal{O}(d^2)$ fixed neurons.\nOne natural question to address is whether we can reduce the number of these\nneurons in such a network. By leveraging a variant of the Kolmogorov\nSuperposition Theorem, our analysis shows that there is a neural network\ngenerated by the elementary universal activation function with only $366d +365$\nfixed, intrinsic (non-repeated) neurons that attains this super approximation\nproperty. Furthermore, we present a family of continuous functions that\nrequires at least width $d$, and therefore at least $d$ intrinsic neurons, to\nachieve arbitrary accuracy in its approximation. This shows that the\nrequirement of $\\mathcal{O}(d)$ intrinsic neurons is optimal in the sense that\nit grows linearly with the input dimension $d$, unlike some approximation\nmethods where parameters may grow exponentially with $d$.\n","authors":["Ayan Maiti","Michelle Michelle","Haizhao Yang"],"pdf_url":"https://arxiv.org/pdf/2409.02363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13493v2","updated":"2024-09-04T01:00:12Z","published":"2024-08-24T06:32:30Z","title":"Thresholded Lexicographic Ordered Multiobjective Reinforcement Learning","summary":" Lexicographic multi-objective problems, which impose a lexicographic\nimportance order over the objectives, arise in many real-life scenarios.\nExisting Reinforcement Learning work directly addressing lexicographic tasks\nhas been scarce. 
The few proposed approaches were all noted to be heuristics\nwithout theoretical guarantees as the Bellman equation is not applicable to\nthem. Additionally, the practical applicability of these prior approaches also\nsuffers from various issues such as not being able to reach the goal state.\nWhile some of these issues have been known before, in this work we investigate\nfurther shortcomings, and propose fixes for improving practical performance in\nmany cases. We also present a policy optimization approach using our\nLexicographic Projection Optimization (LPO) algorithm that has the potential to\naddress these theoretical and practical concerns. Finally, we demonstrate our\nproposed algorithms on benchmark problems.\n","authors":["Alperen Tercan","Vinayak S. Prabhu"],"pdf_url":"https://arxiv.org/pdf/2408.13493v2.pdf","comment":"Full version of ECAI 2024 paper"},{"id":"http://arxiv.org/abs/2408.15221v2","updated":"2024-09-04T00:58:59Z","published":"2024-08-27T17:33:30Z","title":"LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet","summary":" Recent large language model (LLM) defenses have greatly improved models'\nability to refuse harmful queries, even when adversarially attacked. However,\nLLM defenses are primarily evaluated against automated adversarial attacks in a\nsingle turn of conversation, an insufficient threat model for real-world\nmalicious use. We demonstrate that multi-turn human jailbreaks uncover\nsignificant vulnerabilities, exceeding 70% attack success rate (ASR) on\nHarmBench against defenses that report single-digit ASRs with automated\nsingle-turn attacks. Human jailbreaks also reveal vulnerabilities in machine\nunlearning defenses, successfully recovering dual-use biosecurity knowledge\nfrom unlearned models. We compile these results into Multi-Turn Human\nJailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks.\nWe publicly release MHJ alongside a compendium of jailbreak tactics developed\nacross dozens of commercial red teaming engagements, supporting research\ntowards stronger LLM defenses.\n","authors":["Nathaniel Li","Ziwen Han","Ian Steneker","Willow Primack","Riley Goodside","Hugh Zhang","Zifan Wang","Cristina Menghini","Summer Yue"],"pdf_url":"https://arxiv.org/pdf/2408.15221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02349v1","updated":"2024-09-04T00:35:55Z","published":"2024-09-04T00:35:55Z","title":"Machine Learning Applications to Computational Plasma Physics and\n Reduced-Order Plasma Modeling: A Perspective","summary":" Machine learning (ML) provides a broad spectrum of tools and architectures\nthat enable the transformation of data from simulations and experiments into\nuseful and explainable science, thereby augmenting domain knowledge.\nFurthermore, ML-enhanced numerical modelling can revamp scientific computing\nfor real-world complex engineering systems, creating unique opportunities to\nexamine the operation of the technologies in detail and automate their\noptimization and control. In recent years, ML applications have seen\nsignificant growth across various scientific domains, particularly in fluid\nmechanics, where ML has shown great promise in enhancing computational modeling\nof fluid flows. In contrast, ML applications in numerical plasma physics\nresearch remain relatively limited in scope and extent. 
Despite this, the close\nrelationship between fluid mechanics and plasma physics presents a valuable\nopportunity to create a roadmap for transferring ML advances in fluid flow\nmodeling to computational plasma physics. This Perspective aims to outline such\na roadmap. We begin by discussing some general fundamental aspects of ML,\nincluding the various categories of ML algorithms and the different types of\nproblems that can be solved with the help of ML. With regard to each problem\ntype, we then present specific examples from the use of ML in computational\nfluid dynamics, reviewing several insightful prior efforts. We also review\nrecent ML applications in plasma physics for each problem type. The paper\ndiscusses promising future directions and development pathways for ML in plasma\nmodelling within the different application areas. Additionally, we point out\nprominent challenges that must be addressed to realize ML's full potential in\ncomputational plasma physics, including the need for cost-effective\nhigh-fidelity simulation tools for extensive data generation.\n","authors":["Farbod Faraji","Maryam Reza"],"pdf_url":"https://arxiv.org/pdf/2409.02349v1.pdf","comment":"42 pages, 20 figures"},{"id":"http://arxiv.org/abs/2409.02347v1","updated":"2024-09-04T00:24:57Z","published":"2024-09-04T00:24:57Z","title":"Understanding the Role of Functional Diversity in Weight-Ensembling with\n Ingredient Selection and Multidimensional Scaling","summary":" Weight-ensembles are formed when the parameters of multiple neural networks\nare directly averaged into a single model. They have demonstrated\ngeneralization capability in-distribution (ID) and out-of-distribution (OOD)\nwhich is not completely understood, though they are thought to successfully\nexploit functional diversity allotted by each distinct model. Given a\ncollection of models, it is also unclear which combination leads to the optimal\nweight-ensemble; the SOTA is a linear-time ``greedy\" method. We introduce two\nnovel weight-ensembling approaches to study the link between performance\ndynamics and the nature of how each method decides to apply the\nfunctionally diverse components, akin to diversity-encouragement in the\nprediction-ensemble literature. We develop a visualization tool to explain how\neach algorithm explores various domains defined via pairwise-distances to\nfurther investigate selection and algorithms' convergence. Empirical analyses\nshed perspectives which reinforce how high-diversity enhances weight-ensembling\nwhile qualifying the extent to which diversity alone improves accuracy. We also\ndemonstrate that sampling positionally distinct models can contribute just as\nmeaningfully to improvements in a weight-ensemble.\n","authors":["Alex Rojas","David Alvarez-Melis"],"pdf_url":"https://arxiv.org/pdf/2409.02347v1.pdf","comment":"Published at the ICML 2024 (Vienna, Austria) Workshop on Foundation\n Models in the Wild"},{"id":"http://arxiv.org/abs/2408.06266v4","updated":"2024-09-04T00:22:45Z","published":"2024-08-12T16:24:51Z","title":"Anchored Preference Optimization and Contrastive Revisions: Addressing\n Underspecification in Alignment","summary":" Large Language Models (LLMs) are often aligned using contrastive alignment\nobjectives and preference pair datasets. The interaction between model, paired\ndata, and objective makes alignment a complicated procedure, sometimes\nproducing subpar results. 
We study this and find that (i) preference data gives\na better learning signal when the underlying responses are contrastive, and\n(ii) alignment objectives lead to better performance when they specify more\ncontrol over the model during training. Based on these insights, we introduce\nContrastive Learning from AI Revisions (CLAIR), a data-creation method which\nleads to more contrastive preference pairs, and Anchored Preference\nOptimization (APO), a controllable and more stable alignment objective. We\nalign Llama-3-8B-Instruct using various comparable datasets and alignment\nobjectives and measure MixEval-Hard scores, which correlate highly with human\njudgments. The CLAIR preferences lead to the strongest performance out of all\ndatasets, and APO consistently outperforms less controllable objectives. Our\nbest model, trained on 32K CLAIR preferences with APO, improves\nLlama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code\nis available at https://github.com/ContextualAI/CLAIR_and_APO.\n","authors":["Karel D'Oosterlinck","Winnie Xu","Chris Develder","Thomas Demeester","Amanpreet Singh","Christopher Potts","Douwe Kiela","Shikib Mehri"],"pdf_url":"https://arxiv.org/pdf/2408.06266v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02346v1","updated":"2024-09-04T00:20:55Z","published":"2024-09-04T00:20:55Z","title":"Robust Federated Finetuning of Foundation Models via Alternating\n Minimization of LoRA","summary":" Parameter-Efficient Fine-Tuning (PEFT) has risen as an innovative training\nstrategy that updates only a select few model parameters, significantly\nlowering both computational and memory demands. PEFT also helps to decrease\ndata transfer in federated learning settings, where communication depends on\nthe size of updates. In this work, we explore the constraints of previous\nstudies that integrate a well-known PEFT method named LoRA with federated\nfine-tuning, then introduce RoLoRA, a robust federated fine-tuning framework\nthat utilizes an alternating minimization approach for LoRA, providing greater\nrobustness against decreasing fine-tuning parameters and increasing data\nheterogeneity. Our results indicate that RoLoRA not only presents the\ncommunication benefits but also substantially enhances the robustness and\neffectiveness in multiple federated fine-tuning scenarios.\n","authors":["Shuangyi Chen","Yue Ju","Hardik Dalal","Zhongwen Zhu","Ashish Khisti"],"pdf_url":"https://arxiv.org/pdf/2409.02346v1.pdf","comment":"Presented at ES-FOMO-II@ICML2024"},{"id":"http://arxiv.org/abs/2409.02343v1","updated":"2024-09-04T00:10:36Z","published":"2024-09-04T00:10:36Z","title":"NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for\n Retrieval","summary":" $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval)\nfrom pre-trained embedding models is the predominant retrieval method for text\nand images, as well as Retrieval-Augmented Generation (RAG) pipelines. In\npractice, application developers often fine-tune the embeddings to improve\ntheir accuracy on the dataset and query workload in hand. Existing approaches\neither fine-tune the pre-trained model itself or, more efficiently, but at the\ncost of accuracy, train adaptor models to transform the output of the\npre-trained model. We present NUDGE, a family of novel non-parametric embedding\nfine-tuning approaches that are significantly more accurate and efficient than\nboth sets of existing approaches. 
NUDGE directly modifies the embeddings of\ndata records to maximize the accuracy of $k$-NN retrieval. We present a\nthorough theoretical and experimental study of NUDGE's non-parametric approach.\nWe show that even though the underlying problem is NP-Hard, constrained\nvariations can be solved efficiently. These constraints additionally ensure\nthat the changes to the embeddings are modest, avoiding large distortions to\nthe semantics learned during pre-training. In experiments across five\npre-trained models and nine standard text and image retrieval datasets, NUDGE\nruns in minutes and often improves NDCG@10 by more than 10% over existing\nfine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase\nin accuracy and runs 200x and 3x faster, respectively, over fine-tuning the\npre-trained model and training adaptors.\n","authors":["Sepanta Zeighami","Zac Wellmer","Aditya Parameswaran"],"pdf_url":"https://arxiv.org/pdf/2409.02343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02342v1","updated":"2024-09-04T00:06:23Z","published":"2024-09-04T00:06:23Z","title":"Optimal sampling for least-squares approximation","summary":" Least-squares approximation is one of the most important methods for\nrecovering an unknown function from data. While in many applications the data\nis fixed, in many others there is substantial freedom to choose where to\nsample. In this paper, we review recent progress on optimal sampling for\n(weighted) least-squares approximation in arbitrary linear spaces. We introduce\nthe Christoffel function as a key quantity in the analysis of (weighted)\nleast-squares approximation from random samples, then show how it can be used\nto construct sampling strategies that possess near-optimal sample complexity:\nnamely, the number of samples scales log-linearly in $n$, the dimension of the\napproximation space. We discuss a series of variations, extensions and further\ntopics, and throughout highlight connections to approximation theory, machine\nlearning, information-based complexity and numerical linear algebra. Finally,\nmotivated by various contemporary applications, we consider a generalization of\nthe classical setting where the samples need not be pointwise samples of a\nscalar-valued function, and the approximation space need not be linear. We show\nthat even in this significantly more general setting suitable generalizations\nof the Christoffel function still determine the sample complexity. This\nprovides a unified procedure for designing improved sampling strategies for\ngeneral recovery problems. This article is largely self-contained, and intended\nto be accessible to nonspecialists.\n","authors":["Ben Adcock"],"pdf_url":"https://arxiv.org/pdf/2409.02342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02078v2","updated":"2024-09-04T00:06:20Z","published":"2023-12-04T17:41:52Z","title":"From Lab to Field: Real-World Evaluation of an AI-Driven Smart Video\n Solution to Enhance Community Safety","summary":" This article adopts and evaluates an AI-enabled Smart Video Solution (SVS)\ndesigned to enhance safety in the real world. The system integrates with\nexisting infrastructure camera networks, leveraging recent advancements in AI\nfor easy adoption. Prioritizing privacy and ethical standards, pose based data\nis used for downstream AI tasks such as anomaly detection. Cloud-based\ninfrastructure and mobile app are deployed, enabling real-time alerts within\ncommunities. 
The SVS employs innovative data representation and visualization\ntechniques, such as the Occupancy Indicator, Statistical Anomaly Detection,\nBird's Eye View, and Heatmaps, to understand pedestrian behaviors and enhance\npublic safety. Evaluation of the SVS demonstrates its capacity to convert\ncomplex computer vision outputs into actionable insights for stakeholders,\ncommunity partners, law enforcement, urban planners, and social scientists.\nThis article presents a comprehensive real-world deployment and evaluation of\nthe SVS, implemented in a community college environment across 16 cameras. The\nsystem integrates AI-driven visual processing, supported by statistical\nanalysis, database management, cloud communication, and user notifications.\nAdditionally, the article evaluates the end-to-end latency from the moment an\nAI algorithm detects anomalous behavior in real-time at the camera level to the\ntime stakeholders receive a notification. The results demonstrate the system's\nrobustness, effectively managing 16 CCTV cameras with a consistent throughput\nof 16.5 frames per second (FPS) over a 21-hour period and an average end-to-end\nlatency of 26.76 seconds between anomaly detection and alert issuance.\n","authors":["Shanle Yao","Babak Rahimi Ardabili","Armin Danesh Pazho","Ghazal Alinezhad Noghre","Christopher Neff","Lauren Bourque","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2312.02078v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02339v1","updated":"2024-09-04T00:01:15Z","published":"2024-09-04T00:01:15Z","title":"Data-driven 2D stationary quantum droplets and wave propagations in the\n amended GP equation with two potentials via deep neural networks learning","summary":" In this paper, we develop a systematic deep learning approach to solve\ntwo-dimensional (2D) stationary quantum droplets (QDs) and investigate their\nwave propagation in the 2D amended Gross-Pitaevskii equation with\nLee-Huang-Yang correction and two kinds of potentials. Firstly, we use the\ninitial-value iterative neural network (IINN) algorithm for 2D stationary\nquantum droplets of stationary equations. Then the learned stationary QDs are\nused as the initial value conditions for physics-informed neural networks\n(PINNs) to explore their evolutions in the some space-time region. Especially,\nwe consider two types of potentials, one is the 2D quadruple-well Gaussian\npotential and the other is the PT-symmetric HO-Gaussian potential, which lead\nto spontaneous symmetry breaking and the generation of multi-component QDs. The\nused deep learning method can also be applied to study wave propagations of\nother nonlinear physical models.\n","authors":["Jin Song","Zhenya Yan"],"pdf_url":"https://arxiv.org/pdf/2409.02339v1.pdf","comment":"17 pages, 12 figures (Proc. R. Soc. A, accepted for publication).\n arXiv admin note: text overlap with arXiv:2409.01124"},{"id":"http://arxiv.org/abs/2409.03129v1","updated":"2024-09-04T23:38:30Z","published":"2024-09-04T23:38:30Z","title":"Subsidy design for better social outcomes","summary":" Overcoming the impact of selfish behavior of rational players in multiagent\nsystems is a fundamental problem in game theory. Without any intervention from\na central agent, strategic users take actions in order to maximize their\npersonal utility, which can lead to extremely inefficient overall system\nperformance, often indicated by a high Price of Anarchy. Recent work (Lin et\nal. 
2021) investigated and formalized yet another undesirable behavior of\nrational agents, that of avoiding freely available information about the game\nfor selfish reasons, leading to worse social outcomes. A central planner can\nsignificantly mitigate these issues by injecting a subsidy to reduce certain\ncosts associated with the system and obtain net gains in the system\nperformance. Crucially, the planner needs to determine how to allocate this\nsubsidy effectively.\n We formally show that designing subsidies that perfectly optimize the social\ngood, in terms of minimizing the Price of Anarchy or preventing the information\navoidance behavior, is computationally hard under standard complexity theoretic\nassumptions. On the positive side, we show that we can learn provably good\nvalues of subsidy in repeated games coming from the same domain. This\ndata-driven subsidy design approach avoids solving computationally hard\nproblems for unseen games by learning over polynomially many games. We also\nshow that optimal subsidy can be learned with no-regret given an online\nsequence of games, under mild assumptions on the cost matrix. Our study focuses\non two distinct games: a Bayesian extension of the well-studied fair\ncost-sharing game, and a component maintenance game with engineering\napplications.\n","authors":["Maria-Florina Balcan","Matteo Pozzi","Dravyansh Sharma"],"pdf_url":"https://arxiv.org/pdf/2409.03129v1.pdf","comment":"30 pages, 3 figures, 5 tables"},{"id":"http://arxiv.org/abs/2310.13845v2","updated":"2024-09-04T23:17:41Z","published":"2023-10-20T22:39:07Z","title":"Spectral-Aware Augmentation for Enhanced Graph Representation Learning","summary":" Graph Contrastive Learning (GCL) has demonstrated remarkable effectiveness in\nlearning representations on graphs in recent years. To generate ideal\naugmentation views, the augmentation generation methods should preserve\nessential information while discarding less relevant details for downstream\ntasks. However, current augmentation methods usually involve random topology\ncorruption in the spatial domain, which fails to adequately address information\nspread across different frequencies in the spectral domain. Our preliminary\nstudy highlights this issue, demonstrating that spatial random perturbations\nimpact all frequency bands almost uniformly. Given that task-relevant\ninformation typically resides in specific spectral regions that vary across\ngraphs, this one-size-fits-all approach can pose challenges. We argue that\nindiscriminate spatial random perturbation might unintentionally weaken\ntask-relevant information, reducing its effectiveness.\n To tackle this challenge, we propose applying perturbations selectively,\nfocusing on information specific to different frequencies across diverse\ngraphs. In this paper, we present GASSER, a model that applies tailored\nperturbations to specific frequencies of graph structures in the spectral\ndomain, guided by spectral hints. 
Through extensive experimentation and\ntheoretical analysis, we demonstrate that the augmentation views generated by\nGASSER are adaptive, controllable, and intuitively aligned with the homophily\nratios and spectrum of graph structures.\n","authors":["Kaiqi Yang","Haoyu Han","Wei Jin","Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2310.13845v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03118v1","updated":"2024-09-04T23:02:27Z","published":"2024-09-04T23:02:27Z","title":"Generative artificial intelligence for computational chemistry: a\n roadmap to predicting emergent phenomena","summary":" The recent surge in Generative Artificial Intelligence (AI) has introduced\nexciting possibilities for computational chemistry. Generative AI methods have\nmade significant progress in sampling molecular structures across chemical\nspecies, developing force fields, and speeding up simulations. This Perspective\noffers a structured overview, beginning with the fundamental theoretical\nconcepts in both Generative AI and computational chemistry. It then covers\nwidely used Generative AI methods, including autoencoders, generative\nadversarial networks, reinforcement learning, flow models and language models,\nand highlights their selected applications in diverse areas including force\nfield development, and protein/RNA structure prediction. A key focus is on the\nchallenges these methods face before they become truly predictive, particularly\nin predicting emergent chemical phenomena. We believe that the ultimate goal of\na simulation method or theory is to predict phenomena not seen before, and that\nGenerative AI should be subject to these same standards before it is deemed\nuseful for chemistry. We suggest that to overcome these challenges, future AI\nmodels need to integrate core chemical principles, especially from statistical\nmechanics.\n","authors":["Pratyush Tiwary","Lukas Herron","Richard John","Suemin Lee","Disha Sanwal","Ruiyu Wang"],"pdf_url":"https://arxiv.org/pdf/2409.03118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03115v1","updated":"2024-09-04T22:47:33Z","published":"2024-09-04T22:47:33Z","title":"Probing self-attention in self-supervised speech models for\n cross-linguistic differences","summary":" Speech models have gained traction thanks to increase in accuracy from novel\ntransformer architectures. While this impressive increase in performance across\nautomatic speech recognition (ASR) benchmarks is noteworthy, there is still\nmuch that is unknown about the use of attention mechanisms for speech-related\ntasks. For example, while it is assumed that these models are learning\nlanguage-independent (i.e., universal) speech representations, there has not\nyet been an in-depth exploration of what it would mean for the models to be\nlanguage-independent. In the current paper, we explore this question within the\nrealm of self-attention mechanisms of one small self-supervised speech\ntransformer model (TERA). We find that even with a small model, the attention\nheads learned are diverse ranging from almost entirely diagonal to almost\nentirely global regardless of the training language. 
We highlight some notable\ndifferences in attention patterns between Turkish and English and demonstrate\nthat the models do learn important phonological information during pretraining.\nWe also present a head ablation study which shows that models across languages\nprimarily rely on diagonal heads to classify phonemes.\n","authors":["Sai Gopinath","Joselyn Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2409.03115v1.pdf","comment":"10 pages, 18 figures"},{"id":"http://arxiv.org/abs/2404.14886v2","updated":"2024-09-04T22:43:51Z","published":"2024-04-23T10:13:39Z","title":"GCEPNet: Graph Convolution-Enhanced Expectation Propagation for Massive\n MIMO Detection","summary":" Massive MIMO (multiple-input multiple-output) detection is an important topic\nin wireless communication and various machine learning based methods have been\ndeveloped recently for this task. Expectation Propagation (EP) and its variants\nare widely used for MIMO detection and have achieved the best performance.\nHowever, EP-based solvers fail to capture the correlation between unknown\nvariables, leading to a loss of information, and in addition, they are\ncomputationally expensive. In this paper, we show that the real-valued system\ncan be modeled as spectral signal convolution on graph, through which the\ncorrelation between unknown variables can be captured. Based on such analysis,\nwe propose graph convolution-enhanced expectation propagation (GCEPNet).\nGCEPNet incorporates data-dependent attention scores into Chebyshev polynomial\nfor powerful graph convolution with better generalization capacity. It enables\na better estimation of the cavity distribution for EP and empirically achieves\nthe state-of-the-art (SOTA) MIMO detection performance with much faster\ninference speed. To our knowledge, we are the first to shed light on the\nconnection between the system model and graph convolution, and the first to\ndesign the data-dependent coefficients for graph convolution.\n","authors":["Qincheng Lu","Sitao Luan","Xiao-Wen Chang"],"pdf_url":"https://arxiv.org/pdf/2404.14886v2.pdf","comment":"In IEEE GLOBECOM 2024 Conference Proceedings"},{"id":"http://arxiv.org/abs/2409.03107v1","updated":"2024-09-04T22:14:59Z","published":"2024-09-04T22:14:59Z","title":"RoboKoop: Efficient Control Conditioned Representations from Visual\n Input in Robotics using Koopman Operator","summary":" Developing agents that can perform complex control tasks from\nhigh-dimensional observations is a core ability of autonomous agents that\nrequires underlying robust task control policies and adapting the underlying\nvisual representations to the task. Most existing policies need a lot of\ntraining samples and treat this problem from the lens of two-stage learning\nwith a controller learned on top of pre-trained vision models. We approach this\nproblem from the lens of Koopman theory and learn visual representations from\nrobotic agents conditioned on specific downstream tasks in the context of\nlearning stabilizing control for the agent. We introduce a Contrastive Spectral\nKoopman Embedding network that allows us to learn efficient linearized visual\nrepresentations from the agent's visual data in a high dimensional latent space\nand utilizes reinforcement learning to perform off-policy control on top of the\nextracted representations with a linear controller. 
Our method enhances\nstability and control in gradient dynamics over time, significantly\noutperforming existing approaches by improving efficiency and accuracy in\nlearning task policies over extended horizons.\n","authors":["Hemant Kumawat","Biswadeep Chakraborty","Saibal Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2409.03107v1.pdf","comment":"Accepted to the $8^{th}$ Conference on Robot Learning (CoRL 2024)"},{"id":"http://arxiv.org/abs/2409.03103v1","updated":"2024-09-04T22:03:07Z","published":"2024-09-04T22:03:07Z","title":"Leveraging Interpretability in the Transformer to Automate the Proactive\n Scaling of Cloud Resources","summary":" Modern web services adopt cloud-native principles to leverage the advantages\nof microservices. To consistently guarantee high Quality of Service (QoS)\naccording to Service Level Agreements (SLAs), ensure satisfactory user\nexperiences, and minimize operational costs, each microservice must be\nprovisioned with the right amount of resources. However, accurately\nprovisioning microservices with adequate resources is complex and depends on\nmany factors, including workload intensity and the complex interconnections\nbetween microservices. To address this challenge, we develop a model that\ncaptures the relationship between an end-to-end latency, requests at the\nfront-end level, and resource utilization. We then use the developed model to\npredict the end-to-end latency. Our solution leverages the Temporal Fusion\nTransformer (TFT), an attention-based architecture equipped with\ninterpretability features. When the prediction results indicate SLA\nnon-compliance, we use the feature importance provided by the TFT as covariates\nin Kernel Ridge Regression (KRR), with the response variable being the desired\nlatency, to learn the parameters associated with the feature importance. These\nlearned parameters reflect the adjustments required to the features to ensure\nSLA compliance. We demonstrate the merit of our approach with a\nmicroservice-based application and provide a roadmap to deployment.\n","authors":["Amadou Ba","Pavithra Harsha","Chitra Subramanian"],"pdf_url":"https://arxiv.org/pdf/2409.03103v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.00813v2","updated":"2024-09-04T21:45:49Z","published":"2023-10-01T23:06:17Z","title":"OceanNet: A principled neural operator-based digital twin for regional\n oceans","summary":" While data-driven approaches demonstrate great potential in atmospheric\nmodeling and weather forecasting, ocean modeling poses distinct challenges due\nto complex bathymetry, land, vertical structure, and flow non-linearity. This\nstudy introduces OceanNet, a principled neural operator-based digital twin for\nocean circulation. OceanNet uses a Fourier neural operator and\npredictor-evaluate-corrector integration scheme to mitigate autoregressive\nerror growth and enhance stability over extended time scales. A spectral\nregularizer counteracts spectral bias at smaller scales. OceanNet is applied to\nthe northwest Atlantic Ocean western boundary current (the Gulf Stream),\nfocusing on the task of seasonal prediction for Loop Current eddies and the\nGulf Stream meander. Trained using historical sea surface height (SSH) data,\nOceanNet demonstrates competitive forecast skill by outperforming SSH\npredictions by an uncoupled, state-of-the-art dynamical ocean model forecast,\nreducing computation by 500,000 times. 
These accomplishments demonstrate the\npotential of physics-inspired deep neural operators as cost-effective\nalternatives to high-resolution numerical ocean models.\n","authors":["Ashesh Chattopadhyay","Michael Gray","Tianning Wu","Anna B. Lowe","Ruoying He"],"pdf_url":"https://arxiv.org/pdf/2310.00813v2.pdf","comment":"Supplementary information can be found in:\n https://drive.google.com/file/d/1NoxJLa967naJT787a5-IfZ7f_MmRuZMP/view?usp=sharing"},{"id":"http://arxiv.org/abs/2409.03077v1","updated":"2024-09-04T21:05:42Z","published":"2024-09-04T21:05:42Z","title":"Backdoor defense, learnability and obfuscation","summary":" We introduce a formal notion of defendability against backdoors using a game\nbetween an attacker and a defender. In this game, the attacker modifies a\nfunction to behave differently on a particular input known as the \"trigger\",\nwhile behaving the same almost everywhere else. The defender then attempts to\ndetect the trigger at evaluation time. If the defender succeeds with high\nenough probability, then the function class is said to be defendable. The key\nconstraint on the attacker that makes defense possible is that the attacker's\nstrategy must work for a randomly-chosen trigger.\n Our definition is simple and does not explicitly mention learning, yet we\ndemonstrate that it is closely connected to learnability. In the\ncomputationally unbounded setting, we use a voting algorithm of Hanneke et al.\n(2022) to show that defendability is essentially determined by the VC dimension\nof the function class, in much the same way as PAC learnability. In the\ncomputationally bounded setting, we use a similar argument to show that\nefficient PAC learnability implies efficient defendability, but not conversely.\nOn the other hand, we use indistinguishability obfuscation to show that the\nclass of polynomial size circuits is not efficiently defendable. Finally, we\npresent polynomial size decision trees as a natural example for which defense\nis strictly easier than learning. Thus, we identify efficient defendability as\na notable intermediate concept in between efficient learnability and\nobfuscation.\n","authors":["Paul Christiano","Jacob Hilton","Victor Lecomte","Mark Xu"],"pdf_url":"https://arxiv.org/pdf/2409.03077v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2305.18160v3","updated":"2024-09-04T20:57:26Z","published":"2023-05-29T15:41:12Z","title":"Counterpart Fairness -- Addressing Systematic between-group Differences\n in Fairness Evaluation","summary":" When using machine learning (ML) to aid decision-making, it is critical to\nensure that an algorithmic decision is fair and does not discriminate against\nspecific individuals/groups, particularly those from underprivileged\npopulations. Existing group fairness methods aim to ensure equal outcomes (such\nas loan approval rates) across groups delineated by protected variables like\nrace or gender. However, these methods overlook the intricate, inherent\ndifferences among these groups that could influence outcomes. The confounding\nfactors, which are non-protected variables but manifest systematic differences,\ncan significantly affect fairness evaluation. Therefore, we recommend a more\nrefined and comprehensive approach that accounts for both the systematic\ndifferences within groups and the multifaceted, intertwined confounding\neffects. 
We proposed a fairness metric based on counterparts (i.e., individuals\nwho are similar with respect to the task of interest) from different groups,\nwhose group identities cannot be distinguished algorithmically by exploring\nconfounding factors. We developed a propensity-score-based method for\nidentifying counterparts, avoiding the issue of comparing \"oranges\" with\n\"apples\". In addition, we introduced a counterpart-based statistical fairness\nindex, called Counterpart-Fairness (CFair), to assess the fairness of ML\nmodels. Various empirical studies were conducted to validate the effectiveness\nof CFair.\n","authors":["Yifei Wang","Zhengyang Zhou","Liqin Wang","John Laurentiev","Peter Hou","Li Zhou","Pengyu Hong"],"pdf_url":"https://arxiv.org/pdf/2305.18160v3.pdf","comment":"24 pages, 9 figures, 14 tables"},{"id":"http://arxiv.org/abs/2409.03060v1","updated":"2024-09-04T20:20:37Z","published":"2024-09-04T20:20:37Z","title":"Better Verified Explanations with Applications to Incorrectness and\n Out-of-Distribution Detection","summary":" Building on VeriX (Verified eXplainability, arXiv:2212.01051), a system for\nproducing optimal verified explanations for machine learning model outputs, we\npresent VeriX+, which significantly improves both the size and the generation\ntime of verified explanations. We introduce a bound propagation-based\nsensitivity technique to improve the size, and a binary search-based traversal\nwith confidence ranking for improving time -- the two techniques are orthogonal\nand can be used independently or together. We also show how to adapt the\nQuickXplain (Junker 2004) algorithm to our setting to provide a trade-off\nbetween size and time. Experimental evaluations on standard benchmarks\ndemonstrate significant improvements on both metrics, e.g., a size reduction of\n38% on the GTSRB dataset and a time reduction of 90% on MNIST. We also explore\napplications of our verified explanations and show that explanation size is a\nuseful proxy for both incorrectness detection and out-of-distribution\ndetection.\n","authors":["Min Wu","Xiaofu Li","Haoze Wu","Clark Barrett"],"pdf_url":"https://arxiv.org/pdf/2409.03060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03052v1","updated":"2024-09-04T19:54:40Z","published":"2024-09-04T19:54:40Z","title":"An Introduction to Centralized Training for Decentralized Execution in\n Cooperative Multi-Agent Reinforcement Learning","summary":" Multi-agent reinforcement learning (MARL) has exploded in popularity in\nrecent years. Many approaches have been developed but they can be divided into\nthree main types: centralized training and execution (CTE), centralized\ntraining for decentralized execution (CTDE), and Decentralized training and\nexecution (DTE).\n CTDE methods are the most common as they can use centralized information\nduring training but execute in a decentralized manner -- using only information\navailable to that agent during execution. CTDE is the only paradigm that\nrequires a separate training phase where any available information (e.g., other\nagent policies, underlying states) can be used. As a result, they can be more\nscalable than CTE methods, do not require communication during execution, and\ncan often perform well. CTDE fits most naturally with the cooperative case, but\ncan be potentially applied in competitive or mixed settings depending on what\ninformation is assumed to be observed.\n This text is an introduction to CTDE in cooperative MARL. 
It is meant to\nexplain the setting, basic concepts, and common methods. It does not cover all\nwork in CTDE MARL as the subarea is quite extensive. I have included work that\nI believe is important for understanding the main concepts in the subarea and\napologize to those that I have omitted.\n","authors":["Christopher Amato"],"pdf_url":"https://arxiv.org/pdf/2409.03052v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2405.06161"},{"id":"http://arxiv.org/abs/2309.04644v3","updated":"2024-09-04T19:32:34Z","published":"2023-09-09T00:05:45Z","title":"Towards Understanding Neural Collapse: The Effects of Batch\n Normalization and Weight Decay","summary":" Neural Collapse (NC) is a geometric structure recently observed at the\nterminal phase of training deep neural networks, which states that last-layer\nfeature vectors for the same class would \"collapse\" to a single point, while\nfeatures of different classes become equally separated. We demonstrate that\nbatch normalization (BN) and weight decay (WD) critically influence the\nemergence of NC. In the near-optimal loss regime, we establish an asymptotic\nlower bound on the emergence of NC that depends only on the WD value, training\nloss, and the presence of last-layer BN. Our experiments substantiate\ntheoretical insights by showing that models demonstrate a stronger presence of\nNC with BN, appropriate WD values, lower loss, and lower last-layer feature\nnorm. Our findings offer a novel perspective in studying the role of BN and WD\nin shaping neural network features.\n","authors":["Leyan Pan","Xinyuan Cao"],"pdf_url":"https://arxiv.org/pdf/2309.04644v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03043v1","updated":"2024-09-04T19:27:56Z","published":"2024-09-04T19:27:56Z","title":"Can Your Generative Model Detect Out-of-Distribution Covariate Shift?","summary":" Detecting Out-of-Distribution~(OOD) sensory data and covariate distribution\nshift aims to identify new test examples with different high-level image\nstatistics to the captured, normal and In-Distribution (ID) set. Existing OOD\ndetection literature largely focuses on semantic shift with little-to-no\nconsensus over covariate shift. Generative models capture the ID data in an\nunsupervised manner, enabling them to effectively identify samples that deviate\nsignificantly from this learned distribution, irrespective of the downstream\ntask. In this work, we elucidate the ability of generative models to detect and\nquantify domain-specific covariate shift through extensive analyses that\ninvolves a variety of models. To this end, we conjecture that it is sufficient\nto detect most occurring sensory faults (anomalies and deviations in global\nsignals statistics) by solely modeling high-frequency signal-dependent and\nindependent details. We propose a novel method, CovariateFlow, for OOD\ndetection, specifically tailored to covariate heteroscedastic high-frequency\nimage-components using conditional Normalizing Flows (cNFs). Our results on\nCIFAR10 vs. CIFAR10-C and ImageNet200 vs. ImageNet200-C demonstrate the\neffectiveness of the method by accurately detecting OOD covariate shift. 
This\nwork contributes to enhancing the fidelity of imaging systems and aiding\nmachine learning models in OOD detection in the presence of covariate shift.\n","authors":["Christiaan Viviers","Amaan Valiuddin","Francisco Caetano","Lemar Abdi","Lena Filatova","Peter de With","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2409.03043v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2402.01695v3","updated":"2024-09-04T19:13:06Z","published":"2024-01-24T03:11:36Z","title":"Language-Guided World Models: A Model-Based Approach to AI Control","summary":" This paper introduces the concept of Language-Guided World Models (LWMs) --\nprobabilistic models that can simulate environments by reading texts. Agents\nequipped with these models provide humans with more extensive and efficient\ncontrol, allowing them to simultaneously alter agent behaviors in multiple\ntasks via natural verbal communication. In this work, we take initial steps in\ndeveloping robust LWMs that can generalize to compositionally novel language\ndescriptions. We design a challenging world modeling benchmark based on the\ngame of MESSENGER (Hanjie et al., 2021), featuring evaluation settings that\nrequire varying degrees of compositional generalization. Our experiments reveal\nthe lack of generalizability of the state-of-the-art Transformer model, as it\noffers marginal improvements in simulation quality over a no-text baseline. We\ndevise a more robust model by fusing the Transformer with the EMMA attention\nmechanism (Hanjie et al., 2021). Our model substantially outperforms the\nTransformer and approaches the performance of a model with an oracle semantic\nparsing and grounding capability. To demonstrate the practicality of this model\nin improving AI safety and transparency, we simulate a scenario in which the\nmodel enables an agent to present plans to a human before execution, and to\nrevise plans based on their language feedback.\n","authors":["Alex Zhang","Khanh Nguyen","Jens Tuyls","Albert Lin","Karthik Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2402.01695v3.pdf","comment":"SpLU-RoboNLP workshop at ACL 2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.02889v1","updated":"2024-09-04T17:25:21Z","published":"2024-09-04T17:25:21Z","title":"LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via\n Hybrid Architecture","summary":" Expanding the long-context capabilities of Multi-modal Large Language\nModels~(MLLMs) is crucial for video understanding, high-resolution image\nunderstanding, and multi-modal agents. This involves a series of systematic\noptimizations, including model architecture, data construction and training\nstrategy, particularly addressing challenges such as \\textit{degraded\nperformance with more images} and \\textit{high computational costs}. In this\npaper, we adapt the model architecture to a hybrid of Mamba and Transformer\nblocks, approach data construction with both temporal and spatial dependencies\namong multiple images and employ a progressive training strategy. The released\nmodel \\textbf{LongLLaVA}~(\\textbf{Long}-Context \\textbf{L}arge\n\\textbf{L}anguage \\textbf{a}nd \\textbf{V}ision \\textbf{A}ssistant) is the first\nhybrid MLLM, which achieved a better balance between efficiency and\neffectiveness. 
LongLLaVA not only achieves competitive results across various\nbenchmarks, but also maintains high throughput and low memory consumption.\nEspecially, it could process nearly a thousand images on a single A100 80GB\nGPU, showing promising application prospects for a wide range of tasks.\n","authors":["Xidong Wang","Dingjie Song","Shunian Chen","Chen Zhang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02889v1.pdf","comment":"19 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2409.02845v1","updated":"2024-09-04T16:17:41Z","published":"2024-09-04T16:17:41Z","title":"Multi-Track MusicLDM: Towards Versatile Music Generation with Latent\n Diffusion Model","summary":" Diffusion models have shown promising results in cross-modal generation tasks\ninvolving audio and music, such as text-to-sound and text-to-music generation.\nThese text-controlled music generation models typically focus on generating\nmusic by capturing global musical attributes like genre and mood. However,\nmusic composition is a complex, multilayered task that often involves musical\narrangement as an integral part of the process. This process involves composing\neach instrument to align with existing ones in terms of beat, dynamics,\nharmony, and melody, requiring greater precision and control over tracks than\ntext prompts usually provide. In this work, we address these challenges by\nextending the MusicLDM, a latent diffusion model for music, into a multi-track\ngenerative model. By learning the joint probability of tracks sharing a\ncontext, our model is capable of generating music across several tracks that\ncorrespond well to each other, either conditionally or unconditionally.\nAdditionally, our model is capable of arrangement generation, where the model\ncan generate any subset of tracks given the others (e.g., generating a piano\ntrack complementing given bass and drum tracks). We compared our model with an\nexisting multi-track generative model and demonstrated that our model achieves\nconsiderable improvements across objective metrics for both total and\narrangement generation tasks.\n","authors":["Tornike Karchkhadze","Mohammad Rasool Izadi","Ke Chen","Gerard Assayag","Shlomo Dubnov"],"pdf_url":"https://arxiv.org/pdf/2409.02845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02828v1","updated":"2024-09-04T15:50:16Z","published":"2024-09-04T15:50:16Z","title":"ExpLLM: Towards Chain of Thought for Facial Expression Recognition","summary":" Facial expression recognition (FER) is a critical task in multimedia with\nsignificant implications across various domains. However, analyzing the causes\nof facial expressions is essential for accurately recognizing them. Current\napproaches, such as those based on facial action units (AUs), typically provide\nAU names and intensities but lack insight into the interactions and\nrelationships between AUs and the overall expression. In this paper, we propose\na novel method called ExpLLM, which leverages large language models to generate\nan accurate chain of thought (CoT) for facial expression recognition.\nSpecifically, we have designed the CoT mechanism from three key perspectives:\nkey observations, overall emotional interpretation, and conclusion. The key\nobservations describe the AU's name, intensity, and associated emotions. 
The\noverall emotional interpretation provides an analysis based on multiple AUs and\ntheir interactions, identifying the dominant emotions and their relationships.\nFinally, the conclusion presents the final expression label derived from the\npreceding analysis. Furthermore, we also introduce the Exp-CoT Engine, designed\nto construct this expression CoT and generate instruction-description data for\ntraining our ExpLLM. Extensive experiments on the RAF-DB and AffectNet datasets\ndemonstrate that ExpLLM outperforms current state-of-the-art FER methods.\nExpLLM also surpasses the latest GPT-4o in expression CoT generation,\nparticularly in recognizing micro-expressions where GPT-4o frequently fails.\n","authors":["Xing Lan","Jian Xue","Ji Qi","Dongmei Jiang","Ke Lu","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2409.02828v1.pdf","comment":"project page: https://starhiking.github.io/ExpLLM_Page/"},{"id":"http://arxiv.org/abs/2409.02657v1","updated":"2024-09-04T12:30:25Z","published":"2024-09-04T12:30:25Z","title":"PoseTalk: Text-and-Audio-based Pose Control and Motion Refinement for\n One-Shot Talking Head Generation","summary":" While previous audio-driven talking head generation (THG) methods generate\nhead poses from driving audio, the generated poses or lips cannot match the\naudio well or are not editable. In this study, we propose \\textbf{PoseTalk}, a\nTHG system that can freely generate lip-synchronized talking head videos with\nfree head poses conditioned on text prompts and audio. The core insight of our\nmethod is using head pose to connect visual, linguistic, and audio signals.\nFirst, we propose to generate poses from both audio and text prompts, where the\naudio offers short-term variations and rhythm correspondence of the head\nmovements and the text prompts describe the long-term semantics of head\nmotions. To achieve this goal, we devise a Pose Latent Diffusion (PLD) model to\ngenerate motion latent from text prompts and audio cues in a pose latent space.\nSecond, we observe a loss-imbalance problem: the loss for the lip region\ncontributes less than 4\\% of the total reconstruction loss caused by both pose\nand lip, making optimization lean towards head movements rather than lip\nshapes. To address this issue, we propose a refinement-based learning strategy\nto synthesize natural talking videos using two cascaded networks, i.e.,\nCoarseNet, and RefineNet. The CoarseNet estimates coarse motions to produce\nanimated images in novel poses and the RefineNet focuses on learning finer lip\nmotions by progressively estimating lip motions from low-to-high resolutions,\nyielding improved lip-synchronization performance. Experiments demonstrate our\npose prediction strategy achieves better pose diversity and realness compared\nto text-only or audio-only, and our video generator model outperforms\nstate-of-the-art methods in synthesizing talking videos with natural head\nmotions. Project: https://junleen.github.io/projects/posetalk.\n","authors":["Jun Ling","Yiwen Wang","Han Xue","Rong Xie","Li Song"],"pdf_url":"https://arxiv.org/pdf/2409.02657v1.pdf","comment":"7+5 pages, 15 figures"},{"id":"http://arxiv.org/abs/2409.02555v1","updated":"2024-09-04T09:21:13Z","published":"2024-09-04T09:21:13Z","title":"Low-Resolution Object Recognition with Cross-Resolution Relational\n Contrastive Distillation","summary":" Recognizing objects in low-resolution images is a challenging task due to the\nlack of informative details. 
Recent studies have shown that knowledge\ndistillation approaches can effectively transfer knowledge from a\nhigh-resolution teacher model to a low-resolution student model by aligning\ncross-resolution representations. However, these approaches still face\nlimitations in adapting to the situation where the recognized objects exhibit\nsignificant representation discrepancies between training and testing images.\nIn this study, we propose a cross-resolution relational contrastive\ndistillation approach to facilitate low-resolution object recognition. Our\napproach enables the student model to mimic the behavior of a well-trained\nteacher model which delivers high accuracy in identifying high-resolution\nobjects. To extract sufficient knowledge, the student learning is supervised\nwith contrastive relational distillation loss, which preserves the similarities\nin various relational structures in contrastive representation space. In this\nmanner, the capability of recovering missing details of familiar low-resolution\nobjects can be effectively enhanced, leading to a better knowledge transfer.\nExtensive experiments on low-resolution object classification and\nlow-resolution face recognition clearly demonstrate the effectiveness and\nadaptability of our approach.\n","authors":["Kangkai Zhang","Shiming Ge","Ruixin Shi","Dan Zeng"],"pdf_url":"https://arxiv.org/pdf/2409.02555v1.pdf","comment":"This paper is accepted by IEEE Transactions on Circuits and Systems\n for Video Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2409.02453v1","updated":"2024-09-04T05:19:57Z","published":"2024-09-04T05:19:57Z","title":"FrameCorr: Adaptive, Autoencoder-based Neural Compression for Video\n Reconstruction in Resource and Timing Constrained Network Settings","summary":" Despite the growing adoption of video processing via Internet of Things (IoT)\ndevices due to their cost-effectiveness, transmitting captured data to nearby\nservers poses challenges due to varying timing constraints and scarcity of\nnetwork bandwidth. Existing video compression methods face difficulties in\nrecovering compressed data when incomplete data is provided. Here, we introduce\n\\emph{\\project}, a deep-learning based solution that utilizes previously\nreceived data to predict the missing segments of a frame, enabling the\nreconstruction of a frame from partially received data.\n","authors":["John Li","Shehab Sarar Ahmed","Deepak Nair"],"pdf_url":"https://arxiv.org/pdf/2409.02453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15461v2","updated":"2024-09-04T02:45:56Z","published":"2024-08-28T00:54:51Z","title":"Hand1000: Generating Realistic Hands from Text with Only 1,000 Images","summary":" Text-to-image generation models have achieved remarkable advancements in\nrecent years, aiming to produce realistic images from textual descriptions.\nHowever, these models often struggle with generating anatomically accurate\nrepresentations of human hands. The resulting images frequently exhibit issues\nsuch as incorrect numbers of fingers, unnatural twisting or interlacing of\nfingers, or blurred and indistinct hands. These issues stem from the inherent\ncomplexity of hand structures and the difficulty in aligning textual\ndescriptions with precise visual depictions of hands. To address these\nchallenges, we propose a novel approach named Hand1000 that enables the\ngeneration of realistic hand images with target gesture using only 1,000\ntraining samples. 
The training of Hand1000 is divided into three stages with\nthe first stage aiming to enhance the model's understanding of hand anatomy by\nusing a pre-trained hand gesture recognition model to extract gesture\nrepresentation. The second stage further optimizes text embedding by\nincorporating the extracted hand gesture representation, to improve alignment\nbetween the textual descriptions and the generated hand images. The third stage\nutilizes the optimized embedding to fine-tune the Stable Diffusion model to\ngenerate realistic hand images. In addition, we construct the first publicly\navailable dataset specifically designed for text-to-hand image generation.\nBased on the existing hand gesture recognition dataset, we adopt advanced image\ncaptioning models and LLaMA3 to generate high-quality textual descriptions\nenriched with detailed gesture information. Extensive experiments demonstrate\nthat Hand1000 significantly outperforms existing models in producing\nanatomically correct hand images while faithfully representing other details in\nthe text, such as faces, clothing, and colors.\n","authors":["Haozhuo Zhang","Bin Zhu","Yu Cao","Yanbin Hao"],"pdf_url":"https://arxiv.org/pdf/2408.15461v2.pdf","comment":"Project page https://haozhuo-zhang.github.io/Hand1000-project-page/"},{"id":"http://arxiv.org/abs/2409.02376v1","updated":"2024-09-04T01:54:20Z","published":"2024-09-04T01:54:20Z","title":"Coral Model Generation from Single Images for Virtual Reality\n Applications","summary":" With the rapid development of VR technology, the demand for high-quality 3D\nmodels is increasing. Traditional methods struggle with efficiency and quality\nin large-scale customization. This paper introduces a deep-learning framework\nthat generates high-precision 3D coral models from a single image. Using the\nCoral dataset, the framework extracts geometric and texture features, performs\n3D reconstruction, and optimizes design and material blending. Advanced\noptimization and polygon count control ensure shape accuracy, detail retention,\nand flexible output for various complexities, catering to high-quality\nrendering and real-time interaction needs.The project incorporates Explainable\nAI (XAI) to transform AI-generated models into interactive \"artworks,\" best\nviewed in VR and XR. This enhances model interpretability and human-machine\ncollaboration. Real-time feedback in VR interactions displays information like\ncoral species and habitat, enriching user experience. The generated models\nsurpass traditional methods in detail, visual quality, and efficiency. This\nresearch offers an intelligent approach to 3D content creation for VR, lowering\nproduction barriers, and promoting widespread VR applications. Additionally,\nintegrating XAI provides new insights into AI-generated visual content and\nadvances research in 3D vision interpretability.\n","authors":["Jie Fu","Shun Fu","Mick Grierson"],"pdf_url":"https://arxiv.org/pdf/2409.02376v1.pdf","comment":"In Proceedings of Explainable AI for the Arts Workshop 2024 (XAIxArts\n 2024) arXiv:2406.14485"},{"id":"http://arxiv.org/abs/2408.11593v3","updated":"2024-09-04T01:25:55Z","published":"2024-08-21T12:59:42Z","title":"MCDubber: Multimodal Context-Aware Expressive Video Dubbing","summary":" Automatic Video Dubbing (AVD) aims to take the given script and generate\nspeech that aligns with lip motion and prosody expressiveness. Current AVD\nmodels mainly utilize visual information of the current sentence to enhance the\nprosody of synthesized speech. 
However, it is crucial to consider whether the\nprosody of the generated dubbing aligns with the multimodal context, as the\ndubbing will be combined with the original context in the final video. This\naspect has been overlooked in previous studies. To address this issue, we\npropose a Multimodal Context-aware video Dubbing model, termed\n\\textbf{MCDubber}, to convert the modeling object from a single sentence to a\nlonger sequence with context information to ensure the consistency of the\nglobal context prosody. MCDubber comprises three main components: (1) A context\nduration aligner aims to learn the context-aware alignment between the text and\nlip frames; (2) A context prosody predictor seeks to read the global context\nvisual sequence and predict the context-aware global energy and pitch; (3) A\ncontext acoustic decoder ultimately predicts the global context mel-spectrogram\nwith the assistance of adjacent ground-truth mel-spectrograms of the target\nsentence. Through this process, MCDubber fully considers the influence of\nmultimodal context on the prosody expressiveness of the current sentence when\ndubbing. The extracted mel-spectrogram belonging to the target sentence from\nthe output context mel-spectrograms is the final required dubbing audio.\nExtensive experiments on the Chem benchmark dataset demonstrate that our\nMCDubber significantly improves dubbing expressiveness compared to all advanced\nbaselines. The code and demos are available at\nhttps://github.com/XiaoYuanJun-zy/MCDubber.\n","authors":["Yuan Zhao","Zhenqi Jia","Rui Liu","De Hu","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2408.11593v3.pdf","comment":"Accepted by NCMMSC2024"}]},"2024-09-05T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.03757v1","updated":"2024-09-05T17:59:56Z","published":"2024-09-05T17:59:56Z","title":"Lexicon3D: Probing Visual Foundation Models for Complex 3D Scene\n Understanding","summary":" Complex 3D scene understanding has gained increasing attention, with scene\nencoding strategies playing a crucial role in this success. However, the\noptimal scene encoding strategies for various scenarios remain unclear,\nparticularly compared to their image-based counterparts. To address this issue,\nwe present a comprehensive study that probes various visual encoding models for\n3D scene understanding, identifying the strengths and limitations of each model\nacross different scenarios. Our evaluation spans seven vision foundation\nencoders, including image-based, video-based, and 3D foundation models. We\nevaluate these models in four tasks: Vision-Language Scene Reasoning, Visual\nGrounding, Segmentation, and Registration, each focusing on different aspects\nof scene understanding. Our evaluations yield key findings: DINOv2 demonstrates\nsuperior performance, video models excel in object-level tasks, diffusion\nmodels benefit geometric tasks, and language-pretrained models show unexpected\nlimitations in language-related tasks. 
These insights challenge some\nconventional understandings, provide novel perspectives on leveraging visual\nfoundation models, and highlight the need for more flexible encoder selection\nin future vision-language and scene-understanding tasks.\n","authors":["Yunze Man","Shuhong Zheng","Zhipeng Bao","Martial Hebert","Liang-Yan Gui","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2409.03757v1.pdf","comment":"Project page: https://yunzeman.github.io/lexicon3d , Github:\n https://github.com/YunzeMan/Lexicon3D"},{"id":"http://arxiv.org/abs/2409.03753v1","updated":"2024-09-05T17:59:15Z","published":"2024-09-05T17:59:15Z","title":"WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild","summary":" The increasing availability of real-world conversation data offers exciting\nopportunities for researchers to study user-chatbot interactions. However, the\nsheer volume of this data makes manually examining individual conversations\nimpractical. To overcome this challenge, we introduce WildVis, an interactive\ntool that enables fast, versatile, and large-scale conversation analysis.\nWildVis provides search and visualization capabilities in the text and\nembedding spaces based on a list of criteria. To manage million-scale datasets,\nwe implemented optimizations including search index construction, embedding\nprecomputation and compression, and caching to ensure responsive user\ninteractions within seconds. We demonstrate WildVis's utility through three\ncase studies: facilitating chatbot misuse research, visualizing and comparing\ntopic distributions across datasets, and characterizing user-specific\nconversation patterns. WildVis is open-source and designed to be extendable,\nsupporting additional datasets and customized search and visualization\nfunctionalities.\n","authors":["Yuntian Deng","Wenting Zhao","Jack Hessel","Xiang Ren","Claire Cardie","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2409.03753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03752v1","updated":"2024-09-05T17:59:12Z","published":"2024-09-05T17:59:12Z","title":"Attention Heads of Large Language Models: A Survey","summary":" Since the advent of ChatGPT, Large Language Models (LLMs) have excelled in\nvarious tasks but remain largely as black-box systems. Consequently, their\ndevelopment relies heavily on data-driven approaches, limiting performance\nenhancement through changes in internal architecture and reasoning pathways. As\na result, many researchers have begun exploring the potential internal\nmechanisms of LLMs, aiming to identify the essence of their reasoning\nbottlenecks, with most studies focusing on attention heads. Our survey aims to\nshed light on the internal reasoning processes of LLMs by concentrating on the\ninterpretability and underlying mechanisms of attention heads. We first distill\nthe human thought process into a four-stage framework: Knowledge Recalling,\nIn-Context Identification, Latent Reasoning, and Expression Preparation. Using\nthis framework, we systematically review existing research to identify and\ncategorize the functions of specific attention heads. Furthermore, we summarize\nthe experimental methodologies used to discover these special heads, dividing\nthem into two categories: Modeling-Free methods and Modeling-Required methods.\nAlso, we outline relevant evaluation methods and benchmarks. Finally, we\ndiscuss the limitations of current research and propose several potential\nfuture directions. 
Our reference list is open-sourced at\n\\url{https://github.com/IAAR-Shanghai/Awesome-Attention-Heads}.\n","authors":["Zifan Zheng","Yezhaohui Wang","Yuxin Huang","Shichao Song","Bo Tang","Feiyu Xiong","Zhiyu Li"],"pdf_url":"https://arxiv.org/pdf/2409.03752v1.pdf","comment":"20 pages, 11 figures, 4 tables"},{"id":"http://arxiv.org/abs/2305.07893v3","updated":"2024-09-05T17:45:57Z","published":"2023-05-13T11:02:50Z","title":"PESTS: Persian_English Cross Lingual Corpus for Semantic Textual\n Similarity","summary":" One of the components of natural language processing that has received a lot\nof investigation recently is semantic textual similarity. In computational\nlinguistics and natural language processing, assessing the semantic similarity\nof words, phrases, paragraphs, and texts is crucial. Calculating the degree of\nsemantic resemblance between two textual pieces, paragraphs, or phrases\nprovided in both monolingual and cross-lingual versions is known as semantic\nsimilarity. Cross lingual semantic similarity requires corpora in which there\nare sentence pairs in both the source and target languages with a degree of\nsemantic similarity between them. Many existing cross lingual semantic\nsimilarity models use a machine translation due to the unavailability of cross\nlingual semantic similarity dataset, which the propagation of the machine\ntranslation error reduces the accuracy of the model. On the other hand, when we\nwant to use semantic similarity features for machine translation the same\nmachine translations should not be used for semantic similarity. For Persian,\nwhich is one of the low resource languages, no effort has been made in this\nregard and the need for a model that can understand the context of two\nlanguages is felt more than ever. In this article, the corpus of semantic\ntextual similarity between sentences in Persian and English languages has been\nproduced for the first time by using linguistic experts. We named this dataset\nPESTS (Persian English Semantic Textual Similarity). This corpus contains 5375\nsentence pairs. Also, different models based on transformers have been\nfine-tuned using this dataset. The results show that using the PESTS dataset,\nthe Pearson correlation of the XLM ROBERTa model increases from 85.87% to\n95.62%.\n","authors":["Mohammad Abdous","Poorya Piroozfar","Behrouz Minaei Bidgoli"],"pdf_url":"https://arxiv.org/pdf/2305.07893v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03733v1","updated":"2024-09-05T17:44:49Z","published":"2024-09-05T17:44:49Z","title":"Planning In Natural Language Improves LLM Search For Code Generation","summary":" While scaling training compute has led to remarkable improvements in large\nlanguage models (LLMs), scaling inference compute has not yet yielded analogous\ngains. We hypothesize that a core missing component is a lack of diverse LLM\noutputs, leading to inefficient search due to models repeatedly sampling highly\nsimilar, yet incorrect generations. We empirically demonstrate that this lack\nof diversity can be mitigated by searching over candidate plans for solving a\nproblem in natural language. Based on this insight, we propose PLANSEARCH, a\nnovel search algorithm which shows strong results across HumanEval+, MBPP+, and\nLiveCodeBench (a contamination-free benchmark for competitive coding).\nPLANSEARCH generates a diverse set of observations about the problem and then\nuses these observations to construct plans for solving the problem. 
By\nsearching over plans in natural language rather than directly over code\nsolutions, PLANSEARCH explores a significantly more diverse range of potential\nsolutions compared to baseline search methods. Using PLANSEARCH on top of\nClaude 3.5 Sonnet achieves a state-of-the-art pass@200 of 77.0% on\nLiveCodeBench, outperforming both the best score achieved without search\n(pass@1 = 41.4%) and using standard repeated sampling (pass@200 = 60.6%).\nFinally, we show that, across all models, search algorithms, and benchmarks\nanalyzed, we can accurately predict performance gains due to search as a direct\nfunction of the diversity over generated ideas.\n","authors":["Evan Wang","Federico Cassano","Catherine Wu","Yunfeng Bai","Will Song","Vaskar Nath","Ziwen Han","Sean Hendryx","Summer Yue","Hugh Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06477v3","updated":"2024-09-05T17:25:01Z","published":"2024-01-12T09:56:57Z","title":"Kun: Answer Polishment for Chinese Self-Alignment with Instruction\n Back-Translation","summary":" In this paper, we introduce Kun, a novel approach for creating high-quality\ninstruction-tuning datasets for large language models (LLMs) without relying on\nmanual annotations. Adapting a self-training algorithm based on instruction\nback-translation and answer polishment, Kun leverages unlabelled data from\ndiverse sources such as Wudao, Wanjuan, and SkyPile to generate a substantial\ndataset of over a million Chinese instructional data points. This approach\nsignificantly deviates from traditional methods by using a self-curation\nprocess to refine and select the most effective instruction-output pairs. Our\nexperiments with the 6B-parameter Yi model across various benchmarks\ndemonstrate Kun's robustness and scalability. Our method's core contributions\nlie in its algorithmic advancement, which enhances data retention and clarity,\nand its innovative data generation approach that substantially reduces the\nreliance on costly and time-consuming manual annotations. This methodology\npresents a scalable and efficient solution for improving the\ninstruction-following capabilities of LLMs, with significant implications for\ntheir application across diverse fields. The code and dataset can be found at\nhttps://github.com/Zheng0428/COIG-Kun\n","authors":["Tianyu Zheng","Shuyue Guo","Xingwei Qu","Jiawei Guo","Xinrun Du","Qi Jia","Chenghua Lin","Wenhao Huang","Jie Fu","Ge Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.06477v3.pdf","comment":"12 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.03708v1","updated":"2024-09-05T17:14:23Z","published":"2024-09-05T17:14:23Z","title":"RAG based Question-Answering for Contextual Response Prediction System","summary":" Large Language Models (LLMs) have shown versatility in various Natural\nLanguage Processing (NLP) tasks, including their potential as effective\nquestion-answering systems. However, to provide precise and relevant\ninformation in response to specific customer queries in industry settings, LLMs\nrequire access to a comprehensive knowledge base to avoid hallucinations.\nRetrieval Augmented Generation (RAG) emerges as a promising technique to\naddress this challenge. Yet, developing an accurate question-answering\nframework for real-world applications using RAG entails several challenges: 1)\ndata availability issues, 2) evaluating the quality of generated content, and\n3) the costly nature of human evaluation. 
In this paper, we introduce an\nend-to-end framework that employs LLMs with RAG capabilities for industry use\ncases. Given a customer query, the proposed system retrieves relevant knowledge\ndocuments and leverages them, along with previous chat history, to generate\nresponse suggestions for customer service agents in the contact centers of a\nmajor retail company. Through comprehensive automated and human evaluations, we\nshow that this solution outperforms the current BERT-based algorithms in\naccuracy and relevance. Our findings suggest that RAG-based LLMs can be an\nexcellent support to human customer service representatives by lightening their\nworkload.\n","authors":["Sriram Veturi","Saurabh Vaichal","Nafis Irtiza Tripto","Reshma Lal Jagadheesh","Nian Yan"],"pdf_url":"https://arxiv.org/pdf/2409.03708v1.pdf","comment":"Accepted at the 1st Workshop on GenAI and RAG Systems for Enterprise,\n CIKM'24. 6 pages"},{"id":"http://arxiv.org/abs/2409.03707v1","updated":"2024-09-05T17:13:38Z","published":"2024-09-05T17:13:38Z","title":"A Different Level Text Protection Mechanism With Differential Privacy","summary":" The article introduces a method for extracting words of different degrees of\nimportance based on the BERT pre-training model and proves the effectiveness of\nthis method. The article also discusses the impact of maintaining the same\nperturbation results for words of different importance on the overall text\nutility. This method can be applied to long text protection.\n","authors":["Qingwen Fu"],"pdf_url":"https://arxiv.org/pdf/2409.03707v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03701v1","updated":"2024-09-05T16:57:39Z","published":"2024-09-05T16:57:39Z","title":"LAST: Language Model Aware Speech Tokenization","summary":" Speech tokenization serves as the foundation of speech language model (LM),\nenabling them to perform various tasks such as spoken language modeling,\ntext-to-speech, speech-to-text, etc. Most speech tokenizers are trained\nindependently of the LM training process, relying on separate acoustic models\nand quantization methods. Following such an approach may create a mismatch\nbetween the tokenization process and its usage afterward. In this study, we\npropose a novel approach to training a speech tokenizer by leveraging\nobjectives from pre-trained textual LMs. We advocate for the integration of\nthis objective into the process of learning discrete speech representations.\nOur aim is to transform features from a pre-trained speech model into a new\nfeature space that enables better clustering for speech LMs. We empirically\ninvestigate the impact of various model design choices, including speech\nvocabulary size and text LM size. Our results demonstrate the proposed\ntokenization method outperforms the evaluated baselines considering both spoken\nlanguage modeling and speech-to-text. 
More importantly, unlike prior work, the\nproposed method allows the utilization of a single pre-trained LM for\nprocessing both speech and text inputs, setting it apart from conventional\ntokenization approaches.\n","authors":["Arnon Turetzky","Yossi Adi"],"pdf_url":"https://arxiv.org/pdf/2409.03701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16607v3","updated":"2024-09-05T16:39:44Z","published":"2024-07-23T16:13:22Z","title":"Data Mixture Inference: What do BPE Tokenizers Reveal about their\n Training Data?","summary":" The pretraining data of today's strongest language models is opaque; in\nparticular, little is known about the proportions of various domains or\nlanguages represented. In this work, we tackle a task which we call data\nmixture inference, which aims to uncover the distributional make-up of training\ndata. We introduce a novel attack based on a previously overlooked source of\ninformation: byte-pair encoding (BPE) tokenizers, used by the vast majority of\nmodern language models. Our key insight is that the ordered list of merge rules\nlearned by a BPE tokenizer naturally reveals information about the token\nfrequencies in its training data. Given a tokenizer's merge list along with\nexample data for each category of interest, we formulate a linear program that\nsolves for the proportion of each category in the tokenizer's training set. In\ncontrolled experiments, we show that our attack recovers mixture ratios with\nhigh precision for tokenizers trained on known mixtures of natural languages,\nprogramming languages, and data sources. We then apply our approach to\noff-the-shelf tokenizers released with recent LMs. We confirm much publicly\ndisclosed information about these models, and also make several new inferences:\nGPT-4o and Mistral NeMo's tokenizers are much more multilingual than their\npredecessors, training on 39% and 47% non-English language data, respectively;\nLlama 3 extends GPT-3.5's tokenizer primarily for multilingual (48%) use;\nGPT-3.5's and Claude's tokenizers are trained on predominantly code (~60%). We\nhope our work sheds light on current design practices for pretraining data, and\ninspires continued research into data mixture inference for LMs.\n","authors":["Jonathan Hayase","Alisa Liu","Yejin Choi","Sewoong Oh","Noah A. Smith"],"pdf_url":"https://arxiv.org/pdf/2407.16607v3.pdf","comment":"new robustness experiments; new baselines; include Mistral,\n Mistral-Nemo and GPT-NeoX; link to code"},{"id":"http://arxiv.org/abs/2409.03668v1","updated":"2024-09-05T16:22:31Z","published":"2024-09-05T16:22:31Z","title":"A Fused Large Language Model for Predicting Startup Success","summary":" Investors are continuously seeking profitable investment opportunities in\nstartups and, hence, for effective decision-making, need to predict a startup's\nprobability of success. Nowadays, investors can use not only various\nfundamental information about a startup (e.g., the age of the startup, the\nnumber of founders, and the business sector) but also textual description of a\nstartup's innovation and business model, which is widely available through\nonline venture capital (VC) platforms such as Crunchbase. To support the\ndecision-making of investors, we develop a machine learning approach with the\naim of locating successful startups on VC platforms. Specifically, we develop,\ntrain, and evaluate a tailored, fused large language model to predict startup\nsuccess. 
Thereby, we assess to what extent self-descriptions on VC platforms\nare predictive of startup success. Using 20,172 online profiles from\nCrunchbase, we find that our fused large language model can predict startup\nsuccess, with textual self-descriptions being responsible for a significant\npart of the predictive power. Our work provides a decision support tool for\ninvestors to find profitable investment opportunities.\n","authors":["Abdurahman Maarouf","Stefan Feuerriegel","Nicolas Pröllochs"],"pdf_url":"https://arxiv.org/pdf/2409.03668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14101v2","updated":"2024-09-05T16:21:56Z","published":"2024-02-21T19:53:36Z","title":"Cost-Efficient Subjective Task Annotation and Modeling through Few-Shot\n Annotator Adaptation","summary":" In subjective NLP tasks, where a single ground truth does not exist, the\ninclusion of diverse annotators becomes crucial as their unique perspectives\nsignificantly influence the annotations. In realistic scenarios, the annotation\nbudget often becomes the main determinant of the number of perspectives (i.e.,\nannotators) included in the data and subsequent modeling. We introduce a novel\nframework for annotation collection and modeling in subjective tasks that aims\nto minimize the annotation budget while maximizing the predictive performance\nfor each annotator. Our framework has a two-stage design: first, we rely on a\nsmall set of annotators to build a multitask model, and second, we augment the\nmodel for a new perspective by strategically annotating a few samples per\nannotator. To test our framework at scale, we introduce and release a unique\ndataset, Moral Foundations Subjective Corpus, of 2000 Reddit posts annotated by\n24 annotators for moral sentiment. We demonstrate that our framework surpasses\nthe previous SOTA in capturing the annotators' individual perspectives with as\nlittle as 25% of the original annotation budget on two datasets. Furthermore,\nour framework results in more equitable models, reducing the performance\ndisparity among annotators.\n","authors":["Preni Golazizian","Alireza S. Ziabari","Ali Omrani","Morteza Dehghani"],"pdf_url":"https://arxiv.org/pdf/2402.14101v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.06120v3","updated":"2024-09-05T16:19:32Z","published":"2024-02-09T01:10:25Z","title":"Exploring Group and Symmetry Principles in Large Language Models","summary":" Large Language Models (LLMs) have demonstrated impressive performance across\na wide range of applications; however, assessing their reasoning capabilities\nremains a significant challenge. In this paper, we introduce a framework\ngrounded in group and symmetry principles, which have played a crucial role in\nfields such as physics and mathematics, and offer another way to evaluate their\ncapabilities. While the proposed framework is general, to showcase the benefits\nof employing these properties, we focus on arithmetic reasoning and investigate\nthe performance of these models on four group properties: closure, identity,\ninverse, and associativity. Our findings reveal that LLMs studied in this work\nstruggle to preserve group properties across different test regimes. In the\nclosure test, we observe biases towards specific outputs and an abrupt\ndegradation in their performance from 100% to 0% after a specific sequence\nlength. 
They also perform poorly in the identity test, which represents adding\nirrelevant information in the context, and show sensitivity when subjected to\ninverse test, which examines the robustness of the model with respect to\nnegation. In addition, we demonstrate that breaking down problems into smaller\nsteps helps LLMs in the associativity test that we have conducted. To support\nthese tests we have developed a synthetic dataset which will be released.\n","authors":["Shima Imani","Hamid Palangi"],"pdf_url":"https://arxiv.org/pdf/2402.06120v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16639v3","updated":"2024-09-05T16:17:20Z","published":"2023-11-28T09:45:02Z","title":"Positioning Political Texts with Large Language Models by Asking and\n Averaging","summary":" We use instruction-tuned Large Language Models (LLMs) like GPT-4, Llama 3,\nMiXtral, or Aya to position political texts within policy and ideological\nspaces. We ask an LLM where a tweet or a sentence of a political text stands on\nthe focal dimension and take the average of the LLM responses to position\npolitical actors such as US Senators, or longer texts such as UK party\nmanifestos or EU policy speeches given in 10 different languages. The\ncorrelations between the position estimates obtained with the best LLMs and\nbenchmarks based on text coding by experts, crowdworkers, or roll call votes\nexceed .90. This approach is generally more accurate than the positions\nobtained with supervised classifiers trained on large amounts of research data.\nUsing instruction-tuned LLMs to position texts in policy and ideological spaces\nis fast, cost-efficient, reliable, and reproducible (in the case of open LLMs)\neven if the texts are short and written in different languages. We conclude\nwith cautionary notes about the need for empirical validation.\n","authors":["Gaël Le Mens","Aina Gallego"],"pdf_url":"https://arxiv.org/pdf/2311.16639v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03662v1","updated":"2024-09-05T16:15:12Z","published":"2024-09-05T16:15:12Z","title":"The representation landscape of few-shot learning and fine-tuning in\n large language models","summary":" In-context learning (ICL) and supervised fine-tuning (SFT) are two common\nstrategies for improving the performance of modern large language models (LLMs)\non specific tasks. Despite their different natures, these strategies often lead\nto comparable performance gains. However, little is known about whether they\ninduce similar representations inside LLMs. We approach this problem by\nanalyzing the probability landscape of their hidden representations in the two\ncases. More specifically, we compare how LLMs solve the same question-answering\ntask, finding that ICL and SFT create very different internal structures, in\nboth cases undergoing a sharp transition in the middle of the network. In the\nfirst half of the network, ICL shapes interpretable representations\nhierarchically organized according to their semantic content. In contrast, the\nprobability landscape obtained with SFT is fuzzier and semantically mixed. In\nthe second half of the model, the fine-tuned representations develop\nprobability modes that better encode the identity of answers, while the\nlandscape of ICL representations is characterized by less defined peaks. 
Our\napproach reveals the diverse computational strategies developed inside LLMs to\nsolve the same task across different conditions, allowing us to make a step\ntowards designing optimal methods to extract information from language models.\n","authors":["Diego Doimo","Alessandro Serra","Alessio Ansuini","Alberto Cazzaniga"],"pdf_url":"https://arxiv.org/pdf/2409.03662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03659v1","updated":"2024-09-05T16:12:29Z","published":"2024-09-05T16:12:29Z","title":"LLM-based multi-agent poetry generation in non-cooperative environments","summary":" Despite substantial progress of large language models (LLMs) for automatic\npoetry generation, the generated poetry lacks diversity while the training\nprocess differs greatly from human learning. Under the rationale that the\nlearning process of the poetry generation systems should be more human-like and\ntheir output more diverse and novel, we introduce a framework based on social\nlearning where we emphasize non-cooperative interactions besides cooperative\ninteractions to encourage diversity. Our experiments are the first attempt at\nLLM-based multi-agent systems in non-cooperative environments for poetry\ngeneration employing both TRAINING-BASED agents (GPT-2) and PROMPTING-BASED\nagents (GPT-3 and GPT-4). Our evaluation based on 96k generated poems shows\nthat our framework benefits the poetry generation process for TRAINING-BASED\nagents resulting in 1) a 3.0-3.7 percentage point (pp) increase in diversity\nand a 5.6-11.3 pp increase in novelty according to distinct and novel n-grams.\nThe generated poetry from TRAINING-BASED agents also exhibits group divergence\nin terms of lexicons, styles and semantics. PROMPTING-BASED agents in our\nframework also benefit from non-cooperative environments and a more diverse\nensemble of models with non-homogeneous agents has the potential to further\nenhance diversity, with an increase of 7.0-17.5 pp according to our\nexperiments. However, PROMPTING-BASED agents show a decrease in lexical\ndiversity over time and do not exhibit the group-based divergence intended in\nthe social network. Our paper argues for a paradigm shift in creative tasks\nsuch as automatic poetry generation to include social learning processes (via\nLLM-based agent modeling) similar to human interaction.\n","authors":["Ran Zhang","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2409.03659v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2409.03650v1","updated":"2024-09-05T16:08:19Z","published":"2024-09-05T16:08:19Z","title":"On the Limited Generalization Capability of the Implicit Reward Model\n Induced by Direct Preference Optimization","summary":" Reinforcement Learning from Human Feedback (RLHF) is an effective approach\nfor aligning language models to human preferences. Central to RLHF is learning\na reward function for scoring human preferences. Two main approaches for\nlearning a reward model are 1) training an EXplicit Reward Model (EXRM) as in\nRLHF, and 2) using an implicit reward learned from preference data through\nmethods such as Direct Preference Optimization (DPO). Prior work has shown that\nthe implicit reward model of DPO (denoted as DPORM) can approximate an EXRM in\nthe limit. DPORM's effectiveness directly implies the optimality of the learned\npolicy, and also has practical implication for LLM alignment methods including\niterative DPO. However, it is unclear how well DPORM empirically matches the\nperformance of EXRM. 
This work studies the accuracy at distinguishing preferred\nand rejected answers for both DPORM and EXRM. Our findings indicate that even\nthough DPORM fits the training dataset comparably, it generalizes less\neffectively than EXRM, especially when the validation datasets contain\ndistribution shifts. Across five out-of-distribution settings, DPORM has a mean\ndrop in accuracy of 3% and a maximum drop of 7%. These findings highlight that\nDPORM has limited generalization ability and substantiates the integration of\nan explicit reward model in iterative DPO approaches.\n","authors":["Yong Lin","Skyler Seto","Maartje ter Hoeve","Katherine Metcalf","Barry-John Theobald","Xuan Wang","Yizhe Zhang","Chen Huang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03650v1.pdf","comment":"12 pages, 8 tables, 2 figures"},{"id":"http://arxiv.org/abs/2408.12547v2","updated":"2024-09-05T16:07:37Z","published":"2024-08-22T17:01:34Z","title":"Towards Evaluating and Building Versatile Large Language Models for\n Medicine","summary":" In this study, we present MedS-Bench, a comprehensive benchmark designed to\nevaluate the performance of large language models (LLMs) in clinical contexts.\nUnlike existing benchmarks that focus on multiple-choice question answering,\nMedS-Bench spans 11 high-level clinical tasks, including clinical report\nsummarization, treatment recommendations, diagnosis, named entity recognition,\nand medical concept explanation, among others. We evaluated six leading LLMs,\ne.g., MEDITRON, Mistral, InternLM 2, Llama 3, GPT-4, and Claude-3.5 using\nfew-shot prompting, and found that even the most sophisticated models struggle\nwith these complex tasks. To address these limitations, we developed MedS-Ins,\na large-scale instruction tuning dataset for medicine. MedS-Ins comprises 58\nmedically oriented language corpora, totaling 13.5 million samples across 122\ntasks. To demonstrate the dataset's utility, we conducted a proof-of-concept\nexperiment by performing instruction tuning on a lightweight, open-source\nmedical language model. The resulting model, MMedIns-Llama 3, significantly\noutperformed existing models across nearly all clinical tasks. To promote\nfurther advancements in the application of LLMs to clinical challenges, we have\nmade the MedS-Ins dataset fully accessible and invite the research community to\ncontribute to its expansion.Additionally, we have launched a dynamic\nleaderboard for MedS-Bench, which we plan to regularly update the test set to\ntrack progress and enhance the adaptation of general LLMs to the medical\ndomain. Leaderboard: https://henrychur.github.io/MedS-Bench/. Github:\nhttps://github.com/MAGIC-AI4Med/MedS-Ins.\n","authors":["Chaoyi Wu","Pengcheng Qiu","Jinxin Liu","Hongfei Gu","Na Li","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2408.12547v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03643v1","updated":"2024-09-05T16:01:21Z","published":"2024-09-05T16:01:21Z","title":"CDM: A Reliable Metric for Fair and Accurate Formula Recognition\n Evaluation","summary":" Formula recognition presents significant challenges due to the complicated\nstructure and varied notation of mathematical expressions. Despite continuous\nadvancements in formula recognition models, the evaluation metrics employed by\nthese models, such as BLEU and Edit Distance, still exhibit notable\nlimitations. 
They overlook the fact that the same formula has diverse\nrepresentations and is highly sensitive to the distribution of training data,\nthereby causing unfairness in formula recognition evaluation. To this end,\nwe propose a Character Detection Matching (CDM) metric, ensuring evaluation\nobjectivity by designing an image-level rather than LaTeX-level metric score.\nSpecifically, CDM renders both the model-predicted LaTeX and the ground-truth\nLaTeX formulas into image-formatted formulas, then employs visual feature\nextraction and localization techniques for precise character-level matching,\nincorporating spatial position information. Such a spatially-aware and\ncharacter-matching method offers a more accurate and equitable evaluation\ncompared with previous BLEU and Edit Distance metrics that rely solely on\ntext-based character matching. Experimentally, we evaluated various formula\nrecognition models using CDM, BLEU, and ExpRate metrics. The results\ndemonstrate that CDM aligns more closely with human evaluation standards\nand provides a fairer comparison across different models by eliminating\ndiscrepancies caused by diverse formula representations.\n","authors":["Bin Wang","Fan Wu","Linke Ouyang","Zhuangcheng Gu","Rui Zhang","Renqiu Xia","Bo Zhang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2409.03643v1.pdf","comment":"Project Website:\n https://github.com/opendatalab/UniMERNet/tree/main/cdm"},{"id":"http://arxiv.org/abs/2408.15488v2","updated":"2024-09-05T15:50:44Z","published":"2024-08-28T02:27:07Z","title":"Legilimens: Practical and Unified Content Moderation for Large Language\n Model Services","summary":" Given the societal impact of unsafe content generated by large language\nmodels (LLMs), ensuring that LLM services comply with safety standards is a\ncrucial concern for LLM service providers. Common content moderation methods\nare limited by an effectiveness-and-efficiency dilemma, where simple models are\nfragile while sophisticated models consume excessive computational resources.\nIn this paper, we reveal for the first time that effective and efficient\ncontent moderation can be achieved by extracting conceptual features from\nchat-oriented LLMs, despite their initial fine-tuning for conversation rather\nthan content moderation. We propose a practical and unified content moderation\nframework for LLM services, named Legilimens, which features both effectiveness\nand efficiency. Our red-team model-based data augmentation enhances the\nrobustness of Legilimens against state-of-the-art jailbreaking. Additionally,\nwe develop a framework to theoretically analyze the cost-effectiveness of\nLegilimens compared to other methods. We have conducted extensive experiments\non five host LLMs, seventeen datasets, and nine jailbreaking methods to verify\nthe effectiveness, efficiency, and robustness of Legilimens against normal and\nadaptive adversaries. 
A comparison of Legilimens with both commercial and\nacademic baselines demonstrates the superior performance of Legilimens.\nFurthermore, we confirm that Legilimens can be applied to few-shot scenarios\nand extended to multi-label classification tasks.\n","authors":["Jialin Wu","Jiangyi Deng","Shengyuan Pang","Yanjiao Chen","Jiayang Xu","Xinfeng Li","Wenyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2408.15488v2.pdf","comment":"Accepted by ACM Conference on Computer and Communications Security\n (CCS) 2024"},{"id":"http://arxiv.org/abs/2408.10468v4","updated":"2024-09-05T15:47:45Z","published":"2024-08-20T00:40:49Z","title":"Tracing Privacy Leakage of Language Models to Training Data via Adjusted\n Influence Functions","summary":" The responses generated by Large Language Models (LLMs) can include sensitive\ninformation from individuals and organizations, leading to potential privacy\nleakage. This work implements Influence Functions (IFs) to trace privacy\nleakage back to the training data, thereby mitigating privacy concerns of\nLanguage Models (LMs). However, we notice that current IFs struggle to\naccurately estimate the influence of tokens with large gradient norms,\npotentially overestimating their influence. When tracing the most influential\nsamples, this leads to frequently tracing back to samples with large gradient\nnorm tokens, overshadowing the actual most influential samples even if their\ninfluences are well estimated. To address this issue, we propose Heuristically\nAdjusted IF (HAIF), which reduces the weight of tokens with large gradient\nnorms, thereby significantly improving the accuracy of tracing the most\ninfluential samples. To establish easily obtained groundtruth for tracing\nprivacy leakage, we construct two datasets, PII-E and PII-CR, representing two\ndistinct scenarios: one with identical text in the model outputs and\npre-training data, and the other where models leverage their reasoning\nabilities to generate text divergent from pre-training data. HAIF significantly\nimproves tracing accuracy, enhancing it by 20.96% to 73.71% on the PII-E\ndataset and 3.21% to 45.93% on the PII-CR dataset, compared to the best SOTA\nIFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs\non real-world pretraining data CLUECorpus2020, demonstrating strong robustness\nregardless of prompt and response lengths.\n","authors":["Jinxin Liu","Zao Yang"],"pdf_url":"https://arxiv.org/pdf/2408.10468v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03621v1","updated":"2024-09-05T15:33:24Z","published":"2024-09-05T15:33:24Z","title":"Attend First, Consolidate Later: On the Importance of Attention in\n Different LLM Layers","summary":" In decoder-based LLMs, the representation of a given layer serves two\npurposes: as input to the next layer during the computation of the current\ntoken; and as input to the attention mechanism of future tokens. In this work,\nwe show that the importance of the latter role might be overestimated. To show\nthat, we start by manipulating the representations of previous tokens; e.g., by\nreplacing the hidden states at some layer k with random vectors. Our\nexperiments with four LLMs and four tasks show that this operation often\nleads to a small to negligible drop in performance. Importantly, this happens if\nthe manipulation occurs in the top part of the model, i.e., when k is in the final 30-50%\nof the layers. In contrast, doing the same manipulation in earlier layers might\nlead to chance-level performance. 
We continue by switching the hidden state of\ncertain tokens with hidden states of other tokens from another prompt; e.g.,\nreplacing the word \"Italy\" with \"France\" in \"What is the capital of Italy?\". We\nfind that when applying this switch in the top 1/3 of the model, the model\nignores it (answering \"Rome\"). However if we apply it before, the model\nconforms to the switch (\"Paris\"). Our results hint at a two stage process in\ntransformer-based LLMs: the first part gathers input from previous tokens,\nwhile the second mainly processes that information internally.\n","authors":["Amit Ben Artzy","Roy Schwartz"],"pdf_url":"https://arxiv.org/pdf/2409.03621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07666v4","updated":"2024-09-05T14:37:59Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03563v1","updated":"2024-09-05T14:19:45Z","published":"2024-09-05T14:19:45Z","title":"100 instances is all you need: predicting the success of a new LLM on\n unseen data by testing on a few instances","summary":" Predicting the performance of LLMs on individual task instances is essential\nto ensure their reliability in high-stakes applications. To do so, a\npossibility is to evaluate the considered LLM on a set of task instances and\ntrain an assessor to predict its performance based on features of the\ninstances. However, this approach requires evaluating each new LLM on a\nsufficiently large set of task instances to train an assessor specific to it.\nIn this work, we leverage the evaluation results of previously tested LLMs to\nreduce the number of evaluations required to predict the performance of a new\nLLM. In practice, we propose to test the new LLM on a small set of reference\ninstances and train a generic assessor which predicts the performance of the\nLLM on an instance based on the performance of the former on the reference set\nand features of the instance of interest. 
We conduct empirical studies on\nHELM-Lite and KindsOfReasoning, a collection of existing reasoning datasets\nthat we introduce, where we evaluate all instruction-fine-tuned OpenAI models\nuntil the January 2024 version of GPT4. When predicting performance on\ninstances with the same distribution as those used to train the generic\nassessor, we find this achieves performance comparable to the LLM-specific\nassessors trained on the full set of instances. Additionally, we find that\nrandomly selecting the reference instances performs as well as some advanced\nselection methods we tested. For out of distribution, however, no clear winner\nemerges and the overall performance is worse, suggesting that the inherent\npredictability of LLMs is low.\n","authors":["Lorenzo Pacchiardi","Lucy G. Cheke","José Hernández-Orallo"],"pdf_url":"https://arxiv.org/pdf/2409.03563v1.pdf","comment":"Presented at the 2024 KDD workshop on Evaluation and Trustworthiness\n of Generative AI Models"},{"id":"http://arxiv.org/abs/2409.03512v1","updated":"2024-09-05T13:22:51Z","published":"2024-09-05T13:22:51Z","title":"From MOOC to MAIC: Reshaping Online Teaching and Learning through\n LLM-driven Agents","summary":" Since the first instances of online education, where courses were uploaded to\naccessible and shared online platforms, this form of scaling the dissemination\nof human knowledge to reach a broader audience has sparked extensive discussion\nand widespread adoption. Recognizing that personalized learning still holds\nsignificant potential for improvement, new AI technologies have been\ncontinuously integrated into this learning format, resulting in a variety of\neducational AI applications such as educational recommendation and intelligent\ntutoring. The emergence of intelligence in large language models (LLMs) has\nallowed for these educational enhancements to be built upon a unified\nfoundational model, enabling deeper integration. In this context, we propose\nMAIC (Massive AI-empowered Course), a new form of online education that\nleverages LLM-driven multi-agent systems to construct an AI-augmented\nclassroom, balancing scalability with adaptivity. Beyond exploring the\nconceptual framework and technical innovations, we conduct preliminary\nexperiments at Tsinghua University, one of China's leading universities.\nDrawing from over 100,000 learning records of more than 500 students, we obtain\na series of valuable observations and initial analyses. 
This project will\ncontinue to evolve, ultimately aiming to establish a comprehensive open\nplatform that supports and unifies research, technology, and applications in\nexploring the possibilities of online education in the era of large model AI.\nWe envision this platform as a collaborative hub, bringing together educators,\nresearchers, and innovators to collectively explore the future of AI-driven\nonline education.\n","authors":["Jifan Yu","Zheyuan Zhang","Daniel Zhang-li","Shangqing Tu","Zhanxin Hao","Rui Miao Li","Haoxuan Li","Yuanchun Wang","Hanming Li","Linlu Gong","Jie Cao","Jiayin Lin","Jinchang Zhou","Fei Qin","Haohua Wang","Jianxiao Jiang","Lijun Deng","Yisi Zhan","Chaojun Xiao","Xusheng Dai","Xuan Yan","Nianyi Lin","Nan Zhang","Ruixin Ni","Yang Dang","Lei Hou","Yu Zhang","Xu Han","Manli Li","Juanzi Li","Zhiyuan Liu","Huiqin Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2409.03512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03454v1","updated":"2024-09-05T12:06:38Z","published":"2024-09-05T12:06:38Z","title":"How Much Data is Enough Data? Fine-Tuning Large Language Models for\n In-House Translation: Performance Evaluation Across Multiple Dataset Sizes","summary":" Decoder-only LLMs have shown impressive performance in MT due to their\nability to learn from extensive datasets and generate high-quality\ntranslations. However, LLMs often struggle with the nuances and style required\nfor organisation-specific translation. In this study, we explore the\neffectiveness of fine-tuning Large Language Models (LLMs), particularly Llama 3\n8B Instruct, leveraging translation memories (TMs), as a valuable resource to\nenhance accuracy and efficiency. We investigate the impact of fine-tuning the\nLlama 3 model using TMs from a specific organisation in the software sector.\nOur experiments cover five translation directions across languages of varying\nresource levels (English to Brazilian Portuguese, Czech, German, Finnish, and\nKorean). We analyse diverse sizes of training datasets (1k to 207k segments) to\nevaluate their influence on translation quality. We fine-tune separate models\nfor each training set and evaluate their performance based on automatic\nmetrics, BLEU, chrF++, TER, and COMET. Our findings reveal improvement in\ntranslation performance with larger datasets across all metrics. On average,\nBLEU and COMET scores increase by 13 and 25 points, respectively, on the\nlargest training set against the baseline model. Notably, there is a\nperformance deterioration in comparison with the baseline model when\nfine-tuning on only 1k and 2k examples; however, we observe a substantial\nimprovement as the training dataset size increases. The study highlights the\npotential of integrating TMs with LLMs to create bespoke translation models\ntailored to the specific needs of businesses, thus enhancing translation\nquality and reducing turn-around times. 
This approach offers a valuable insight\nfor organisations seeking to leverage TMs and LLMs for optimal translation\noutcomes, especially in narrower domains.\n","authors":["Inacio Vieira","Will Allred","Seamus Lankford","Sheila Castilho Monteiro De Sousa","Andy Way"],"pdf_url":"https://arxiv.org/pdf/2409.03454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14735v5","updated":"2024-09-05T12:00:55Z","published":"2023-10-23T09:15:18Z","title":"Unleashing the potential of prompt engineering in Large Language Models:\n a comprehensive review","summary":" This comprehensive review delves into the pivotal role of prompt engineering\nin unleashing the capabilities of Large Language Models (LLMs). The development\nof Artificial Intelligence (AI), from its inception in the 1950s to the\nemergence of advanced neural networks and deep learning architectures, has made\na breakthrough in LLMs, with models such as GPT-4o and Claude-3, and in\nVision-Language Models (VLMs), with models such as CLIP and ALIGN. Prompt\nengineering is the process of structuring inputs, which has emerged as a\ncrucial technique to maximize the utility and accuracy of these models. This\npaper explores both foundational and advanced methodologies of prompt\nengineering, including techniques such as self-consistency, chain-of-thought,\nand generated knowledge, which significantly enhance model performance.\nAdditionally, it examines the prompt method of VLMs through innovative\napproaches such as Context Optimization (CoOp), Conditional Context\nOptimization (CoCoOp), and Multimodal Prompt Learning (MaPLe). Critical to this\ndiscussion is the aspect of AI security, particularly adversarial attacks that\nexploit vulnerabilities in prompt engineering. Strategies to mitigate these\nrisks and enhance model robustness are thoroughly reviewed. The evaluation of\nprompt methods is also addressed, through both subjective and objective\nmetrics, ensuring a robust analysis of their efficacy. This review also\nreflects the essential role of prompt engineering in advancing AI capabilities,\nproviding a structured framework for future research and application.\n","authors":["Banghao Chen","Zhaofeng Zhang","Nicolas Langrené","Shengxin Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.14735v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02050v2","updated":"2024-09-05T11:54:52Z","published":"2024-09-03T16:53:38Z","title":"Enhancing Code-Switching Speech Recognition with LID-Based Collaborative\n Mixture of Experts Model","summary":" Due to the inherent difficulty in modeling phonetic similarities across\ndifferent languages, code-switching speech recognition presents a formidable\nchallenge. This study proposes a Collaborative-MoE, a Mixture of Experts (MoE)\nmodel that leverages a collaborative mechanism among expert groups. Initially,\na preceding routing network explicitly learns Language Identification (LID)\ntasks and selects experts based on acquired LID weights. This process ensures\nrobust routing information to the MoE layer, mitigating interference from\ndiverse language domains on expert network parameter updates. The LID weights\nare also employed to facilitate inter-group collaboration, enabling the\nintegration of language-specific representations. Furthermore, within each\nlanguage expert group, a gating network operates unsupervised to foster\ncollaboration on attributes beyond language. 
Extensive experiments demonstrate\nthe efficacy of our approach, achieving significant performance enhancements\ncompared to alternative methods. Importantly, our method preserves the\nefficient inference capabilities characteristic of MoE models without\nnecessitating additional pre-training.\n","authors":["Hukai Huang","Jiayan Lin","Kaidi Wang","Yishuang Li","Wenhao Guan","Lin Li","Qingyang Hong"],"pdf_url":"https://arxiv.org/pdf/2409.02050v2.pdf","comment":"Accepted by IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2409.03444v1","updated":"2024-09-05T11:49:53Z","published":"2024-09-05T11:49:53Z","title":"Fine-tuning large language models for domain adaptation: Exploration of\n training strategies, scaling, model merging and synergistic capabilities","summary":" The advancement of Large Language Models (LLMs) for domain applications in\nfields such as materials science and engineering depends on the development of\nfine-tuning strategies that adapt models for specialized, technical\ncapabilities. In this work, we explore the effects of Continued Pretraining\n(CPT), Supervised Fine-Tuning (SFT), and various preference-based optimization\napproaches, including Direct Preference Optimization (DPO) and Odds Ratio\nPreference Optimization (ORPO), on fine-tuned LLM performance. Our analysis\nshows how these strategies influence model outcomes and reveals that the\nmerging of multiple fine-tuned models can lead to the emergence of capabilities\nthat surpass the individual contributions of the parent models. We find that\nmodel merging leads to new functionalities that neither parent model could\nachieve alone, leading to improved performance in domain-specific assessments.\nExperiments with different model architectures are presented, including Llama\n3.1 8B and Mistral 7B models, where similar behaviors are observed. Exploring\nwhether the results hold also for much smaller models, we use a tiny LLM with\n1.7 billion parameters and show that very small LLMs do not necessarily feature\nemergent capabilities under model merging, suggesting that model scaling may be\na key component. In open-ended yet consistent chat conversations between a\nhuman and AI models, our assessment reveals detailed insights into how\ndifferent model variants perform and show that the smallest model achieves a\nhigh intelligence score across key criteria including reasoning depth,\ncreativity, clarity, and quantitative precision. Other experiments include the\ndevelopment of image generation prompts based on disparate biological material\ndesign concepts, to create new microstructures, architectural concepts, and\nurban design based on biological materials-inspired construction principles.\n","authors":["Wei Lu","Rachel K. Luu","Markus J. Buehler"],"pdf_url":"https://arxiv.org/pdf/2409.03444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03440v1","updated":"2024-09-05T11:42:26Z","published":"2024-09-05T11:42:26Z","title":"Rx Strategist: Prescription Verification using LLM Agents System","summary":" To protect patient safety, modern pharmaceutical complexity demands strict\nprescription verification. We offer a new approach - Rx Strategist - that makes\nuse of knowledge graphs and different search strategies to enhance the power of\nLarge Language Models (LLMs) inside an agentic framework. This multifaceted\ntechnique allows for a multi-stage LLM pipeline and reliable information\nretrieval from a custom-built active ingredient database. 
Different facets of\nprescription verification, such as indication, dose, and possible drug\ninteractions, are covered in each stage of the pipeline. We alleviate the\ndrawbacks of monolithic LLM techniques by spreading reasoning over these\nstages, improving correctness and reliability while reducing memory demands.\nOur findings demonstrate that Rx Strategist surpasses many current LLMs,\nachieving performance comparable to that of a highly experienced clinical\npharmacist. In the complicated world of modern medications, this combination of\nLLMs with organized knowledge and sophisticated search methods presents a\nviable avenue for reducing prescription errors and enhancing patient outcomes.\n","authors":["Phuc Phan Van","Dat Nguyen Minh","An Dinh Ngoc","Huy Phan Thanh"],"pdf_url":"https://arxiv.org/pdf/2409.03440v1.pdf","comment":"17 Pages, 6 Figures, Under Review"},{"id":"http://arxiv.org/abs/2409.02239v2","updated":"2024-09-05T11:34:00Z","published":"2024-09-03T19:11:15Z","title":"Temporal Order Preserved Optimal Transport-based Cross-modal Knowledge\n Transfer Learning for ASR","summary":" Transferring linguistic knowledge from a pretrained language model (PLM) to\nan acoustic model has been shown to greatly improve the performance of\nautomatic speech recognition (ASR). However, due to the heterogeneous feature\ndistributions in cross-modalities, designing an effective model for feature\nalignment and knowledge transfer between linguistic and acoustic sequences\nremains a challenging task. Optimal transport (OT), which efficiently measures\nprobability distribution discrepancies, holds great potential for aligning and\ntransferring knowledge between acoustic and linguistic modalities. Nonetheless,\nthe original OT treats acoustic and linguistic feature sequences as two\nunordered sets in alignment and neglects temporal order information during OT\ncoupling estimation. Consequently, a time-consuming pretraining stage is\nrequired to learn a good alignment between the acoustic and linguistic\nrepresentations. In this paper, we propose a Temporal Order Preserved OT\n(TOT)-based Cross-modal Alignment and Knowledge Transfer (CAKT) (TOT-CAKT) for\nASR. In the TOT-CAKT, local neighboring frames of acoustic sequences are\nsmoothly mapped to neighboring regions of linguistic sequences, preserving\ntheir temporal order relationship in feature alignment and matching. With the\nTOT-CAKT model framework, we conduct Mandarin ASR experiments with a pretrained\nChinese PLM for linguistic knowledge transfer. Our results demonstrate that the\nproposed TOT-CAKT significantly improves ASR performance compared to several\nstate-of-the-art models employing linguistic knowledge transfer, and addresses\nthe weaknesses of the original OT-based method in sequential feature alignment\nfor ASR.\n","authors":["Xugang Lu","Peng Shen","Yu Tsao","Hisashi Kawai"],"pdf_url":"https://arxiv.org/pdf/2409.02239v2.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2408.15778v3","updated":"2024-09-05T10:30:39Z","published":"2024-08-28T13:16:41Z","title":"LogicGame: Benchmarking Rule-Based Reasoning Abilities of Large Language\n Models","summary":" Large Language Models (LLMs) have demonstrated notable capabilities across\nvarious tasks, showcasing complex problem-solving abilities. Understanding and\nexecuting complex rules, along with multi-step planning, are fundamental to\nlogical reasoning and critical for practical LLM agents and decision-making\nsystems. 
However, evaluating LLMs as effective rule-based executors and\nplanners remains underexplored. In this paper, we introduce LogicGame, a novel\nbenchmark designed to evaluate the comprehensive rule understanding, execution,\nand planning capabilities of LLMs. Unlike traditional benchmarks, LogicGame\nprovides diverse games that contain a series of rules with an initial state,\nrequiring models to comprehend and apply predefined regulations to solve\nproblems. We create simulated scenarios in which models execute or plan\noperations to achieve specific outcomes. These game scenarios are specifically\ndesigned to distinguish logical reasoning from mere knowledge by relying\nexclusively on predefined rules. This separation allows for a pure assessment\nof rule-based reasoning capabilities. The evaluation considers not only final\noutcomes but also intermediate steps, providing a comprehensive assessment of\nmodel performance. Moreover, these intermediate steps are deterministic and can\nbe automatically verified. LogicGame defines game scenarios with varying\ndifficulty levels, from simple rule applications to complex reasoning chains,\nin order to offer a precise evaluation of model performance on rule\nunderstanding and multi-step execution. Utilizing LogicGame, we test various\nLLMs and identify notable shortcomings in their rule-based logical reasoning\nabilities.\n","authors":["Jiayi Gui","Yiming Liu","Jiale Cheng","Xiaotao Gu","Xiao Liu","Hongning Wang","Yuxiao Dong","Jie Tang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2408.15778v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06668v2","updated":"2024-09-05T10:07:46Z","published":"2024-05-03T14:49:04Z","title":"Exposing and Explaining Fake News On-the-Fly","summary":" Social media platforms enable the rapid dissemination and consumption of\ninformation. However, users instantly consume such content regardless of the\nreliability of the shared data. Consequently, the latter crowdsourcing model is\nexposed to manipulation. This work contributes with an explainable and online\nclassification method to recognize fake news in real-time. The proposed method\ncombines both unsupervised and supervised Machine Learning approaches with\nonline created lexica. The profiling is built using creator-, content- and\ncontext-based features using Natural Language Processing techniques. The\nexplainable classification mechanism displays in a dashboard the features\nselected for classification and the prediction confidence. The performance of\nthe proposed solution has been validated with real data sets from Twitter and\nthe results attain 80 % accuracy and macro F-measure. This proposal is the\nfirst to jointly provide data stream processing, profiling, classification and\nexplainability. Ultimately, the proposed early detection, isolation and\nexplanation of fake news contribute to increase the quality and trustworthiness\nof social media contents.\n","authors":["Francisco de Arriba-Pérez","Silvia García-Méndez","Fátima Leal","Benedita Malheiro","Juan Carlos Burguillo"],"pdf_url":"https://arxiv.org/pdf/2405.06668v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11983v2","updated":"2024-09-05T10:01:39Z","published":"2024-05-20T12:33:42Z","title":"A review on the use of large language models as virtual tutors","summary":" Transformer architectures contribute to managing long-term dependencies for\nNatural Language Processing, representing one of the most recent changes in the\nfield. 
These architectures are the basis of the innovative, cutting-edge Large\nLanguage Models (LLMs) that have produced a huge buzz in several fields and\nindustrial sectors, among which education stands out. Accordingly, these\ngenerative Artificial Intelligence-based solutions have directed the change in\ntechniques and the evolution in educational methods and contents, along with\nnetwork infrastructure, towards high-quality learning. Given the popularity of\nLLMs, this review seeks to provide a comprehensive overview of those solutions\ndesigned specifically to generate and evaluate educational materials and which\ninvolve students and teachers in their design or experimental plan. To the best\nof our knowledge, this is the first review of educational applications (e.g.,\nstudent assessment) of LLMs. As expected, the most common role of these systems\nis as virtual tutors for automatic question generation. Moreover, the most\npopular models are GPT-3 and BERT. However, due to the continuous launch of new\ngenerative models, new works are expected to be published shortly.\n","authors":["Silvia García-Méndez","Francisco de Arriba-Pérez","María del Carmen Somoza-López"],"pdf_url":"https://arxiv.org/pdf/2405.11983v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03381v1","updated":"2024-09-05T09:33:24Z","published":"2024-09-05T09:33:24Z","title":"CogniDual Framework: Self-Training Large Language Models within a\n Dual-System Theoretical Framework for Improving Cognitive Tasks","summary":" Cognitive psychology investigates perception, attention, memory, language,\nproblem-solving, decision-making, and reasoning. Kahneman's dual-system theory\nelucidates the human decision-making process, distinguishing between the rapid,\nintuitive System 1 and the deliberative, rational System 2. Recent advancements\nhave positioned large language models (LLMs) as formidable tools nearing\nhuman-level proficiency in various cognitive tasks. Nonetheless, the presence\nof a dual-system framework analogous to human cognition in LLMs remains\nunexplored. This study introduces the \\textbf{CogniDual Framework for LLMs}\n(CFLLMs), designed to assess whether LLMs can, through self-training, evolve\nfrom deliberate deduction to intuitive responses, thereby emulating the human\nprocess of acquiring and mastering new information. Our findings reveal the\ncognitive mechanisms behind LLMs' response generation, enhancing our\nunderstanding of their capabilities in cognitive psychology. Practically,\nself-trained models can provide faster responses to certain queries, reducing\ncomputational demands during inference.\n","authors":["Yongxin Deng","Xihe Qiu","Xiaoyu Tan","Chao Qu","Jing Pan","Yuan Cheng","Yinghui Xu","Wei Chu"],"pdf_url":"https://arxiv.org/pdf/2409.03381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03375v1","updated":"2024-09-05T09:27:05Z","published":"2024-09-05T09:27:05Z","title":"Leveraging Large Language Models through Natural Language Processing to\n provide interpretable Machine Learning predictions of mental deterioration in\n real time","summary":" Based on official estimates, 50 million people worldwide are affected by\ndementia, and this number increases by 10 million new patients every year.\nWithout a cure, clinical prognostication and early intervention represent the\nmost effective ways to delay its progression. 
To this end, Artificial\nIntelligence and computational linguistics can be exploited for natural\nlanguage analysis, personalized assessment, monitoring, and treatment. However,\ntraditional approaches need more semantic knowledge management and\nexplicability capabilities. Moreover, using Large Language Models (LLMs) for\ncognitive decline diagnosis is still scarce, even though these models represent\nthe most advanced way for clinical-patient communication using intelligent\nsystems. Consequently, we leverage an LLM using the latest Natural Language\nProcessing (NLP) techniques in a chatbot solution to provide interpretable\nMachine Learning prediction of cognitive decline in real-time.\nLinguistic-conceptual features are exploited for appropriate natural language\nanalysis. Through explainability, we aim to fight potential biases of the\nmodels and improve their potential to help clinical workers in their diagnosis\ndecisions. More in detail, the proposed pipeline is composed of (i) data\nextraction employing NLP-based prompt engineering; (ii) stream-based data\nprocessing including feature engineering, analysis, and selection; (iii)\nreal-time classification; and (iv) the explainability dashboard to provide\nvisual and natural language descriptions of the prediction outcome.\nClassification results exceed 80 % in all evaluation metrics, with a recall\nvalue for the mental deterioration class about 85 %. To sum up, we contribute\nwith an affordable, flexible, non-invasive, personalized diagnostic system to\nthis work.\n","authors":["Francisco de Arriba-Pérez","Silvia García-Méndez"],"pdf_url":"https://arxiv.org/pdf/2409.03375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03363v1","updated":"2024-09-05T09:10:38Z","published":"2024-09-05T09:10:38Z","title":"Con-ReCall: Detecting Pre-training Data in LLMs via Contrastive Decoding","summary":" The training data in large language models is key to their success, but it\nalso presents privacy and security risks, as it may contain sensitive\ninformation. Detecting pre-training data is crucial for mitigating these\nconcerns. Existing methods typically analyze target text in isolation or solely\nwith non-member contexts, overlooking potential insights from simultaneously\nconsidering both member and non-member contexts. While previous work suggested\nthat member contexts provide little information due to the minor distributional\nshift they induce, our analysis reveals that these subtle shifts can be\neffectively leveraged when contrasted with non-member contexts. In this paper,\nwe propose Con-ReCall, a novel approach that leverages the asymmetric\ndistributional shifts induced by member and non-member contexts through\ncontrastive decoding, amplifying subtle differences to enhance membership\ninference. Extensive empirical evaluations demonstrate that Con-ReCall achieves\nstate-of-the-art performance on the WikiMIA benchmark and is robust against\nvarious text manipulation techniques.\n","authors":["Cheng Wang","Yiwei Wang","Bryan Hooi","Yujun Cai","Nanyun Peng","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2409.03363v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03346v1","updated":"2024-09-05T08:45:44Z","published":"2024-09-05T08:45:44Z","title":"Sketch: A Toolkit for Streamlining LLM Operations","summary":" Large language models (LLMs) represented by GPT family have achieved\nremarkable success. The characteristics of LLMs lie in their ability to\naccommodate a wide range of tasks through a generative approach. 
However, the\nflexibility of their output format poses challenges in controlling and\nharnessing the model's outputs, thereby constraining the application of LLMs in\nvarious domains. In this work, we present Sketch, an innovative toolkit\ndesigned to streamline LLM operations across diverse fields. Sketch comprises\nthe following components: (1) a suite of task description schemas and prompt\ntemplates encompassing various NLP tasks; (2) a user-friendly, interactive\nprocess for building structured output LLM services tailored to various NLP\ntasks; (3) an open-source dataset for output format control, along with tools\nfor dataset construction; and (4) an open-source model based on\nLLaMA3-8B-Instruct that adeptly comprehends and adheres to output formatting\ninstructions. We anticipate this initiative to bring considerable convenience\nto LLM users, achieving the goal of ''plug-and-play'' for various applications.\nThe components of Sketch will be progressively open-sourced at\nhttps://github.com/cofe-ai/Sketch.\n","authors":["Xin Jiang","Xiang Li","Wenjia Ma","Xuezhi Fang","Yiqun Yao","Naitong Yu","Xuying Meng","Peng Han","Jing Li","Aixin Sun","Yequan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.03346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03327v1","updated":"2024-09-05T08:03:47Z","published":"2024-09-05T08:03:47Z","title":"Normal forms in Virus Machines","summary":" In the present work, we further study the computational power of virus\nmachines (VMs in short). VMs provide a computing paradigm inspired by the\ntransmission and replication networks of viruses. VMs consist of process units\n(called hosts) structured by a directed graph whose arcs are called channels\nand an instruction graph that controls the transmissions of virus objects among\nhosts. The present work complements our understanding of the computing power of\nVMs by introducing normal forms; these expressions restrict the features in a\ngiven computing model. Some of the features that we restrict in our normal\nforms include (a) the number of hosts, (b) the number of instructions, and (c)\nthe number of virus objects in each host. After we recall some known results on\nthe computing power of VMs we give our normal forms, such as the size of the\nloops in the network, proving new characterisations of family of sets, such as\nthe finite sets, semilinear sets, or NRE.\n","authors":["A. Ramírez-de-Arellano","F. G. C. Cabarle","D. Orellana-Martín","M. J. Pérez-Jiménez"],"pdf_url":"https://arxiv.org/pdf/2409.03327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06570v2","updated":"2024-09-05T07:46:09Z","published":"2024-03-11T10:11:29Z","title":"Improving Speaker Assignment in Speaker-Attributed ASR for Real Meeting\n Applications","summary":" Past studies on end-to-end meeting transcription have focused on model\narchitecture and have mostly been evaluated on simulated meeting data. We\npresent a novel study aiming to optimize the use of a Speaker-Attributed ASR\n(SA-ASR) system in real-life scenarios, such as the AMI meeting corpus, for\nimproved speaker assignment of speech segments. First, we propose a pipeline\ntailored to real-life applications involving Voice Activity Detection (VAD),\nSpeaker Diarization (SD), and SA-ASR. Second, we advocate using VAD output\nsegments to fine-tune the SA-ASR model, considering that it is also applied to\nVAD segments during test, and show that this results in a relative reduction of\nSpeaker Error Rate (SER) up to 28%. 
Finally, we explore strategies to enhance\nthe extraction of the speaker embedding templates used as inputs by the SA-ASR\nsystem. We show that extracting them from SD output rather than annotated\nspeaker segments results in a relative SER reduction up to 20%.\n","authors":["Can Cui","Imran Ahamad Sheikh","Mostafa Sadeghi","Emmanuel Vincent"],"pdf_url":"https://arxiv.org/pdf/2403.06570v2.pdf","comment":"Submitted to Odyssey 2024"},{"id":"http://arxiv.org/abs/2409.02727v2","updated":"2024-09-05T07:17:59Z","published":"2024-09-04T14:01:48Z","title":"Pooling And Attention: What Are Effective Designs For LLM-Based\n Embedding Models?","summary":" The significant advancements of Large Language Models (LLMs) in generative\ntasks have led to a growing body of work exploring LLM-based embedding models.\nWhile these models, employing different pooling and attention strategies, have\nachieved state-of-the-art performance on public embedding benchmarks, questions\nstill arise about what constitutes an effective design for LLM-based embedding\nmodels. However, these models are often trained on different datasets, using\ndifferent LLM base models or training settings. Moreover, evaluations on public\nembedding benchmarks often fail to report statistical significance, making it\ndifficult to determine which designs truly contribute to final performance.\nThis complicates the process for practitioners seeking optimal training recipes\nfor LLM-based embedding models. In this study, we conduct a large-scale\nexperiment by training a series of LLM-based embedding models using the same\ntraining data and base model but differing in their pooling and attention\nstrategies. The results show that there is no one-size-fits-all solution: while\nbidirectional attention and an additional trainable pooling layer outperform in\ntext similarity and information retrieval tasks, they do not significantly\nsurpass simpler designs like EOS-last token pooling and default causal\nattention in clustering and classification tasks. Furthermore, we propose a new\npooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs\nof all hidden layers, rather than just the last layer, using a cross-attention\nnetwork. This method proves to be statistically superior in text similarity and\nretrieval tasks compared to existing pooling methods. Overall, this paper sheds\nlight on effective training strategies for LLM-based embedding models.\n","authors":["Yixuan Tang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2409.02727v2.pdf","comment":"https://github.com/yixuantt/PoolingAndAttn"},{"id":"http://arxiv.org/abs/2409.03295v1","updated":"2024-09-05T07:03:23Z","published":"2024-09-05T07:03:23Z","title":"N-gram Prediction and Word Difference Representations for Language\n Modeling","summary":" Causal language modeling (CLM) serves as the foundational framework\nunderpinning remarkable successes of recent large language models (LLMs).\nDespite its success, the training approach for next word prediction poses a\npotential risk of causing the model to overly focus on local dependencies\nwithin a sentence. While prior studies have been introduced to predict future N\nwords simultaneously, they were primarily applied to tasks such as masked\nlanguage modeling (MLM) and neural machine translation (NMT). In this study, we\nintroduce a simple N-gram prediction framework for the CLM task. 
Moreover, we\nintroduce word difference representation (WDR) as a surrogate and\ncontextualized target representation during model training on the basis of\nN-gram prediction framework. To further enhance the quality of next word\nprediction, we propose an ensemble method that incorporates the future N words'\nprediction results. Empirical evaluations across multiple benchmark datasets\nencompassing CLM and NMT tasks demonstrate the significant advantages of our\nproposed methods over the conventional CLM.\n","authors":["DongNyeong Heo","Daniela Noemi Rim","Heeyoul Choi"],"pdf_url":"https://arxiv.org/pdf/2409.03295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03291v1","updated":"2024-09-05T06:55:13Z","published":"2024-09-05T06:55:13Z","title":"LLM Detectors Still Fall Short of Real World: Case of LLM-Generated\n Short News-Like Posts","summary":" With the emergence of widely available powerful LLMs, disinformation\ngenerated by large Language Models (LLMs) has become a major concern.\nHistorically, LLM detectors have been touted as a solution, but their\neffectiveness in the real world is still to be proven. In this paper, we focus\non an important setting in information operations -- short news-like posts\ngenerated by moderately sophisticated attackers.\n We demonstrate that existing LLM detectors, whether zero-shot or\npurpose-trained, are not ready for real-world use in that setting. All tested\nzero-shot detectors perform inconsistently with prior benchmarks and are highly\nvulnerable to sampling temperature increase, a trivial attack absent from\nrecent benchmarks. A purpose-trained detector generalizing across LLMs and\nunseen attacks can be developed, but it fails to generalize to new\nhuman-written texts.\n We argue that the former indicates domain-specific benchmarking is needed,\nwhile the latter suggests a trade-off between the adversarial evasion\nresilience and overfitting to the reference human text, with both needing\nevaluation in benchmarks and currently absent. We believe this suggests a\nre-consideration of current LLM detector benchmarking approaches and provides a\ndynamically extensible benchmark to allow it\n(https://github.com/Reliable-Information-Lab-HEVS/dynamic_llm_detector_benchmark).\n","authors":["Henrique Da Silva Gameiro","Andrei Kucharavy","Ljiljana Dolamic"],"pdf_url":"https://arxiv.org/pdf/2409.03291v1.pdf","comment":"20 pages, 7 tables, 13 figures, under consideration for EMNLP"},{"id":"http://arxiv.org/abs/2409.03284v1","updated":"2024-09-05T06:49:14Z","published":"2024-09-05T06:49:14Z","title":"iText2KG: Incremental Knowledge Graphs Construction Using Large Language\n Models","summary":" Most available data is unstructured, making it challenging to access valuable\ninformation. Automatically building Knowledge Graphs (KGs) is crucial for\nstructuring data and making it accessible, allowing users to search for\ninformation effectively. KGs also facilitate insights, inference, and\nreasoning. Traditional NLP methods, such as named entity recognition and\nrelation extraction, are key in information retrieval but face limitations,\nincluding the use of predefined entity types and the need for supervised\nlearning. Current research leverages large language models' capabilities, such\nas zero- or few-shot learning. However, unresolved and semantically duplicated\nentities and relations still pose challenges, leading to inconsistent graphs\nand requiring extensive post-processing. Additionally, most approaches are\ntopic-dependent. 
In this paper, we propose iText2KG, a method for incremental,\ntopic-independent KG construction without post-processing. This plug-and-play,\nzero-shot method is applicable across a wide range of KG construction scenarios\nand comprises four modules: Document Distiller, Incremental Entity Extractor,\nIncremental Relation Extractor, and Graph Integrator and Visualization. Our\nmethod demonstrates superior performance compared to baseline methods across\nthree scenarios: converting scientific papers to graphs, websites to graphs,\nand CVs to graphs.\n","authors":["Yassir Lairgi","Ludovic Moncla","Rémy Cazabet","Khalid Benabdeslem","Pierre Cléau"],"pdf_url":"https://arxiv.org/pdf/2409.03284v1.pdf","comment":"Accepted at The International Web Information Systems Engineering\n conference (the WISE conference) 2024"},{"id":"http://arxiv.org/abs/2406.16672v2","updated":"2024-09-05T06:44:24Z","published":"2024-06-24T14:27:54Z","title":"CAVE: Controllable Authorship Verification Explanations","summary":" Authorship Verification (AV) (do two documents have the same author?) is\nessential in many sensitive real-life applications. AV is often used in\nproprietary domains that require a private, offline model, making SOTA online\nmodels like ChatGPT undesirable. Current offline models however have lower\ndownstream utility due to low accuracy/scalability (eg: traditional stylometry\nAV systems) and lack of accessible post-hoc explanations. In this work, we take\nthe first step to address the above challenges with our trained, offline\nLlama-3-8B model CAVE (Controllable Authorship Verification Explanations): CAVE\ngenerates free-text AV explanations that are controlled to be (1) structured\n(can be decomposed into sub-explanations in terms of relevant linguistic\nfeatures), and (2) easily verified for explanation-label consistency (via\nintermediate labels in sub-explanations). We first engineer a prompt that can\ngenerate silver training data from a SOTA teacher model in the desired CAVE\noutput format. We then filter and distill this data into a pretrained\nLlama-3-8B, our carefully selected student model. Results on three difficult AV\ndatasets IMDb62, Blog-Auth, and Fanfiction show that CAVE generates high\nquality explanations (as measured by automatic and human evaluation) as well as\ncompetitive task accuracies.\n","authors":["Sahana Ramnath","Kartik Pandey","Elizabeth Boschee","Xiang Ren"],"pdf_url":"https://arxiv.org/pdf/2406.16672v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03277v1","updated":"2024-09-05T06:41:02Z","published":"2024-09-05T06:41:02Z","title":"ChartMoE: Mixture of Expert Connector for Advanced Chart Understanding","summary":" Automatic chart understanding is crucial for content comprehension and\ndocument parsing. Multimodal large language models (MLLMs) have demonstrated\nremarkable capabilities in chart understanding through domain-specific\nalignment and fine-tuning. However, the application of alignment training\nwithin the chart domain is still underexplored. To address this, we propose\nChartMoE, which employs the mixture of expert (MoE) architecture to replace the\ntraditional linear projector to bridge the modality gap. Specifically, we train\nmultiple linear connectors through distinct alignment tasks, which are utilized\nas the foundational initialization parameters for different experts.\nAdditionally, we introduce ChartMoE-Align, a dataset with over 900K\nchart-table-JSON-code quadruples to conduct three alignment tasks\n(chart-table/JSON/code). 
Combined with the vanilla connector, we initialize\ndifferent experts in four distinct ways and adopt high-quality knowledge\nlearning to further refine the MoE connector and LLM parameters. Extensive\nexperiments demonstrate the effectiveness of the MoE connector and our\ninitialization strategy, e.g., ChartMoE improves the accuracy of the previous\nstate-of-the-art from 80.48% to 84.64% on the ChartQA benchmark.\n","authors":["Zhengzhuo Xu","Bowen Qu","Yiyan Qi","Sinan Du","Chengjin Xu","Chun Yuan","Jian Guo"],"pdf_url":"https://arxiv.org/pdf/2409.03277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16567v3","updated":"2024-09-05T06:34:11Z","published":"2024-02-26T13:46:51Z","title":"Aligning Large Language Models to a Domain-specific Graph Database for\n NL2GQL","summary":" Graph Databases (Graph DB) find extensive application across diverse domains\nsuch as finance, social networks, and medicine. Yet, the translation of Natural\nLanguage (NL) into the Graph Query Language (GQL), referred to as NL2GQL, poses\nsignificant challenges owing to its intricate and specialized nature. Some\napproaches have sought to utilize Large Language Models (LLMs) to address\nanalogous tasks like text2SQL. Nonetheless, in the realm of NL2GQL tasks\ntailored to a particular domain, the absence of domain-specific NL-GQL data\npairs adds complexity to aligning LLMs with the graph DB. To tackle this\nchallenge, we present a well-defined pipeline. Initially, we utilize ChatGPT to\ngenerate NL-GQL data pairs, leveraging the provided graph DB with\nself-instruction. Subsequently, we employ the generated data to fine-tune LLMs,\nensuring alignment between LLMs and the graph DB. Moreover, we find the\nimportance of relevant schema in efficiently generating accurate GQLs. Thus, we\nintroduce a method to extract relevant schema as the input context. We evaluate\nour method using two carefully constructed datasets derived from graph DBs in\nthe finance and medicine domains, named FinGQL and MediGQL. Experimental\nresults reveal that our approach significantly outperforms a set of baseline\nmethods, with improvements of 5.90 and 6.36 absolute points on EM, and 6.00 and\n7.09 absolute points on EX for FinGQL and MediGQL, respectively.\n","authors":["Yuanyuan Liang","Keren Tan","Tingyu Xie","Wenbiao Tao","Siyuan Wang","Yunshi Lan","Weining Qian"],"pdf_url":"https://arxiv.org/pdf/2402.16567v3.pdf","comment":"13 pages,2 figures"},{"id":"http://arxiv.org/abs/2408.15966v2","updated":"2024-09-05T06:33:31Z","published":"2024-08-28T17:38:44Z","title":"More Text, Less Point: Towards 3D Data-Efficient Point-Language\n Understanding","summary":" Enabling Large Language Models (LLMs) to comprehend the 3D physical world\nremains a significant challenge. Due to the lack of large-scale 3D-text pair\ndatasets, the success of LLMs has yet to be replicated in 3D understanding. In\nthis paper, we rethink this issue and propose a new task: 3D Data-Efficient\nPoint-Language Understanding. The goal is to enable LLMs to achieve robust 3D\nobject understanding with minimal 3D point cloud and text data pairs. To\naddress this task, we introduce GreenPLM, which leverages more text data to\ncompensate for the lack of 3D data. First, inspired by using CLIP to align\nimages and text, we utilize a pre-trained point cloud-text encoder to map the\n3D point cloud space to the text space. This mapping leaves us to seamlessly\nconnect the text space with LLMs. 
Once the point-text-LLM connection is\nestablished, we further enhance text-LLM alignment by expanding the\nintermediate text space, thereby reducing the reliance on 3D point cloud data.\nSpecifically, we generate 6M free-text descriptions of 3D objects, and design a\nthree-stage training strategy to help LLMs better explore the intrinsic\nconnections between different modalities. To achieve efficient modality\nalignment, we design a zero-parameter cross-attention module for token pooling.\nExtensive experimental results show that GreenPLM requires only 12% of the 3D\ntraining data used by existing state-of-the-art models to achieve superior 3D\nunderstanding. Remarkably, GreenPLM also achieves competitive performance using\ntext-only data. The code and weights are available at:\nhttps://github.com/TangYuan96/GreenPLM.\n","authors":["Yuan Tang","Xu Han","Xianzhi Li","Qiao Yu","Jinfeng Xu","Yixue Hao","Long Hu","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03271v1","updated":"2024-09-05T06:28:05Z","published":"2024-09-05T06:28:05Z","title":"Strategic Chain-of-Thought: Guiding Accurate Reasoning in LLMs through\n Strategy Elicitation","summary":" The Chain-of-Thought (CoT) paradigm has emerged as a critical approach for\nenhancing the reasoning capabilities of large language models (LLMs). However,\ndespite their widespread adoption and success, CoT methods often exhibit\ninstability due to their inability to consistently ensure the quality of\ngenerated reasoning paths, leading to sub-optimal reasoning performance. To\naddress this challenge, we propose the \\textbf{Strategic Chain-of-Thought}\n(SCoT), a novel methodology designed to refine LLM performance by integrating\nstrategic knowledge prior to generating intermediate reasoning steps. SCoT\nemploys a two-stage approach within a single prompt: first eliciting an\neffective problem-solving strategy, which is then used to guide the generation\nof high-quality CoT paths and final answers. Our experiments across eight\nchallenging reasoning datasets demonstrate significant improvements, including\na 21.05\\% increase on the GSM8K dataset and 24.13\\% on the Tracking\\_Objects\ndataset, respectively, using the Llama3-8b model. Additionally, we extend the\nSCoT framework to develop a few-shot method with automatically matched\ndemonstrations, yielding even stronger results. These findings underscore the\nefficacy of SCoT, highlighting its potential to substantially enhance LLM\nperformance in complex reasoning tasks.\n","authors":["Yu Wang","Shiwan Zhao","Zhihu Wang","Heyuan Huang","Ming Fan","Yubo Zhang","Zhixing Wang","Haijun Wang","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2409.03271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02649v2","updated":"2024-09-05T06:20:36Z","published":"2024-09-04T12:26:26Z","title":"OpenFact at CheckThat! 2024: Combining Multiple Attack Methods for\n Effective Adversarial Text Generation","summary":" This paper presents the experiments and results for the CheckThat! Lab at\nCLEF 2024 Task 6: Robustness of Credibility Assessment with Adversarial\nExamples (InCrediblAE). 
The primary objective of this task was to generate\nadversarial examples in five problem domains in order to evaluate the\nrobustness of widely used text classification methods (fine-tuned BERT, BiLSTM,\nand RoBERTa) when applied to credibility assessment issues.\n This study explores the application of ensemble learning to enhance\nadversarial attacks on natural language processing (NLP) models. We\nsystematically tested and refined several adversarial attack methods, including\nBERT-Attack, Genetic algorithms, TextFooler, and CLARE, on five datasets across\nvarious misinformation tasks. By developing modified versions of BERT-Attack\nand hybrid methods, we achieved significant improvements in attack\neffectiveness. Our results demonstrate the potential of modification and\ncombining multiple methods to create more sophisticated and effective\nadversarial attack strategies, contributing to the development of more robust\nand secure systems.\n","authors":["Włodzimierz Lewoniewski","Piotr Stolarski","Milena Stróżyna","Elzbieta Lewańska","Aleksandra Wojewoda","Ewelina Księżniak","Marcin Sawiński"],"pdf_url":"https://arxiv.org/pdf/2409.02649v2.pdf","comment":"CLEF 2024 - Conference and Labs of the Evaluation Forum"},{"id":"http://arxiv.org/abs/2409.02387v2","updated":"2024-09-05T05:36:10Z","published":"2024-09-04T02:30:12Z","title":"Large Language Models and Cognitive Science: A Comprehensive Review of\n Similarities, Differences, and Challenges","summary":" This comprehensive review explores the intersection of Large Language Models\n(LLMs) and cognitive science, examining similarities and differences between\nLLMs and human cognitive processes. We analyze methods for evaluating LLMs\ncognitive abilities and discuss their potential as cognitive models. The review\ncovers applications of LLMs in various cognitive fields, highlighting insights\ngained for cognitive science research. We assess cognitive biases and\nlimitations of LLMs, along with proposed methods for improving their\nperformance. The integration of LLMs with cognitive architectures is examined,\nrevealing promising avenues for enhancing artificial intelligence (AI)\ncapabilities. Key challenges and future research directions are identified,\nemphasizing the need for continued refinement of LLMs to better align with\nhuman cognition. This review provides a balanced perspective on the current\nstate and future potential of LLMs in advancing our understanding of both\nartificial and human intelligence.\n","authors":["Qian Niu","Junyu Liu","Ziqian Bi","Pohsun Feng","Benji Peng","Keyu Chen"],"pdf_url":"https://arxiv.org/pdf/2409.02387v2.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.03258v1","updated":"2024-09-05T05:34:16Z","published":"2024-09-05T05:34:16Z","title":"GraphInsight: Unlocking Insights in Large Language Models for Graph\n Structure Understanding","summary":" Although Large Language Models (LLMs) have demonstrated potential in\nprocessing graphs, they struggle with comprehending graphical structure\ninformation through prompts of graph description sequences, especially as the\ngraph size increases. We attribute this challenge to the uneven memory\nperformance of LLMs across different positions in graph description sequences,\nknown as ''positional biases''. To address this, we propose GraphInsight, a\nnovel framework aimed at improving LLMs' comprehension of both macro- and\nmicro-level graphical information. 
GraphInsight is grounded in two key\nstrategies: 1) placing critical graphical information in positions where LLMs\nexhibit stronger memory performance, and 2) investigating a lightweight\nexternal knowledge base for regions with weaker memory performance, inspired by\nretrieval-augmented generation (RAG). Moreover, GraphInsight explores\nintegrating these two strategies into LLM agent processes for composite graph\ntasks that require multi-step reasoning. Extensive empirical studies on\nbenchmarks with a wide range of evaluation tasks show that GraphInsight\nsignificantly outperforms all other graph description methods (e.g., prompting\ntechniques and reordering strategies) in understanding graph structures of\nvarying sizes.\n","authors":["Yukun Cao","Shuo Han","Zengyi Gao","Zezhong Ding","Xike Xie","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.03258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03257v1","updated":"2024-09-05T05:31:29Z","published":"2024-09-05T05:31:29Z","title":"Understanding LLM Development Through Longitudinal Study: Insights from\n the Open Ko-LLM Leaderboard","summary":" This paper conducts a longitudinal study over eleven months to address the\nlimitations of prior research on the Open Ko-LLM Leaderboard, which have relied\non empirical studies with restricted observation periods of only five months.\nBy extending the analysis duration, we aim to provide a more comprehensive\nunderstanding of the progression in developing Korean large language models\n(LLMs). Our study is guided by three primary research questions: (1) What are\nthe specific challenges in improving LLM performance across diverse tasks on\nthe Open Ko-LLM Leaderboard over time? (2) How does model size impact task\nperformance correlations across various benchmarks? (3) How have the patterns\nin leaderboard rankings shifted over time on the Open Ko-LLM Leaderboard?. By\nanalyzing 1,769 models over this period, our research offers a comprehensive\nexamination of the ongoing advancements in LLMs and the evolving nature of\nevaluation frameworks.\n","authors":["Chanjun Park","Hyeonwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2409.03257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03256v1","updated":"2024-09-05T05:22:27Z","published":"2024-09-05T05:22:27Z","title":"E2CL: Exploration-based Error Correction Learning for Embodied Agents","summary":" Language models are exhibiting increasing capability in knowledge utilization\nand reasoning. However, when applied as agents in embodied environments, they\noften suffer from misalignment between their intrinsic knowledge and\nenvironmental knowledge, leading to infeasible actions. Traditional environment\nalignment methods, such as supervised learning on expert trajectories and\nreinforcement learning, face limitations in covering environmental knowledge\nand achieving efficient convergence, respectively. Inspired by human learning,\nwe propose Exploration-based Error Correction Learning (E2CL), a novel\nframework that leverages exploration-induced errors and environmental feedback\nto enhance environment alignment for LM-based agents. E2CL incorporates\nteacher-guided and teacher-free exploration to gather environmental feedback\nand correct erroneous actions. 
The agent learns to provide feedback and\nself-correct, thereby enhancing its adaptability to target environments.\nEvaluations in the Virtualhome environment demonstrate that E2CL-trained agents\noutperform those trained by baseline methods and exhibit superior\nself-correction capabilities.\n","authors":["Hanlin Wang","Chak Tou Leong","Jian Wang","Wenjie Li"],"pdf_url":"https://arxiv.org/pdf/2409.03256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03238v1","updated":"2024-09-05T04:38:49Z","published":"2024-09-05T04:38:49Z","title":"Preserving Empirical Probabilities in BERT for Small-sample Clinical\n Entity Recognition","summary":" Named Entity Recognition (NER) encounters the challenge of unbalanced labels,\nwhere certain entity types are overrepresented while others are\nunderrepresented in real-world datasets. This imbalance can lead to biased\nmodels that perform poorly on minority entity classes, impeding accurate and\nequitable entity recognition. This paper explores the effects of unbalanced\nentity labels of the BERT-based pre-trained model. We analyze the different\nmechanisms of loss calculation and loss propagation for the task of token\nclassification on randomized datasets. Then we propose ways to improve the\ntoken classification for the highly imbalanced task of clinical entity\nrecognition.\n","authors":["Abdul Rehman","Jian Jun Zhang","Xiaosong Yang"],"pdf_url":"https://arxiv.org/pdf/2409.03238v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.02897v2","updated":"2024-09-05T03:53:13Z","published":"2024-09-04T17:41:19Z","title":"LongCite: Enabling LLMs to Generate Fine-grained Citations in\n Long-context QA","summary":" Though current long-context large language models (LLMs) have demonstrated\nimpressive capacities in answering user questions based on extensive text, the\nlack of citations in their responses makes user verification difficult, leading\nto concerns about their trustworthiness due to their potential hallucinations.\nIn this work, we aim to enable long-context LLMs to generate responses with\nfine-grained sentence-level citations, improving their faithfulness and\nverifiability. We first introduce LongBench-Cite, an automated benchmark for\nassessing current LLMs' performance in Long-Context Question Answering with\nCitations (LQAC), revealing considerable room for improvement. To this end, we\npropose CoF (Coarse to Fine), a novel pipeline that utilizes off-the-shelf LLMs\nto automatically generate long-context QA instances with precise sentence-level\ncitations, and leverage this pipeline to construct LongCite-45k, a large-scale\nSFT dataset for LQAC. Finally, we train LongCite-8B and LongCite-9B using the\nLongCite-45k dataset, successfully enabling their generation of accurate\nresponses and fine-grained sentence-level citations in a single output. 
The\nevaluation results on LongBench-Cite show that our trained models achieve\nstate-of-the-art citation quality, surpassing advanced proprietary models\nincluding GPT-4o.\n","authors":["Jiajie Zhang","Yushi Bai","Xin Lv","Wanjun Gu","Danqing Liu","Minhao Zou","Shulin Cao","Lei Hou","Yuxiao Dong","Ling Feng","Juanzi Li"],"pdf_url":"https://arxiv.org/pdf/2409.02897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03225v1","updated":"2024-09-05T03:45:35Z","published":"2024-09-05T03:45:35Z","title":"Enhancing Healthcare LLM Trust with Atypical Presentations Recalibration","summary":" Black-box large language models (LLMs) are increasingly deployed in various\nenvironments, making it essential for these models to effectively convey their\nconfidence and uncertainty, especially in high-stakes settings. However, these\nmodels often exhibit overconfidence, leading to potential risks and\nmisjudgments. Existing techniques for eliciting and calibrating LLM confidence\nhave primarily focused on general reasoning datasets, yielding only modest\nimprovements. Accurate calibration is crucial for informed decision-making and\npreventing adverse outcomes but remains challenging due to the complexity and\nvariability of tasks these models perform. In this work, we investigate the\nmiscalibration behavior of black-box LLMs within the healthcare setting. We\npropose a novel method, \\textit{Atypical Presentations Recalibration}, which\nleverages atypical presentations to adjust the model's confidence estimates.\nOur approach significantly improves calibration, reducing calibration errors by\napproximately 60\\% on three medical question answering datasets and\noutperforming existing methods such as vanilla verbalized confidence, CoT\nverbalized confidence and others. Additionally, we provide an in-depth analysis\nof the role of atypicality within the recalibration framework.\n","authors":["Jeremy Qin","Bang Liu","Quoc Dinh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.03225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03215v1","updated":"2024-09-05T03:22:22Z","published":"2024-09-05T03:22:22Z","title":"xLAM: A Family of Large Action Models to Empower AI Agent Systems","summary":" Autonomous agents powered by large language models (LLMs) have attracted\nsignificant research interest. However, the open-source community faces many\nchallenges in developing specialized models for agent tasks, driven by the\nscarcity of high-quality agent datasets and the absence of standard protocols\nin this area. We introduce and publicly release xLAM, a series of large action\nmodels designed for AI agent tasks. The xLAM series includes five models with\nboth dense and mixture-of-expert architectures, ranging from 1B to 8x22B\nparameters, trained using a scalable, flexible pipeline that unifies, augments,\nand synthesizes diverse datasets to enhance AI agents' generalizability and\nperformance across varied environments. Our experimental results demonstrate\nthat xLAM consistently delivers exceptional performance across multiple agent\nability benchmarks, notably securing the 1st position on the Berkeley\nFunction-Calling Leaderboard, outperforming GPT-4, Claude-3, and many other\nmodels in terms of tool use. By releasing the xLAM series, we aim to advance\nthe performance of open-source LLMs for autonomous AI agents, potentially\naccelerating progress and democratizing access to high-performance models for\nagent tasks. 
Models are available at\nhttps://huggingface.co/collections/Salesforce/xlam-models-65f00e2a0a63bbcd1c2dade4\n","authors":["Jianguo Zhang","Tian Lan","Ming Zhu","Zuxin Liu","Thai Hoang","Shirley Kokane","Weiran Yao","Juntao Tan","Akshara Prabhakar","Haolin Chen","Zhiwei Liu","Yihao Feng","Tulika Awalgaonkar","Rithesh Murthy","Eric Hu","Zeyuan Chen","Ran Xu","Juan Carlos Niebles","Shelby Heinecke","Huan Wang","Silvio Savarese","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2409.03215v1.pdf","comment":"Technical report for the Salesforce xLAM model series"},{"id":"http://arxiv.org/abs/2409.03203v1","updated":"2024-09-05T02:51:28Z","published":"2024-09-05T02:51:28Z","title":"An Effective Deployment of Diffusion LM for Data Augmentation in\n Low-Resource Sentiment Classification","summary":" Sentiment classification (SC) often suffers from low-resource challenges such\nas domain-specific contexts, imbalanced label distributions, and few-shot\nscenarios. The potential of the diffusion language model (LM) for textual data\naugmentation (DA) remains unexplored, moreover, textual DA methods struggle to\nbalance the diversity and consistency of new samples. Most DA methods either\nperform logical modifications or rephrase less important tokens in the original\nsequence with the language model. In the context of SC, strong emotional tokens\ncould act critically on the sentiment of the whole sequence. Therefore,\ncontrary to rephrasing less important context, we propose DiffusionCLS to\nleverage a diffusion LM to capture in-domain knowledge and generate pseudo\nsamples by reconstructing strong label-related tokens. This approach ensures a\nbalance between consistency and diversity, avoiding the introduction of noise\nand augmenting crucial features of datasets. DiffusionCLS also comprises a\nNoise-Resistant Training objective to help the model generalize. Experiments\ndemonstrate the effectiveness of our method in various low-resource scenarios\nincluding domain-specific and domain-general problems. Ablation studies confirm\nthe effectiveness of our framework's modules, and visualization studies\nhighlight optimal deployment conditions, reinforcing our conclusions.\n","authors":["Zhuowei Chen","Lianxi Wang","Yuben Wu","Xinfeng Liao","Yujia Tian","Junyang Zhong"],"pdf_url":"https://arxiv.org/pdf/2409.03203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00935v2","updated":"2024-09-05T02:39:23Z","published":"2023-10-02T06:57:45Z","title":"Resolving Knowledge Conflicts in Large Language Models","summary":" Large language models (LLMs) often encounter knowledge conflicts, scenarios\nwhere discrepancy arises between the internal parametric knowledge of LLMs and\nnon-parametric information provided in the prompt context. In this work we ask\nwhat are the desiderata for LLMs when a knowledge conflict arises and whether\nexisting LLMs fulfill them. We posit that LLMs should 1) identify knowledge\nconflicts, 2) pinpoint conflicting information segments, and 3) provide\ndistinct answers or viewpoints in conflicting scenarios. To this end, we\nintroduce KNOWLEDGE CONFLICT, an evaluation framework for simulating contextual\nknowledge conflicts and quantitatively evaluating to what extent LLMs achieve\nthese goals. KNOWLEDGE CONFLICT includes diverse and complex situations of\nknowledge conflict, knowledge from diverse entities and domains, two synthetic\nconflict creation methods, and settings with progressively increasing\ndifficulty to reflect realistic knowledge conflicts. 
Extensive experiments with\nthe KNOWLEDGE CONFLICT framework reveal that while LLMs perform well in\nidentifying the existence of knowledge conflicts, they struggle to determine\nthe specific conflicting knowledge and produce a response with distinct answers\namidst conflicting information. To address these challenges, we propose new\ninstruction-based approaches that augment LLMs to better achieve the three\ngoals. Further analysis shows that abilities to tackle knowledge conflicts are\ngreatly impacted by factors such as knowledge domain and prompt text, while\ngenerating robust responses to knowledge conflict scenarios remains an open\nresearch question.\n","authors":["Yike Wang","Shangbin Feng","Heng Wang","Weijia Shi","Vidhisha Balachandran","Tianxing He","Yulia Tsvetkov"],"pdf_url":"https://arxiv.org/pdf/2310.00935v2.pdf","comment":"Published at COLM 2024"},{"id":"http://arxiv.org/abs/2409.03183v1","updated":"2024-09-05T02:19:34Z","published":"2024-09-05T02:19:34Z","title":"Bypassing DARCY Defense: Indistinguishable Universal Adversarial\n Triggers","summary":" Neural networks (NN) classification models for Natural Language Processing\n(NLP) are vulnerable to the Universal Adversarial Triggers (UAT) attack that\ntriggers a model to produce a specific prediction for any input. DARCY borrows\nthe \"honeypot\" concept to bait multiple trapdoors, effectively detecting the\nadversarial examples generated by UAT. Unfortunately, we find a new UAT\ngeneration method, called IndisUAT, which produces triggers (i.e., tokens) and\nuses them to craft adversarial examples whose feature distribution is\nindistinguishable from that of the benign examples in a randomly-chosen\ncategory at the detection layer of DARCY. The produced adversarial examples\nincur the maximal loss of predicting results in the DARCY-protected models.\nMeanwhile, the produced triggers are effective in black-box models for text\ngeneration, text inference, and reading comprehension. Finally, the evaluation\nresults under NN models for NLP tasks indicate that the IndisUAT method can\neffectively circumvent DARCY and penetrate other defenses. For example,\nIndisUAT can reduce the true positive rate of DARCY's detection by at least\n40.8% and 90.6%, and drop the accuracy by at least 33.3% and 51.6% in the RNN\nand CNN models, respectively. IndisUAT reduces the accuracy of the BERT's\nadversarial defense model by at least 34.0%, and makes the GPT-2 language model\nspew racist outputs even when conditioned on non-racial context.\n","authors":["Zuquan Peng","Yuanyuan He","Jianbing Ni","Ben Niu"],"pdf_url":"https://arxiv.org/pdf/2409.03183v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.13958v2","updated":"2024-09-05T02:07:11Z","published":"2024-08-25T23:41:39Z","title":"Prediction of COPD Using Machine Learning, Clinical Summary Notes, and\n Vital Signs","summary":" Chronic obstructive pulmonary disease (COPD) is a chronic inflammatory lung\ndisease that causes obstructed airflow from the lungs. In the United States,\nmore than 15.7 million Americans have been diagnosed with COPD, with 96% of\nindividuals living with at least one other chronic health condition. It is the\n4th leading cause of death in the country. Over 2.2 million patients are\nadmitted to hospitals annually due to COPD exacerbations. Monitoring and\npredicting patient exacerbations on-time could save their life. 
This paper\npresents two different predictive models to predict COPD exacerbation using AI\nand natural language processing (NLP) approaches. These models use respiration\nsummary notes, symptoms, and vital signs. To train and test these models, data\nrecords containing physiologic signals and vital signs time series were used.\nThese records were captured from patient monitors and comprehensive clinical\ndata obtained from hospital medical information systems for tens of thousands\nof Intensive Care Unit (ICU) patients. We achieved an area under the Receiver\noperating characteristic (ROC) curve of 0.82 in detection and prediction of\nCOPD exacerbation.\n","authors":["Negar Orangi-Fard"],"pdf_url":"https://arxiv.org/pdf/2408.13958v2.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.03171v1","updated":"2024-09-05T01:58:29Z","published":"2024-09-05T01:58:29Z","title":"MARAGS: A Multi-Adapter System for Multi-Task Retrieval Augmented\n Generation Question Answering","summary":" In this paper we present a multi-adapter retrieval augmented generation\nsystem (MARAGS) for Meta's Comprehensive RAG (CRAG) competition for KDD CUP\n2024. CRAG is a question answering dataset contains 3 different subtasks aimed\nat realistic question and answering RAG related tasks, with a diverse set of\nquestion topics, question types, time dynamic answers, and questions featuring\nentities of varying popularity.\n Our system follows a standard setup for web based RAG, which uses processed\nweb pages to provide context for an LLM to produce generations, while also\nquerying API endpoints for additional information. MARAGS also utilizes\nmultiple different adapters to solve the various requirements for these tasks\nwith a standard cross-encoder model for ranking candidate passages relevant for\nanswering the question. Our system achieved 2nd place for Task 1 as well as 3rd\nplace on Task 2.\n","authors":["Mitchell DeHaven"],"pdf_url":"https://arxiv.org/pdf/2409.03171v1.pdf","comment":"Accepted to CRAG KDD Cup 24 Workshop"},{"id":"http://arxiv.org/abs/2409.03166v1","updated":"2024-09-05T01:51:54Z","published":"2024-09-05T01:51:54Z","title":"Continual Skill and Task Learning via Dialogue","summary":" Continual and interactive robot learning is a challenging problem as the\nrobot is present with human users who expect the robot to learn novel skills to\nsolve novel tasks perpetually with sample efficiency. In this work we present a\nframework for robots to query and learn visuo-motor robot skills and task\nrelevant information via natural language dialog interactions with human users.\nPrevious approaches either focus on improving the performance of instruction\nfollowing agents, or passively learn novel skills or concepts. Instead, we used\ndialog combined with a language-skill grounding embedding to query or confirm\nskills and/or tasks requested by a user. To achieve this goal, we developed and\nintegrated three different components for our agent. Firstly, we propose a\nnovel visual-motor control policy ACT with Low Rank Adaptation (ACT-LoRA),\nwhich enables the existing SoTA ACT model to perform few-shot continual\nlearning. Secondly, we develop an alignment model that projects demonstrations\nacross skill embodiments into a shared embedding allowing us to know when to\nask questions and/or demonstrations from users. Finally, we integrated an\nexisting LLM to interact with a human user to perform grounded interactive\ncontinual skill learning to solve a task. 
Our ACT-LoRA model learns novel\nfine-tuned skills with a 100% accuracy when trained with only five\ndemonstrations for a novel skill while still maintaining a 74.75% accuracy on\npre-trained skills in the RLBench dataset where other models fall significantly\nshort. We also performed a human-subjects study with 8 subjects to demonstrate\nthe continual learning capabilities of our combined framework. We achieve a\nsuccess rate of 75% in the task of sandwich making with the real robot learning\nfrom participant data demonstrating that robots can learn novel skills or task\nknowledge from dialogue with non-expert users using our approach.\n","authors":["Weiwei Gu","Suresh Kondepudi","Lixiao Huang","Nakul Gopalan"],"pdf_url":"https://arxiv.org/pdf/2409.03166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03161v1","updated":"2024-09-05T01:36:00Z","published":"2024-09-05T01:36:00Z","title":"MaterialBENCH: Evaluating College-Level Materials Science\n Problem-Solving Abilities of Large Language Models","summary":" A college-level benchmark dataset for large language models (LLMs) in the\nmaterials science field, MaterialBENCH, is constructed. This dataset consists\nof problem-answer pairs, based on university textbooks. There are two types of\nproblems: one is the free-response answer type, and the other is the\nmultiple-choice type. Multiple-choice problems are constructed by adding three\nincorrect answers as choices to a correct answer, so that LLMs can choose one\nof the four as a response. Most of the problems for free-response answer and\nmultiple-choice types overlap except for the format of the answers. We also\nconduct experiments using the MaterialBENCH on LLMs, including ChatGPT-3.5,\nChatGPT-4, Bard (at the time of the experiments), and GPT-3.5 and GPT-4 with\nthe OpenAI API. The differences and similarities in the performance of LLMs\nmeasured by the MaterialBENCH are analyzed and discussed. Performance\ndifferences between the free-response type and multiple-choice type in the same\nmodels and the influence of using system messages on multiple-choice problems\nare also studied. We anticipate that MaterialBENCH will encourage further\ndevelopments of LLMs in reasoning abilities to solve more complicated problems\nand eventually contribute to materials research and discovery.\n","authors":["Michiko Yoshitake","Yuta Suzuki","Ryo Igarashi","Yoshitaka Ushiku","Keisuke Nagato"],"pdf_url":"https://arxiv.org/pdf/2409.03161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03155v1","updated":"2024-09-05T01:11:58Z","published":"2024-09-05T01:11:58Z","title":"Debate on Graph: a Flexible and Reliable Reasoning Framework for Large\n Language Models","summary":" Large Language Models (LLMs) may suffer from hallucinations in real-world\napplications due to the lack of relevant knowledge. In contrast, knowledge\ngraphs encompass extensive, multi-relational structures that store a vast array\nof symbolic facts. Consequently, integrating LLMs with knowledge graphs has\nbeen extensively explored, with Knowledge Graph Question Answering (KGQA)\nserving as a critical touchstone for the integration. This task requires LLMs\nto answer natural language questions by retrieving relevant triples from\nknowledge graphs. However, existing methods face two significant challenges:\n\\textit{excessively long reasoning paths distracting from the answer\ngeneration}, and \\textit{false-positive relations hindering the path\nrefinement}. 
In this paper, we propose an iterative interactive KGQA framework\nthat leverages the interactive learning capabilities of LLMs to perform\nreasoning and Debating over Graphs (DoG). Specifically, DoG employs a\nsubgraph-focusing mechanism, allowing LLMs to perform answer trying after each\nreasoning step, thereby mitigating the impact of lengthy reasoning paths. On\nthe other hand, DoG utilizes a multi-role debate team to gradually simplify\ncomplex questions, reducing the influence of false-positive relations. This\ndebate mechanism ensures the reliability of the reasoning process. Experimental\nresults on five public datasets demonstrate the effectiveness and superiority\nof our architecture. Notably, DoG outperforms the state-of-the-art method ToG\nby 23.7\\% and 9.1\\% in accuracy on WebQuestions and GrailQA, respectively.\nFurthermore, the integration experiments with various LLMs on the mentioned\ndatasets highlight the flexibility of DoG. Code is available at\n\\url{https://github.com/reml-group/DoG}.\n","authors":["Jie Ma","Zhitao Gao","Qi Chai","Wangchun Sun","Pinghui Wang","Hongbin Pei","Jing Tao","Lingyun Song","Jun Liu","Chen Zhang","Lizhen Cui"],"pdf_url":"https://arxiv.org/pdf/2409.03155v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2405.10443v3","updated":"2024-09-05T01:06:40Z","published":"2024-05-16T21:07:42Z","title":"Simultaneous Masking, Not Prompting Optimization: A Paradigm Shift in\n Fine-tuning LLMs for Simultaneous Translation","summary":" Large language models (LLMs) have achieved state-of-the-art performance in\nvarious language processing tasks, motivating their adoption in simultaneous\ntranslation. Current fine-tuning methods to adapt LLMs for simultaneous\ntranslation focus on prompting optimization strategies using either data\naugmentation or prompt structure modifications. However, these methods suffer\nfrom several issues, such as unnecessarily expanded training sets,\ncomputational inefficiency from dumping the key and value cache, increased\nprompt sizes, or restriction to a single decision policy. To eliminate these\nissues, in this work, we propose SimulMask, a new paradigm for fine-tuning LLMs\nfor simultaneous translation. It utilizes a novel attention mask approach that\nmodels simultaneous translation during fine-tuning by masking attention for a\ndesired decision policy. Applying the proposed SimulMask on a Falcon LLM for\nthe IWSLT 2017 dataset, we have observed a significant translation quality\nimprovement compared to state-of-the-art prompting optimization strategies on\nfive language pairs while reducing the computational cost.\n","authors":["Matthew Raffel","Victor Agostinelli","Lizhong Chen"],"pdf_url":"https://arxiv.org/pdf/2405.10443v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03140v1","updated":"2024-09-05T00:25:37Z","published":"2024-09-05T00:25:37Z","title":"GraphEx: A Graph-based Extraction Method for Advertiser Keyphrase\n Recommendation","summary":" Online sellers and advertisers are recommended keyphrases for their listed\nproducts, which they bid on to enhance their sales. One popular paradigm that\ngenerates such recommendations is Extreme Multi-Label Classification (XMC),\nwhich involves tagging/mapping keyphrases to items. We outline the limitations\nof using traditional item-query based tagging or mapping techniques for\nkeyphrase recommendations on E-Commerce platforms. 
We introduce GraphEx, an\ninnovative graph-based approach that recommends keyphrases to sellers using\nextraction of token permutations from item titles. Additionally, we demonstrate\nthat relying on traditional metrics such as precision/recall can be misleading\nin practical applications, thereby necessitating a combination of metrics to\nevaluate performance in real-world scenarios. These metrics are designed to\nassess the relevance of keyphrases to items and the potential for buyer\noutreach. GraphEx outperforms production models at eBay, achieving the\nobjectives mentioned above. It supports near real-time inferencing in\nresource-constrained production environments and scales effectively for\nbillions of items.\n","authors":["Ashirbad Mishra","Soumik Dey","Marshall Wu","Jinyu Zhao","He Yu","Kaichen Ni","Binbin Li","Kamesh Madduri"],"pdf_url":"https://arxiv.org/pdf/2409.03140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16966v2","updated":"2024-09-05T23:18:00Z","published":"2024-08-30T01:56:57Z","title":"UserSumBench: A Benchmark Framework for Evaluating User Summarization\n Approaches","summary":" Large language models (LLMs) have shown remarkable capabilities in generating\nuser summaries from a long list of raw user activity data. These summaries\ncapture essential user information such as preferences and interests, and\ntherefore are invaluable for LLM-based personalization applications, such as\nexplainable recommender systems. However, the development of new summarization\ntechniques is hindered by the lack of ground-truth labels, the inherent\nsubjectivity of user summaries, and human evaluation which is often costly and\ntime-consuming. To address these challenges, we introduce \\UserSumBench, a\nbenchmark framework designed to facilitate iterative development of LLM-based\nsummarization approaches. This framework offers two key components: (1) A\nreference-free summary quality metric. We show that this metric is effective\nand aligned with human preferences across three diverse datasets (MovieLens,\nYelp and Amazon Review). (2) A novel robust summarization method that leverages\ntime-hierarchical summarizer and self-critique verifier to produce high-quality\nsummaries while eliminating hallucination. This method serves as a strong\nbaseline for further innovation in summarization techniques.\n","authors":["Chao Wang","Neo Wu","Lin Ning","Jiaxing Wu","Luyang Liu","Jun Xie","Shawn O'Banion","Bradley Green"],"pdf_url":"https://arxiv.org/pdf/2408.16966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03939v1","updated":"2024-09-05T23:17:18Z","published":"2024-09-05T23:17:18Z","title":"Experimentation in Content Moderation using RWKV","summary":" This paper investigates the RWKV model's efficacy in content moderation\nthrough targeted experimentation. We introduce a novel dataset specifically\ndesigned for distillation into smaller models, enhancing content moderation\npractices. This comprehensive dataset encompasses images, videos, sounds, and\ntext data that present societal challenges. Leveraging advanced Large Language\nModels (LLMs), we generated an extensive set of responses -- 558,958 for text\nand 83,625 for images -- to train and refine content moderation systems. Our\ncore experimentation involved fine-tuning the RWKV model, capitalizing on its\nCPU-efficient architecture to address large-scale content moderation tasks. 
By\nhighlighting the dataset's potential for knowledge distillation, this study not\nonly demonstrates RWKV's capability in improving the accuracy and efficiency of\ncontent moderation systems but also paves the way for developing more compact,\nresource-efficient models in this domain. Datasets and models can be found in\nHuggingFace: https://huggingface.co/modrwkv\n","authors":["Umut Yildirim","Rohan Dutta","Burak Yildirim","Atharva Vaidya"],"pdf_url":"https://arxiv.org/pdf/2409.03939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03905v1","updated":"2024-09-05T20:42:35Z","published":"2024-09-05T20:42:35Z","title":"CACER: Clinical Concept Annotations for Cancer Events and Relations","summary":" Clinical notes contain unstructured representations of patient histories,\nincluding the relationships between medical problems and prescription drugs. To\ninvestigate the relationship between cancer drugs and their associated symptom\nburden, we extract structured, semantic representations of medical problem and\ndrug information from the clinical narratives of oncology notes. We present\nClinical Concept Annotations for Cancer Events and Relations (CACER), a novel\ncorpus with fine-grained annotations for over 48,000 medical problems and drug\nevents and 10,000 drug-problem and problem-problem relations. Leveraging CACER,\nwe develop and evaluate transformer-based information extraction (IE) models\nsuch as BERT, Flan-T5, Llama3, and GPT-4 using fine-tuning and in-context\nlearning (ICL). In event extraction, the fine-tuned BERT and Llama3 models\nachieved the highest performance at 88.2-88.0 F1, which is comparable to the\ninter-annotator agreement (IAA) of 88.4 F1. In relation extraction, the\nfine-tuned BERT, Flan-T5, and Llama3 achieved the highest performance at\n61.8-65.3 F1. GPT-4 with ICL achieved the worst performance across both tasks.\nThe fine-tuned models significantly outperformed GPT-4 in ICL, highlighting the\nimportance of annotated training data and model optimization. Furthermore, the\nBERT models performed similarly to Llama3. For our task, LLMs offer no\nperformance advantage over the smaller BERT models. The results emphasize the\nneed for annotated training data to optimize models. Multiple fine-tuned\ntransformer models achieved performance comparable to IAA for several\nextraction tasks.\n","authors":["Yujuan Fu","Giridhar Kaushik Ramachandran","Ahmad Halwani","Bridget T. McInnes","Fei Xia","Kevin Lybarger","Meliha Yetisgen","Özlem Uzuner"],"pdf_url":"https://arxiv.org/pdf/2409.03905v1.pdf","comment":"This is a pre-copy-editing, author-produced PDF of an article\n accepted for publication in JAMIA following peer review. The definitive\n publisher-authenticated version is available online at\n https://academic.oup.com/jamia/advance-article/doi/10.1093/jamia/ocae231/7748302"},{"id":"http://arxiv.org/abs/2409.03856v1","updated":"2024-09-05T18:38:07Z","published":"2024-09-05T18:38:07Z","title":"Sirius: Contextual Sparsity with Correction for Efficient LLMs","summary":" With the blossom of large language models (LLMs), inference efficiency\nbecomes increasingly important. Various approximation methods are proposed to\nreduce the cost at inference time. Contextual Sparsity (CS) is appealing for\nits training-free nature and its ability to reach a higher compression ratio\nseemingly without quality degradation. 
However, after a comprehensive\nevaluation of contextual sparsity methods on various complex generation tasks,\nwe find that although CS succeeds in prompt-understanding tasks, it\nsignificantly degrades model performance on reasoning, deduction, and\nknowledge-based tasks. Despite the gap in end-to-end accuracy, we observed that\nsparse models often share general problem-solving logic and require only a few\ntoken corrections to recover the original model performance. This paper\nintroduces Sirius, an efficient correction mechanism, which significantly\nrecovers CS models' quality on reasoning tasks while maintaining their\nefficiency gain. Sirius is evaluated on 6 models with 8 difficult generation\ntasks in reasoning, math, and coding and shows consistent effectiveness and\nefficiency. Also, we carefully develop a system implementation for Sirius and\nshow that Sirius achieves roughly a 20% reduction in latency for the 8B model\non-chip and a 35% reduction for the 70B model with offloading. We open-source\nour implementation of Sirius at https://github.com/Infini-AI-Lab/Sirius.git.\n","authors":["Yang Zhou","Zhuoming Chen","Zhaozhuo Xu","Victoria Lin","Beidi Chen"],"pdf_url":"https://arxiv.org/pdf/2409.03856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07088v6","updated":"2024-09-05T18:26:56Z","published":"2024-03-11T18:26:02Z","title":"SPA: Towards A Computational Friendly Cloud-Base and On-Devices\n Collaboration Seq2seq Personalized Generation with Casual Inference","summary":" Large language models (LLMs) have shown outstanding ability on various\ntasks and question answering. However, LLMs require substantial memory storage\non low-resource devices. More critically, the computational speed on these\ndevices is also severely limited. In this paper, we propose SPA (Side Plugin\nAdaption), a lightweight architecture for fast on-device inference under strict\non-device computation and memory constraints. Compared with other on-device\nseq2seq generation approaches, SPA achieves fast and stable inference under\nlow-resource constraints, allowing it to obtain cost efficiency. Our method\nestablishes an interaction between a pretrained LLM on the cloud and additive\nparameters on the device, which can provide knowledge from both the pretrained\nLLM and personalized features. Furthermore, SPA provides a framework to keep\nfeature-based parameters on low-computation devices while leaving the\nparameters containing general information on high-computation devices.\n","authors":["Yanming Liu","Xinyue Peng","Shi Bo","Ningjing Sang","Yafeng Yan","Xiaolan Ke","Zhiting Zheng","Shaobo Liu","Songhang Deng","Jiannan Cao","Le Dai","Xingzu Liu","Ruilin Nong","Weihao Liu"],"pdf_url":"https://arxiv.org/pdf/2403.07088v6.pdf","comment":"12 pages, third version of SPA(Side Plugin Adaption)"},{"id":"http://arxiv.org/abs/2409.03843v1","updated":"2024-09-05T18:08:47Z","published":"2024-09-05T18:08:47Z","title":"Persona Setting Pitfall: Persistent Outgroup Biases in Large Language\n Models Arising from Social Identity Adoption","summary":" Drawing parallels between human cognition and artificial intelligence, we\nexplored how large language models (LLMs) internalize identities imposed by\ntargeted prompts. Informed by Social Identity Theory, these identity\nassignments lead LLMs to distinguish between \"we\" (the ingroup) and \"they\" (the\noutgroup). This self-categorization generates both ingroup favoritism and\noutgroup bias. 
Nonetheless, existing literature has predominantly focused on\ningroup favoritism, often overlooking outgroup bias, which is a fundamental\nsource of intergroup prejudice and discrimination. Our experiment addresses\nthis gap by demonstrating that outgroup bias manifests as strongly as ingroup\nfavoritism. Furthermore, we successfully mitigated the inherent pro-liberal,\nanti-conservative bias in LLMs by guiding them to adopt the perspectives of the\ninitially disfavored group. These results were replicated in the context of\ngender bias. Our findings highlight the potential to develop more equitable and\nbalanced language models.\n","authors":["Wenchao Dong","Assem Zhunis","Dongyoung Jeong","Hyojin Chin","Jiyoung Han","Meeyoung Cha"],"pdf_url":"https://arxiv.org/pdf/2409.03843v1.pdf","comment":"23 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.03810v1","updated":"2024-09-05T17:46:30Z","published":"2024-09-05T17:46:30Z","title":"How Do Your Code LLMs Perform? Empowering Code Instruction Tuning with\n High-Quality Data","summary":" Recently, there has been a growing interest in studying how to construct\nbetter code instruction tuning data. However, we observe Code models trained\nwith these datasets exhibit high performance on HumanEval but perform worse on\nother benchmarks such as LiveCodeBench. Upon further investigation, we find\nthat many datasets suffer from severe data leakage. After cleaning up most of\nthe leaked data, some well-known high-quality datasets perform poorly. This\ndiscovery reveals a new challenge: identifying which dataset genuinely qualify\nas high-quality code instruction data. To address this, we propose an efficient\ncode data pruning strategy for selecting good samples. Our approach is based on\nthree dimensions: instruction complexity, response quality, and instruction\ndiversity. Based on our selected data, we present XCoder, a family of models\nfinetuned from LLaMA3. Our experiments show XCoder achieves new\nstate-of-the-art performance using fewer training data, which verify the\neffectiveness of our data strategy. Moreover, we perform a comprehensive\nanalysis on the data composition and find existing code datasets have different\ncharacteristics according to their construction methods, which provide new\ninsights for future code LLMs. Our models and dataset are released in\nhttps://github.com/banksy23/XCoder\n","authors":["Yejie Wang","Keqing He","Dayuan Fu","Zhuoma Gongque","Heyang Xu","Yanxu Chen","Zhexu Wang","Yujia Fu","Guanting Dong","Muxi Diao","Jingang Wang","Mengdi Zhang","Xunliang Cai","Weiran Xu"],"pdf_url":"https://arxiv.org/pdf/2409.03810v1.pdf","comment":"Working in progress"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.03757v1","updated":"2024-09-05T17:59:56Z","published":"2024-09-05T17:59:56Z","title":"Lexicon3D: Probing Visual Foundation Models for Complex 3D Scene\n Understanding","summary":" Complex 3D scene understanding has gained increasing attention, with scene\nencoding strategies playing a crucial role in this success. However, the\noptimal scene encoding strategies for various scenarios remain unclear,\nparticularly compared to their image-based counterparts. To address this issue,\nwe present a comprehensive study that probes various visual encoding models for\n3D scene understanding, identifying the strengths and limitations of each model\nacross different scenarios. Our evaluation spans seven vision foundation\nencoders, including image-based, video-based, and 3D foundation models. 
We\nevaluate these models in four tasks: Vision-Language Scene Reasoning, Visual\nGrounding, Segmentation, and Registration, each focusing on different aspects\nof scene understanding. Our evaluations yield key findings: DINOv2 demonstrates\nsuperior performance, video models excel in object-level tasks, diffusion\nmodels benefit geometric tasks, and language-pretrained models show unexpected\nlimitations in language-related tasks. These insights challenge some\nconventional understandings, provide novel perspectives on leveraging visual\nfoundation models, and highlight the need for more flexible encoder selection\nin future vision-language and scene-understanding tasks.\n","authors":["Yunze Man","Shuhong Zheng","Zhipeng Bao","Martial Hebert","Liang-Yan Gui","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2409.03757v1.pdf","comment":"Project page: https://yunzeman.github.io/lexicon3d , Github:\n https://github.com/YunzeMan/Lexicon3D"},{"id":"http://arxiv.org/abs/2409.03755v1","updated":"2024-09-05T17:59:46Z","published":"2024-09-05T17:59:46Z","title":"DC-Solver: Improving Predictor-Corrector Diffusion Sampler via Dynamic\n Compensation","summary":" Diffusion probabilistic models (DPMs) have shown remarkable performance in\nvisual synthesis but are computationally expensive due to the need for multiple\nevaluations during the sampling. Recent predictor-corrector diffusion samplers\nhave significantly reduced the required number of function evaluations (NFE),\nbut inherently suffer from a misalignment issue caused by the extra corrector\nstep, especially with a large classifier-free guidance scale (CFG). In this\npaper, we introduce a new fast DPM sampler called DC-Solver, which leverages\ndynamic compensation (DC) to mitigate the misalignment of the\npredictor-corrector samplers. The dynamic compensation is controlled by\ncompensation ratios that are adaptive to the sampling steps and can be\noptimized on only 10 datapoints by pushing the sampling trajectory toward a\nground truth trajectory. We further propose a cascade polynomial regression\n(CPR) which can instantly predict the compensation ratios on unseen sampling\nconfigurations. Additionally, we find that the proposed dynamic compensation\ncan also serve as a plug-and-play module to boost the performance of\npredictor-only samplers. Extensive experiments on both unconditional sampling\nand conditional sampling demonstrate that our DC-Solver can consistently\nimprove the sampling quality over previous methods on different DPMs with a\nwide range of resolutions up to 1024$\\times$1024. Notably, we achieve 10.38 FID\n(NFE=5) on unconditional FFHQ and 0.394 MSE (NFE=5, CFG=7.5) on\nStable-Diffusion-2.1. Code is available at https://github.com/wl-zhao/DC-Solver\n","authors":["Wenliang Zhao","Haolin Wang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2409.03755v1.pdf","comment":"Accepted by ECCV 2024"},{"id":"http://arxiv.org/abs/2409.03754v1","updated":"2024-09-05T17:59:32Z","published":"2024-09-05T17:59:32Z","title":"Foundation Model or Finetune? Evaluation of few-shot semantic\n segmentation for river pollution","summary":" Foundation models (FMs) are a popular topic of research in AI. Their ability\nto generalize to new tasks and datasets without retraining or needing an\nabundance of data makes them an appealing candidate for applications on\nspecialist datasets. In this work, we compare the performance of FMs to\nfinetuned pre-trained supervised models in the task of semantic segmentation on\nan entirely new dataset. 
We see that finetuned models consistently outperform\nthe FMs tested, even in cases were data is scarce. We release the code and\ndataset for this work on GitHub.\n","authors":["Marga Don","Stijn Pinson","Blanca Guillen Cebrian","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2409.03754v1.pdf","comment":"Accepted at ECCV 2024 Green Foundation Models workshop"},{"id":"http://arxiv.org/abs/2409.03745v1","updated":"2024-09-05T17:57:59Z","published":"2024-09-05T17:57:59Z","title":"ArtiFade: Learning to Generate High-quality Subject from Blemished\n Images","summary":" Subject-driven text-to-image generation has witnessed remarkable advancements\nin its ability to learn and capture characteristics of a subject using only a\nlimited number of images. However, existing methods commonly rely on\nhigh-quality images for training and may struggle to generate reasonable images\nwhen the input images are blemished by artifacts. This is primarily attributed\nto the inadequate capability of current techniques in distinguishing\nsubject-related features from disruptive artifacts. In this paper, we introduce\nArtiFade to tackle this issue and successfully generate high-quality\nartifact-free images from blemished datasets. Specifically, ArtiFade exploits\nfine-tuning of a pre-trained text-to-image model, aiming to remove artifacts.\nThe elimination of artifacts is achieved by utilizing a specialized dataset\nthat encompasses both unblemished images and their corresponding blemished\ncounterparts during fine-tuning. ArtiFade also ensures the preservation of the\noriginal generative capabilities inherent within the diffusion model, thereby\nenhancing the overall performance of subject-driven methods in generating\nhigh-quality and artifact-free images. We further devise evaluation benchmarks\ntailored for this task. Through extensive qualitative and quantitative\nexperiments, we demonstrate the generalizability of ArtiFade in effective\nartifact removal under both in-distribution and out-of-distribution scenarios.\n","authors":["Shuya Yang","Shaozhe Hao","Yukang Cao","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2409.03745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03718v1","updated":"2024-09-05T17:21:54Z","published":"2024-09-05T17:21:54Z","title":"Geometry Image Diffusion: Fast and Data-Efficient Text-to-3D with\n Image-Based Surface Representation","summary":" Generating high-quality 3D objects from textual descriptions remains a\nchallenging problem due to computational cost, the scarcity of 3D data, and\ncomplex 3D representations. We introduce Geometry Image Diffusion\n(GIMDiffusion), a novel Text-to-3D model that utilizes geometry images to\nefficiently represent 3D shapes using 2D images, thereby avoiding the need for\ncomplex 3D-aware architectures. By integrating a Collaborative Control\nmechanism, we exploit the rich 2D priors of existing Text-to-Image models such\nas Stable Diffusion. This enables strong generalization even with limited 3D\ntraining data (allowing us to use only high-quality training data) as well as\nretaining compatibility with guidance techniques such as IPAdapter. In short,\nGIMDiffusion enables the generation of 3D assets at speeds comparable to\ncurrent Text-to-Image models. 
The generated objects consist of semantically\nmeaningful, separate parts and include internal structures, enhancing both\nusability and versatility.\n","authors":["Slava Elizarov","Ciara Rowles","Simon Donné"],"pdf_url":"https://arxiv.org/pdf/2409.03718v1.pdf","comment":"11 pages, 9 figures, Project page:\n https://unity-research.github.io/Geometry-Image-Diffusion.github.io/"},{"id":"http://arxiv.org/abs/2409.03685v1","updated":"2024-09-05T16:39:21Z","published":"2024-09-05T16:39:21Z","title":"View-Invariant Policy Learning via Zero-Shot Novel View Synthesis","summary":" Large-scale visuomotor policy learning is a promising approach toward\ndeveloping generalizable manipulation systems. Yet, policies that can be\ndeployed on diverse embodiments, environments, and observational modalities\nremain elusive. In this work, we investigate how knowledge from large-scale\nvisual data of the world may be used to address one axis of variation for\ngeneralizable manipulation: observational viewpoint. Specifically, we study\nsingle-image novel view synthesis models, which learn 3D-aware scene-level\npriors by rendering images of the same scene from alternate camera viewpoints\ngiven a single input image. For practical application to diverse robotic data,\nthese models must operate zero-shot, performing view synthesis on unseen tasks\nand environments. We empirically analyze view synthesis models within a simple\ndata-augmentation scheme that we call View Synthesis Augmentation (VISTA) to\nunderstand their capabilities for learning viewpoint-invariant policies from\nsingle-viewpoint demonstration data. Upon evaluating the robustness of policies\ntrained with our method to out-of-distribution camera viewpoints, we find that\nthey outperform baselines in both simulated and real-world manipulation tasks.\nVideos and additional visualizations are available at\nhttps://s-tian.github.io/projects/vista.\n","authors":["Stephen Tian","Blake Wulfe","Kyle Sargent","Katherine Liu","Sergey Zakharov","Vitor Guizilini","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2409.03685v1.pdf","comment":"Accepted to CoRL 2024"},{"id":"http://arxiv.org/abs/2312.08673v3","updated":"2024-09-05T16:05:12Z","published":"2023-12-14T06:17:15Z","title":"Segment Beyond View: Handling Partially Missing Modality for\n Audio-Visual Semantic Segmentation","summary":" Augmented Reality (AR) devices, emerging as prominent mobile interaction\nplatforms, face challenges in user safety, particularly concerning oncoming\nvehicles. While some solutions leverage onboard camera arrays, these cameras\noften have limited field-of-view (FoV) with front or downward perspectives.\nAddressing this, we propose a new out-of-view semantic segmentation task and\nSegment Beyond View (SBV), a novel audio-visual semantic segmentation method.\nSBV supplements the visual modality, which miss the information beyond FoV,\nwith the auditory information using a teacher-student distillation model\n(Omni2Ego). The model consists of a vision teacher utilising panoramic\ninformation, an auditory teacher with 8-channel audio, and an audio-visual\nstudent that takes views with limited FoV and binaural audio as input and\nproduce semantic segmentation for objects outside FoV. 
SBV outperforms existing\nmodels in comparative evaluations and shows a consistent performance across\nvarying FoV ranges and in monaural audio settings.\n","authors":["Renjie Wu","Hu Wang","Feras Dayoub","Hsiang-Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2312.08673v3.pdf","comment":"AAAI-24 (Fixed some erros)"},{"id":"http://arxiv.org/abs/2409.03644v1","updated":"2024-09-05T16:02:11Z","published":"2024-09-05T16:02:11Z","title":"RealisHuman: A Two-Stage Approach for Refining Malformed Human Parts in\n Generated Images","summary":" In recent years, diffusion models have revolutionized visual generation,\noutperforming traditional frameworks like Generative Adversarial Networks\n(GANs). However, generating images of humans with realistic semantic parts,\nsuch as hands and faces, remains a significant challenge due to their intricate\nstructural complexity. To address this issue, we propose a novel\npost-processing solution named RealisHuman. The RealisHuman framework operates\nin two stages. First, it generates realistic human parts, such as hands or\nfaces, using the original malformed parts as references, ensuring consistent\ndetails with the original image. Second, it seamlessly integrates the rectified\nhuman parts back into their corresponding positions by repainting the\nsurrounding areas to ensure smooth and realistic blending. The RealisHuman\nframework significantly enhances the realism of human generation, as\ndemonstrated by notable improvements in both qualitative and quantitative\nmetrics. Code is available at https://github.com/Wangbenzhi/RealisHuman.\n","authors":["Benzhi Wang","Jingkai Zhou","Jingqi Bai","Yang Yang","Weihua Chen","Fan Wang","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2409.03644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03643v1","updated":"2024-09-05T16:01:21Z","published":"2024-09-05T16:01:21Z","title":"CDM: A Reliable Metric for Fair and Accurate Formula Recognition\n Evaluation","summary":" Formula recognition presents significant challenges due to the complicated\nstructure and varied notation of mathematical expressions. Despite continuous\nadvancements in formula recognition models, the evaluation metrics employed by\nthese models, such as BLEU and Edit Distance, still exhibit notable\nlimitations. They overlook the fact that the same formula has diverse\nrepresentations and is highly sensitive to the distribution of training data,\nthereby causing the unfairness in formula recognition evaluation. To this end,\nwe propose a Character Detection Matching (CDM) metric, ensuring the evaluation\nobjectivity by designing a image-level rather than LaTex-level metric score.\nSpecifically, CDM renders both the model-predicted LaTeX and the ground-truth\nLaTeX formulas into image-formatted formulas, then employs visual feature\nextraction and localization techniques for precise character-level matching,\nincorporating spatial position information. Such a spatially-aware and\ncharacter-matching method offers a more accurate and equitable evaluation\ncompared with previous BLEU and Edit Distance metrics that rely solely on\ntext-based character matching. Experimentally, we evaluated various formula\nrecognition models using CDM, BLEU, and ExpRate metrics. 
Their results\ndemonstrate that the CDM aligns more closely with human evaluation standards\nand provides a fairer comparison across different models by eliminating\ndiscrepancies caused by diverse formula representations.\n","authors":["Bin Wang","Fan Wu","Linke Ouyang","Zhuangcheng Gu","Rui Zhang","Renqiu Xia","Bo Zhang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2409.03643v1.pdf","comment":"Project Website:\n https://github.com/opendatalab/UniMERNet/tree/main/cdm"},{"id":"http://arxiv.org/abs/2403.19319v2","updated":"2024-09-05T15:55:22Z","published":"2024-03-28T11:22:53Z","title":"Mesh2NeRF: Direct Mesh Supervision for Neural Radiance Field\n Representation and Generation","summary":" We present Mesh2NeRF, an approach to derive ground-truth radiance fields from\ntextured meshes for 3D generation tasks. Many 3D generative approaches\nrepresent 3D scenes as radiance fields for training. Their ground-truth\nradiance fields are usually fitted from multi-view renderings from a\nlarge-scale synthetic 3D dataset, which often results in artifacts due to\nocclusions or under-fitting issues. In Mesh2NeRF, we propose an analytic\nsolution to directly obtain ground-truth radiance fields from 3D meshes,\ncharacterizing the density field with an occupancy function featuring a defined\nsurface thickness, and determining view-dependent color through a reflection\nfunction considering both the mesh and environment lighting. Mesh2NeRF extracts\naccurate radiance fields which provides direct supervision for training\ngenerative NeRFs and single scene representation. We validate the effectiveness\nof Mesh2NeRF across various tasks, achieving a noteworthy 3.12dB improvement in\nPSNR for view synthesis in single scene representation on the ABO dataset, a\n0.69 PSNR enhancement in the single-view conditional generation of ShapeNet\nCars, and notably improved mesh extraction from NeRF in the unconditional\ngeneration of Objaverse Mugs.\n","authors":["Yujin Chen","Yinyu Nie","Benjamin Ummenhofer","Reiner Birkl","Michael Paulitsch","Matthias Müller","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2403.19319v2.pdf","comment":"Accepted to ECCV 2024, Project page:\n https://terencecyj.github.io/projects/Mesh2NeRF/ Video:\n https://youtu.be/SsFkhSuQYGM"},{"id":"http://arxiv.org/abs/2409.03634v1","updated":"2024-09-05T15:48:02Z","published":"2024-09-05T15:48:02Z","title":"Surface-Centric Modeling for High-Fidelity Generalizable Neural Surface\n Reconstruction","summary":" Reconstructing the high-fidelity surface from multi-view images, especially\nsparse images, is a critical and practical task that has attracted widespread\nattention in recent years. However, existing methods are impeded by the memory\nconstraint or the requirement of ground-truth depths and cannot recover\nsatisfactory geometric details. To this end, we propose SuRF, a new\nSurface-centric framework that incorporates a new Region sparsification based\non a matching Field, achieving good trade-offs between performance, efficiency\nand scalability. To our knowledge, this is the first unsupervised method\nachieving end-to-end sparsification powered by the introduced matching field,\nwhich leverages the weight distribution to efficiently locate the boundary\nregions containing surface. Instead of predicting an SDF value for each voxel,\nwe present a new region sparsification approach to sparse the volume by judging\nwhether the voxel is inside the surface region. 
In this way, our model can\nexploit higher frequency features around the surface with less memory and\ncomputational consumption. Extensive experiments on multiple benchmarks\ncontaining complex large-scale scenes show that our reconstructions exhibit\nhigh-quality details and achieve new state-of-the-art performance, i.e., 46%\nimprovements with 80% less memory consumption. Code is available at\nhttps://github.com/prstrive/SuRF.\n","authors":["Rui Peng","Shihe Shen","Kaiqiang Xiong","Huachen Gao","Jianbo Jiao","Xiaodong Gu","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.03634v1.pdf","comment":"ECCV 2024 Accepted"},{"id":"http://arxiv.org/abs/2404.15254v2","updated":"2024-09-05T15:42:25Z","published":"2024-04-23T17:39:27Z","title":"UniMERNet: A Universal Network for Real-World Mathematical Expression\n Recognition","summary":" The paper introduces the UniMER dataset, marking the first study on\nMathematical Expression Recognition (MER) targeting complex real-world\nscenarios. The UniMER dataset includes a large-scale training set, UniMER-1M,\nwhich offers unprecedented scale and diversity with one million training\ninstances to train high-quality, robust models. Additionally, UniMER features a\nmeticulously designed, diverse test set, UniMER-Test, which covers a variety of\nformula distributions found in real-world scenarios, providing a more\ncomprehensive and fair evaluation. To better utilize the UniMER dataset, the\npaper proposes a Universal Mathematical Expression Recognition Network\n(UniMERNet), tailored to the characteristics of formula recognition. UniMERNet\nconsists of a carefully designed encoder that incorporates detail-aware and\nlocal context features, and an optimized decoder for accelerated performance.\nExtensive experiments conducted using the UniMER-1M dataset and UniMERNet\ndemonstrate that training on the large-scale UniMER-1M dataset can produce a\nmore generalizable formula recognition model, significantly outperforming all\nprevious datasets. Furthermore, the introduction of UniMERNet enhances the\nmodel's performance in formula recognition, achieving higher accuracy and\nspeeds. All data, models, and code are available at\nhttps://github.com/opendatalab/UniMERNet.\n","authors":["Bin Wang","Zhuangcheng Gu","Guang Liang","Chao Xu","Bo Zhang","Botian Shi","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2404.15254v2.pdf","comment":"Project Website: https://github.com/opendatalab/UniMERNet"},{"id":"http://arxiv.org/abs/2409.03605v1","updated":"2024-09-05T15:11:40Z","published":"2024-09-05T15:11:40Z","title":"SegTalker: Segmentation-based Talking Face Generation with Mask-guided\n Local Editing","summary":" Audio-driven talking face generation aims to synthesize video with lip\nmovements synchronized to input audio. However, current generative techniques\nface challenges in preserving intricate regional textures (skin, teeth). To\naddress the aforementioned challenges, we propose a novel framework called\nSegTalker to decouple lip movements and image textures by introducing\nsegmentation as intermediate representation. Specifically, given the mask of\nimage employed by a parsing network, we first leverage the speech to drive the\nmask and generate talking segmentation. Then we disentangle semantic regions of\nimage into style codes using a mask-guided encoder. Ultimately, we inject the\npreviously generated talking segmentation and style codes into a mask-guided\nStyleGAN to synthesize video frame. In this way, most of textures are fully\npreserved. 
Moreover, our approach can inherently achieve background separation\nand facilitate mask-guided facial local editing. In particular, by editing the\nmask and swapping the region textures from a given reference image (e.g. hair,\nlip, eyebrows), our approach enables facial editing seamlessly when generating\ntalking face video. Experiments demonstrate that our proposed approach can\neffectively preserve texture details and generate temporally consistent video\nwhile remaining competitive in lip synchronization. Quantitative and\nqualitative results on the HDTF and MEAD datasets illustrate the superior\nperformance of our method over existing methods.\n","authors":["Lingyu Xiong","Xize Cheng","Jintao Tan","Xianjia Wu","Xiandong Li","Lei Zhu","Fei Ma","Minglei Li","Huang Xu","Zhihu Hu"],"pdf_url":"https://arxiv.org/pdf/2409.03605v1.pdf","comment":"10 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2409.03600v1","updated":"2024-09-05T14:59:41Z","published":"2024-09-05T14:59:41Z","title":"TCDiff: Triple Condition Diffusion Model with 3D Constraints for\n Stylizing Synthetic Faces","summary":" A robust face recognition model must be trained using datasets that include a\nlarge number of subjects and numerous samples per subject under varying\nconditions (such as pose, expression, age, noise, and occlusion). Due to\nethical and privacy concerns, large-scale real face datasets have been\ndiscontinued, such as MS1MV3, and synthetic face generators have been proposed,\nutilizing GANs and Diffusion Models, such as SYNFace, SFace, DigiFace-1M,\nIDiff-Face, DCFace, and GANDiffFace, aiming to supply this demand. Some of\nthese methods can produce high-fidelity realistic faces, but with low\nintra-class variance, while others generate high-variance faces with low\nidentity consistency. In this paper, we propose a Triple Condition Diffusion\nModel (TCDiff) to improve face style transfer from real to synthetic faces\nthrough 2D and 3D facial constraints, enhancing face identity consistency while\nkeeping the necessary high intra-class variance. Face recognition experiments\nusing 1k, 2k, and 5k classes of our new dataset for training outperform\nstate-of-the-art synthetic datasets in real face benchmarks such as LFW,\nCFP-FP, AgeDB, and BUPT. Our source code is available at:\nhttps://github.com/BOVIFOCR/tcdiff.\n","authors":["Bernardo Biesseck","Pedro Vidal","Luiz Coelho","Roger Granada","David Menotti"],"pdf_url":"https://arxiv.org/pdf/2409.03600v1.pdf","comment":"SIBGRAPI 2024"},{"id":"http://arxiv.org/abs/2409.03598v1","updated":"2024-09-05T14:57:01Z","published":"2024-09-05T14:57:01Z","title":"A practical approach to evaluating the adversarial distance for machine\n learning classifiers","summary":" Robustness is critical for machine learning (ML) classifiers to ensure\nconsistent performance in real-world applications where models may encounter\ncorrupted or adversarial inputs. In particular, assessing the robustness of\nclassifiers to adversarial inputs is essential to protect systems from\nvulnerabilities and thus ensure safety in use. However, methods to accurately\ncompute adversarial robustness have been challenging for complex ML models and\nhigh-dimensional data. Furthermore, evaluations typically measure adversarial\naccuracy on specific attack budgets, limiting the informative value of the\nresulting metrics. This paper investigates the estimation of the more\ninformative adversarial distance using iterative adversarial attacks and a\ncertification approach. 
Combined, the methods provide a comprehensive\nevaluation of adversarial robustness by computing estimates for the upper and\nlower bounds of the adversarial distance. We present visualisations and\nablation studies that provide insights into how this evaluation method should\nbe applied and parameterised. We find that our adversarial attack approach is\neffective compared to related implementations, while the certification method\nfalls short of expectations. The approach in this paper should encourage a more\ninformative way of evaluating the adversarial robustness of ML classifiers.\n","authors":["Georg Siedel","Ekagra Gupta","Andrey Morozov"],"pdf_url":"https://arxiv.org/pdf/2409.03598v1.pdf","comment":"Accepted manuscript at International Mechanical Engineering Congress\n and Exposition IMECE2024"},{"id":"http://arxiv.org/abs/2311.06551v2","updated":"2024-09-05T14:47:24Z","published":"2023-11-11T12:00:24Z","title":"FDNet: Feature Decoupled Segmentation Network for Tooth CBCT Image","summary":" Precise Tooth Cone Beam Computed Tomography (CBCT) image segmentation is\ncrucial for orthodontic treatment planning. In this paper, we propose FDNet, a\nFeature Decoupled Segmentation Network, to excel in the face of the variable\ndental conditions encountered in CBCT scans, such as complex artifacts and\nindistinct tooth boundaries. The Low-Frequency Wavelet Transform (LF-Wavelet)\nis employed to enrich the semantic content by emphasizing the global structural\nintegrity of the teeth, while the SAM encoder is leveraged to refine the\nboundary delineation, thus improving the contrast between adjacent dental\nstructures. By integrating these dual aspects, FDNet adeptly addresses the\nsemantic gap, providing a detailed and accurate segmentation. The framework's\neffectiveness is validated through rigorous benchmarks, achieving the top Dice\nand IoU scores of 85.28% and 75.23%, respectively. This innovative decoupling\nof semantic and boundary features capitalizes on the unique strengths of each\nelement to elevate the quality of segmentation performance.\n","authors":["Xiang Feng","Chengkai Wang","Chengyu Wu","Yunxiang Li","Yongbo He","Shuai Wang","Yaiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2311.06551v2.pdf","comment":"IEEE ISBI 2024, Oral"},{"id":"http://arxiv.org/abs/2408.07666v4","updated":"2024-09-05T14:37:59Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. 
Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03583v1","updated":"2024-09-05T14:37:43Z","published":"2024-09-05T14:37:43Z","title":"Text-Guided Mixup Towards Long-Tailed Image Categorization","summary":" In many real-world applications, the frequency distribution of class labels\nfor training data can exhibit a long-tailed distribution, which challenges\ntraditional approaches of training deep neural networks that require heavy\namounts of balanced data. Gathering and labeling data to balance out the class\nlabel distribution can be both costly and time-consuming. Many existing\nsolutions that enable ensemble learning, re-balancing strategies, or\nfine-tuning applied to deep neural networks are limited by the inert problem of\nfew class samples across a subset of classes. Recently, vision-language models\nlike CLIP have been observed as effective solutions to zero-shot or few-shot\nlearning by grasping a similarity between vision and language features for\nimage and text pairs. Considering that large pre-trained vision-language models\nmay contain valuable side textual information for minor classes, we propose to\nleverage text supervision to tackle the challenge of long-tailed learning.\nConcretely, we propose a novel text-guided mixup technique that takes advantage\nof the semantic relations between classes recognized by the pre-trained text\nencoder to help alleviate the long-tailed problem. Our empirical study on\nbenchmark long-tailed tasks demonstrates the effectiveness of our proposal with\na theoretical guarantee. Our code is available at\nhttps://github.com/rsamf/text-guided-mixup.\n","authors":["Richard Franklin","Jiawei Yao","Deyang Zhong","Qi Qian","Juhua Hu"],"pdf_url":"https://arxiv.org/pdf/2409.03583v1.pdf","comment":"Accepted by BMVC'24, code is available at\n https://github.com/rsamf/text-guided-mixup"},{"id":"http://arxiv.org/abs/2404.13400v2","updated":"2024-09-05T14:33:04Z","published":"2024-04-20T14:57:31Z","title":"HiVG: Hierarchical Multimodal Fine-grained Modulation for Visual\n Grounding","summary":" Visual grounding, which aims to ground a visual region via natural language,\nis a task that heavily relies on cross-modal alignment. Existing works utilized\nuni-modal pre-trained models to transfer visual or linguistic knowledge\nseparately while ignoring the multimodal corresponding information. Motivated\nby recent advancements in contrastive language-image pre-training and low-rank\nadaptation (LoRA) methods, we aim to solve the grounding task based on\nmultimodal pre-training. However, there exists significant task gaps between\npre-training and grounding. Therefore, to address these gaps, we propose a\nconcise and efficient hierarchical multimodal fine-grained modulation\nframework, namely HiVG. 
Specifically, HiVG consists of a multi-layer adaptive\ncross-modal bridge and a hierarchical multimodal low-rank adaptation (HiLoRA)\nparadigm. The cross-modal bridge can address the inconsistency between visual\nfeatures and those required for grounding, and establish a connection between\nmulti-level visual and text features. HiLoRA prevents the accumulation of\nperceptual errors by adapting the cross-modal features from shallow to deep\nlayers in a hierarchical manner. Experimental results on five datasets\ndemonstrate the effectiveness of our approach and showcase the significant\ngrounding capabilities as well as promising energy efficiency advantages. The\nproject page: https://github.com/linhuixiao/HiVG.\n","authors":["Linhui Xiao","Xiaoshan Yang","Fang Peng","Yaowei Wang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2404.13400v2.pdf","comment":"Accepted by ACM MM 2024. The project page:\n https://github.com/linhuixiao/HiVG"},{"id":"http://arxiv.org/abs/2409.03556v1","updated":"2024-09-05T14:17:01Z","published":"2024-09-05T14:17:01Z","title":"MaskVal: Simple but Effective Uncertainty Quantification for 6D Pose\n Estimation","summary":" For the use of 6D pose estimation in robotic applications, reliable poses are\nof utmost importance to ensure a safe, reliable and predictable operational\nperformance. Despite these requirements, state-of-the-art 6D pose estimators\noften do not provide any uncertainty quantification for their pose estimates at\nall, or if they do, it has been shown that the uncertainty provided is only\nweakly correlated with the actual true error. To address this issue, we\ninvestigate a simple but effective uncertainty quantification, that we call\nMaskVal, which compares the pose estimates with their corresponding instance\nsegmentations by rendering and does not require any modification of the pose\nestimator itself. Despite its simplicity, MaskVal significantly outperforms a\nstate-of-the-art ensemble method on both a dataset and a robotic setup. We show\nthat by using MaskVal, the performance of a state-of-the-art 6D pose estimator\nis significantly improved towards a safe and reliable operation. In addition,\nwe propose a new and specific approach to compare and evaluate uncertainty\nquantification methods for 6D pose estimation in the context of robotic\nmanipulation.\n","authors":["Philipp Quentin","Daniel Goehring"],"pdf_url":"https://arxiv.org/pdf/2409.03556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06949v2","updated":"2024-09-05T14:16:31Z","published":"2024-06-11T05:21:30Z","title":"Triple-domain Feature Learning with Frequency-aware Memory Enhancement\n for Moving Infrared Small Target Detection","summary":" As a sub-field of object detection, moving infrared small target detection\npresents significant challenges due to tiny target sizes and low contrast\nagainst backgrounds. Currently-existing methods primarily rely on the features\nextracted only from spatio-temporal domain. Frequency domain has hardly been\nconcerned yet, although it has been widely applied in image processing. To\nextend feature source domains and enhance feature representation, we propose a\nnew Triple-domain Strategy (Tridos) with the frequency-aware memory enhancement\non spatio-temporal domain for infrared small target detection. In this scheme,\nit effectively detaches and enhances frequency features by a local-global\nfrequency-aware module with Fourier transform. 
Inspired by the human visual system,\nour memory enhancement is designed to capture the spatial relations of infrared\ntargets among video frames. Furthermore, it encodes temporal dynamics motion\nfeatures via differential learning and residual enhancing. Additionally, we\ndesign a residual compensation to reconcile possible cross-domain\nfeature mismatches. To the best of our knowledge, the proposed Tridos is the first work to\nexplore infrared target feature learning comprehensively in\nspatio-temporal-frequency domains. The extensive experiments on three datasets\n(i.e., DAUB, ITSDT-15K and IRDST) validate that our triple-domain infrared\nfeature learning scheme can often be clearly superior to state-of-the-art\nones. Source codes are available at https://github.com/UESTC-nnLab/Tridos.\n","authors":["Weiwei Duan","Luping Ji","Shengjia Chen","Sicheng Zhu","Mao Ye"],"pdf_url":"https://arxiv.org/pdf/2406.06949v2.pdf","comment":"This paper has been accepted by IEEE TGRS"},{"id":"http://arxiv.org/abs/2409.03555v1","updated":"2024-09-05T14:15:54Z","published":"2024-09-05T14:15:54Z","title":"Unified Framework for Neural Network Compression via Decomposition and\n Optimal Rank Selection","summary":" Despite their high accuracy, complex neural networks demand significant\ncomputational resources, posing challenges for deployment on\nresource-constrained devices such as mobile phones and embedded systems.\nCompression algorithms have been developed to address these challenges by\nreducing model size and computational demands while maintaining accuracy. Among\nthese approaches, factorization methods based on tensor decomposition are\ntheoretically sound and effective. However, they face difficulties in selecting\nthe appropriate rank for decomposition. This paper tackles this issue by\npresenting a unified framework that simultaneously applies decomposition and\noptimal rank selection, employing a composite compression loss within defined\nrank constraints. Our approach includes an automatic rank search in a\ncontinuous space, efficiently identifying optimal rank configurations without\nthe use of training data, making it computationally efficient. Combined with a\nsubsequent fine-tuning step, our approach maintains the performance of highly\ncompressed models on par with their original counterparts. Using various\nbenchmark datasets, we demonstrate the efficacy of our method through a\ncomprehensive analysis.\n","authors":["Ali Aghababaei-Harandi","Massih-Reza Amini"],"pdf_url":"https://arxiv.org/pdf/2409.03555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03553v1","updated":"2024-09-05T14:13:05Z","published":"2024-09-05T14:13:05Z","title":"Organized Grouped Discrete Representation for Object-Centric Learning","summary":" Object-Centric Learning (OCL) represents dense image or video pixels as\nsparse object features. Representative methods utilize discrete representation\ncomposed of Variational Autoencoder (VAE) template features to suppress\npixel-level information redundancy and guide object-level feature aggregation.\nThe most recent advancement, Grouped Discrete Representation (GDR), further\ndecomposes these template features into attributes. However, its naive channel\ngrouping as decomposition may erroneously group channels belonging to different\nattributes together and discretize them as sub-optimal template attributes,\nwhich loses information and harms expressivity. 
We propose Organized GDR\n(OGDR) to organize channels belonging to the same attributes together for\ncorrect decomposition from features into attributes. In unsupervised\nsegmentation experiments, OGDR is fully superior to GDR in augmenting\nclassical transformer-based OCL methods; it even improves state-of-the-art\ndiffusion-based ones. Codebook PCA and representation similarity analyses show\nthat compared with GDR, our OGDR eliminates redundancy and preserves\ninformation better for guiding object representation learning. The source code\nis available in the supplementary material.\n","authors":["Rongzhen Zhao","Vivienne Wang","Juho Kannala","Joni Pajarinen"],"pdf_url":"https://arxiv.org/pdf/2409.03553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03550v1","updated":"2024-09-05T14:12:22Z","published":"2024-09-05T14:12:22Z","title":"DKDM: Data-Free Knowledge Distillation for Diffusion Models with Any\n Architecture","summary":" Diffusion models (DMs) have demonstrated exceptional generative capabilities\nacross various areas, while they are hindered by slow inference speeds and high\ncomputational demands during deployment. The most common way to accelerate DMs\ninvolves reducing the number of denoising steps during generation, achieved\nthrough faster sampling solvers or knowledge distillation (KD). In contrast to\nprior approaches, we propose a novel method that transfers the capability of\nlarge pretrained DMs to faster architectures. Specifically, we employ KD in a\ndistinct manner to compress DMs by distilling their generative ability into\nmore rapid variants. Furthermore, considering that the source data is either\ninaccessible or too enormous to store for current generative models, we\nintroduce a new paradigm for their distillation without source data, termed\nData-Free Knowledge Distillation for Diffusion Models (DKDM). Generally, our\nestablished DKDM framework comprises two main components: 1) a DKDM objective\nthat uses synthetic denoising data produced by pretrained DMs to optimize\nfaster DMs without source data, and 2) a dynamic iterative distillation method\nthat flexibly organizes the synthesis of denoising data, preventing it from\nslowing down the optimization process as the generation is slow. To our\nknowledge, this is the first attempt at using KD to distill DMs into any\narchitecture in a data-free manner. Importantly, our DKDM is orthogonal to most\nexisting acceleration methods, such as denoising step reduction, quantization\nand pruning. Experiments show that our DKDM is capable of deriving 2x faster\nDMs with performance remaining on par with the baseline. Notably, our DKDM\nenables pretrained DMs to function as \"datasets\" for training new DMs.\n","authors":["Qianlong Xiang","Miao Zhang","Yuzhang Shang","Jianlong Wu","Yan Yan","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2409.03550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03543v1","updated":"2024-09-05T14:06:56Z","published":"2024-09-05T14:06:56Z","title":"Prediction Accuracy & Reliability: Classification and Object\n Localization under Distribution Shift","summary":" Natural distribution shift causes a deterioration in the perception\nperformance of convolutional neural networks (CNNs). 
This comprehensive\nanalysis for real-world traffic data addresses: 1) investigating the effect of\nnatural distribution shift and weather augmentations on both detection quality\nand confidence estimation, 2) evaluating model performance for both\nclassification and object localization, and 3) benchmarking two common\nuncertainty quantification methods - Ensembles and different variants of\nMonte-Carlo (MC) Dropout - under natural and close-to-natural distribution\nshift. For this purpose, a novel dataset has been curated from publicly\navailable autonomous driving datasets. The in-distribution (ID) data is based\non cutouts of a single object, for which both class and bounding box\nannotations are available. The six distribution-shift datasets cover adverse\nweather scenarios, simulated rain and fog, corner cases, and\nout-of-distribution data. A granular analysis of CNNs under distribution shift\nallows to quantize the impact of different types of shifts on both, task\nperformance and confidence estimation: ConvNeXt-Tiny is more robust than\nEfficientNet-B0; heavy rain degrades classification stronger than localization,\ncontrary to heavy fog; integrating MC-Dropout into selected layers only has the\npotential to enhance task performance and confidence estimation, whereby the\nidentification of these layers depends on the type of distribution shift and\nthe considered task.\n","authors":["Fabian Diet","Moussa Kassem Sbeyti","Michelle Karg"],"pdf_url":"https://arxiv.org/pdf/2409.03543v1.pdf","comment":"This preprint has not undergone any post-submission improvements or\n corrections"},{"id":"http://arxiv.org/abs/2405.17609v3","updated":"2024-09-05T14:00:27Z","published":"2024-05-27T19:14:46Z","title":"GarmentCodeData: A Dataset of 3D Made-to-Measure Garments With Sewing\n Patterns","summary":" Recent research interest in the learning-based processing of garments, from\nvirtual fitting to generation and reconstruction, stumbles on a scarcity of\nhigh-quality public data in the domain. We contribute to resolving this need by\npresenting the first large-scale synthetic dataset of 3D made-to-measure\ngarments with sewing patterns, as well as its generation pipeline.\nGarmentCodeData contains 115,000 data points that cover a variety of designs in\nmany common garment categories: tops, shirts, dresses, jumpsuits, skirts,\npants, etc., fitted to a variety of body shapes sampled from a custom\nstatistical body model based on CAESAR, as well as a standard reference body\nshape, applying three different textile materials. To enable the creation of\ndatasets of such complexity, we introduce a set of algorithms for automatically\ntaking tailor's measures on sampled body shapes, sampling strategies for sewing\npattern design, and propose an automatic, open-source 3D garment draping\npipeline based on a fast XPBD simulator, while contributing several solutions\nfor collision resolution and drape correctness to enable scalability.\n Project Page: https://igl.ethz.ch/projects/GarmentCodeData/\n","authors":["Maria Korosteleva","Timur Levent Kesdogan","Fabian Kemper","Stephan Wenninger","Jasmin Koller","Yuhan Zhang","Mario Botsch","Olga Sorkine-Hornung"],"pdf_url":"https://arxiv.org/pdf/2405.17609v3.pdf","comment":"Accepted to ECCV 2024. 
Sept 4th, 2024: release of GarmentCodeData(v2)"},{"id":"http://arxiv.org/abs/2409.03530v1","updated":"2024-09-05T13:42:20Z","published":"2024-09-05T13:42:20Z","title":"Use of triplet loss for facial restoration in low-resolution images","summary":" In recent years, facial recognition (FR) models have become the most widely\nused biometric tool, achieving impressive results on numerous datasets.\nHowever, inherent hardware challenges or shooting distances often result in\nlow-resolution images, which significantly impact the performance of FR models.\nTo address this issue, several solutions have been proposed, including\nsuper-resolution (SR) models that generate highly realistic faces. Despite\nthese efforts, significant improvements in FR algorithms have not been\nachieved. We propose a novel SR model FTLGAN, which focuses on generating\nhigh-resolution images that preserve individual identities rather than merely\nimproving image quality, thereby maximizing the performance of FR models. The\nresults are compelling, demonstrating a mean value of d' 21% above the best\ncurrent state-of-the-art models, specifically having a value of d' = 1.099 and\nAUC = 0.78 for 14x14 pixels, d' = 2.112 and AUC = 0.92 for 28x28 pixels, and d'\n= 3.049 and AUC = 0.98 for 56x56 pixels. The contributions of this study are\nsignificant in several key areas. Firstly, a notable improvement in facial\nrecognition performance has been achieved in low-resolution images,\nspecifically at resolutions of 14x14, 28x28, and 56x56 pixels. Secondly, the\nenhancements demonstrated by FTLGAN show a consistent response across all\nresolutions, delivering outstanding performance uniformly, unlike other\ncomparative models. Thirdly, an innovative approach has been implemented using\ntriplet loss logic, enabling the training of the super-resolution model solely\nwith real images, contrasting with current models, and expanding potential\nreal-world applications. Lastly, this study introduces a novel model that\nspecifically addresses the challenge of improving classification performance in\nfacial recognition systems by integrating facial recognition quality as a loss\nduring model training.\n","authors":["Sebastian Pulgar","Domingo Mery"],"pdf_url":"https://arxiv.org/pdf/2409.03530v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.03525v1","updated":"2024-09-05T13:36:50Z","published":"2024-09-05T13:36:50Z","title":"FrozenSeg: Harmonizing Frozen Foundation Models for Open-Vocabulary\n Segmentation","summary":" Open-vocabulary segmentation poses significant challenges, as it requires\nsegmenting and recognizing objects across an open set of categories in\nunconstrained environments. Building on the success of powerful vision-language\n(ViL) foundation models, such as CLIP, recent efforts sought to harness their\nzero-shot capabilities to recognize unseen categories. Despite notable\nperformance improvements, these models still encounter the critical issue of\ngenerating precise mask proposals for unseen categories and scenarios,\nresulting in inferior segmentation performance eventually. 
To address this\nchallenge, we introduce a novel approach, FrozenSeg, designed to integrate\nspatial knowledge from a localization foundation model (e.g., SAM) and semantic\nknowledge extracted from a ViL model (e.g., CLIP), in a synergistic framework.\nTaking the ViL model's visual encoder as the feature backbone, we inject the\nspace-aware feature into the learnable queries and CLIP features within the\ntransformer decoder. In addition, we devise a mask proposal ensemble strategy\nfor further improving the recall rate and mask quality. To fully exploit\npre-trained knowledge while minimizing training overhead, we freeze both\nfoundation models, focusing optimization efforts solely on a lightweight\ntransformer decoder for mask proposal generation-the performance bottleneck.\nExtensive experiments demonstrate that FrozenSeg advances state-of-the-art\nresults across various segmentation benchmarks, trained exclusively on COCO\npanoptic data, and tested in a zero-shot manner. Code is available at\nhttps://github.com/chenxi52/FrozenSeg.\n","authors":["Xi Chen","Haosen Yang","Sheng Jin","Xiatian Zhu","Hongxun Yao"],"pdf_url":"https://arxiv.org/pdf/2409.03525v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.03521v1","updated":"2024-09-05T13:33:57Z","published":"2024-09-05T13:33:57Z","title":"Have Large Vision-Language Models Mastered Art History?","summary":" The emergence of large Vision-Language Models (VLMs) has recently established\nnew baselines in image classification across multiple domains. However, the\nperformance of VLMs in the specific task of artwork classification,\nparticularly art style classification of paintings - a domain traditionally\nmastered by art historians - has not been explored yet. Artworks pose a unique\nchallenge compared to natural images due to their inherently complex and\ndiverse structures, characterized by variable compositions and styles. Art\nhistorians have long studied the unique aspects of artworks, with style\nprediction being a crucial component of their discipline. This paper\ninvestigates whether large VLMs, which integrate visual and textual data, can\neffectively predict the art historical attributes of paintings. We conduct an\nin-depth analysis of four VLMs, namely CLIP, LLaVA, OpenFlamingo, and GPT-4o,\nfocusing on zero-shot classification of art style, author and time period using\ntwo public benchmarks of artworks. Additionally, we present ArTest, a\nwell-curated test set of artworks, including pivotal paintings studied by art\nhistorians.\n","authors":["Ombretta Strafforello","Derya Soydaner","Michiel Willems","Anne-Sofie Maerten","Stefanie De Winter"],"pdf_url":"https://arxiv.org/pdf/2409.03521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03519v1","updated":"2024-09-05T13:32:40Z","published":"2024-09-05T13:32:40Z","title":"Tissue Concepts: supervised foundation models in computational pathology","summary":" Due to the increasing workload of pathologists, the need for automation to\nsupport diagnostic tasks and quantitative biomarker evaluation is becoming more\nand more apparent. Foundation models have the potential to improve\ngeneralizability within and across centers and serve as starting points for\ndata efficient development of specialized yet robust AI models. However, the\ntraining foundation models themselves is usually very expensive in terms of\ndata, computation, and time. This paper proposes a supervised training method\nthat drastically reduces these expenses. 
The proposed method is based on\nmulti-task learning to train a joint encoder, by combining 16 different\nclassification, segmentation, and detection tasks on a total of 912,000\npatches. Since the encoder is capable of capturing the properties of the\nsamples, we term it the Tissue Concepts encoder. To evaluate the performance\nand generalizability of the Tissue Concepts encoder across centers,\nclassification of whole slide images from four of the most prevalent solid\ncancers - breast, colon, lung, and prostate - was used. The experiments show\nthat the Tissue Concepts model achieves comparable performance to models trained\nwith self-supervision, while requiring only 6% of the training\npatches. Furthermore, the Tissue Concepts encoder outperforms an ImageNet\npre-trained encoder on both in-domain and out-of-domain data.\n","authors":["Till Nicke","Jan Raphael Schaefer","Henning Hoefener","Friedrich Feuerhake","Dorit Merhof","Fabian Kiessling","Johannes Lotz"],"pdf_url":"https://arxiv.org/pdf/2409.03519v1.pdf","comment":"22 Pages, 3 Figures, submitted to and under revision at Computers in\n Biology and Medicine"},{"id":"http://arxiv.org/abs/2409.03516v1","updated":"2024-09-05T13:29:50Z","published":"2024-09-05T13:29:50Z","title":"LMLT: Low-to-high Multi-Level Vision Transformer for Image\n Super-Resolution","summary":" Recent Vision Transformer (ViT)-based methods for Image Super-Resolution have\ndemonstrated impressive performance. However, they suffer from significant\ncomplexity, resulting in high inference times and memory usage. Additionally,\nViT models using Window Self-Attention (WSA) face challenges in processing\nregions outside their windows. To address these issues, we propose the\nLow-to-high Multi-Level Transformer (LMLT), which employs attention with\nvarying feature sizes for each head. LMLT divides image features along the\nchannel dimension, gradually reduces spatial size for lower heads, and applies\nself-attention to each head. This approach effectively captures both local and\nglobal information. By integrating the results from lower heads into higher\nheads, LMLT overcomes the window boundary issues in self-attention. Extensive\nexperiments show that our model significantly reduces inference time and GPU\nmemory usage while maintaining or even surpassing the performance of\nstate-of-the-art ViT-based Image Super-Resolution methods. Our codes are\navailable at https://github.com/jwgdmkj/LMLT.\n","authors":["Jeongsoo Kim","Jongho Nang","Junsuk Choe"],"pdf_url":"https://arxiv.org/pdf/2409.03516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03514v1","updated":"2024-09-05T13:23:52Z","published":"2024-09-05T13:23:52Z","title":"Blended Latent Diffusion under Attention Control for Real-World Video\n Editing","summary":" Due to the lack of fully publicly available text-to-video models, current video\nediting methods tend to build on pre-trained text-to-image generation models,\nhowever, they still face grand challenges in dealing with the local editing of\nvideo with temporal information. First, although existing methods attempt to\nfocus on local area editing by a pre-defined mask, the preservation of the\noutside-area background is non-ideal due to the spatially entire generation of\neach frame. In addition, requiring the user to provide a mask is an additional\ncostly undertaking, so an autonomous masking strategy integrated into the\nediting process is desirable. 
Last but not least, an image-level pretrained model\nhas not learned temporal information across the frames of a video, which is vital for\nexpressing motion and dynamics. In this paper, we propose to adapt an\nimage-level blended latent diffusion model to perform local video editing\ntasks. Specifically, we leverage DDIM inversion to acquire the latents as\nbackground latents instead of the randomly noised ones to better preserve the\nbackground information of the input video. We further introduce an autonomous\nmask manufacture mechanism derived from cross-attention maps in diffusion\nsteps. Finally, we enhance the temporal consistency across video frames by\ntransforming the self-attention blocks of U-Net into temporal-spatial blocks.\nThrough extensive experiments, our proposed approach demonstrates effectiveness\nin different real-world video editing tasks.\n","authors":["Deyin Liu","Lin Yuanbo Wu","Xianghua Xie"],"pdf_url":"https://arxiv.org/pdf/2409.03514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13520v3","updated":"2024-09-05T13:16:23Z","published":"2024-07-18T13:55:54Z","title":"EaDeblur-GS: Event assisted 3D Deblur Reconstruction with Gaussian\n Splatting","summary":" 3D deblurring reconstruction techniques have recently seen significant\nadvancements with the development of Neural Radiance Fields (NeRF) and 3D\nGaussian Splatting (3DGS). Although these techniques can recover relatively\nclear 3D reconstructions from blurry image inputs, they still face limitations\nin handling severe blurring and complex camera motion. To address these issues,\nwe propose Event-assisted 3D Deblur Reconstruction with Gaussian Splatting\n(EaDeblur-GS), which integrates event camera data to enhance the robustness of\n3DGS against motion blur. By employing an Adaptive Deviation Estimator (ADE)\nnetwork to estimate Gaussian center deviations and using novel loss functions,\nEaDeblur-GS achieves sharp 3D reconstructions in real-time, demonstrating\nperformance comparable to state-of-the-art methods.\n","authors":["Yuchen Weng","Zhengwen Shen","Ruofan Chen","Qi Wang","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2407.13520v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05490v4","updated":"2024-09-05T13:05:31Z","published":"2023-12-09T07:35:09Z","title":"Shapley Values-enabled Progressive Pseudo Bag Augmentation for Whole\n Slide Image Classification","summary":" In computational pathology, whole-slide image (WSI) classification presents a\nformidable challenge due to its gigapixel resolution and limited fine-grained\nannotations. Multiple-instance learning (MIL) offers a weakly supervised\nsolution, yet refining instance-level information from bag-level labels remains\nchallenging. While most of the conventional MIL methods use attention scores to\nestimate instance importance scores (IIS) which contribute to the prediction of\nthe slide labels, these often lead to skewed attention distributions and\ninaccuracies in identifying crucial instances. To address these issues, we\npropose a new approach inspired by cooperative game theory: employing Shapley\nvalues to assess each instance's contribution, thereby improving IIS\nestimation. The computation of the Shapley value is then accelerated using\nattention, while retaining the enhanced instance identification and\nprioritization. We further introduce a framework for the progressive assignment\nof pseudo bags based on estimated IIS, encouraging more balanced attention\ndistributions in MIL models. 
Our extensive experiments on CAMELYON-16, BRACS,\nTCGA-LUNG, and TCGA-BRCA datasets show our method's superiority over existing\nstate-of-the-art approaches, offering enhanced interpretability and class-wise\ninsights. Our source code is available at https://github.com/RenaoYan/PMIL.\n","authors":["Renao Yan","Qiehe Sun","Cheng Jin","Yiqing Liu","Yonghong He","Tian Guan","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2312.05490v4.pdf","comment":"IEEE TRANSACTIONS ON MEDICAL IMAGING 2024"},{"id":"http://arxiv.org/abs/2409.03487v1","updated":"2024-09-05T12:52:24Z","published":"2024-09-05T12:52:24Z","title":"ScreenMark: Watermarking Arbitrary Visual Content on Screen","summary":" Digital watermarking has demonstrated its effectiveness in protecting\nmultimedia content. However, existing watermarking are predominantly tailored\nfor specific media types, rendering them less effective for the protection of\ncontent displayed on computer screens, which is often multimodal and dynamic.\nVisual Screen Content (VSC), is particularly susceptible to theft and leakage\nvia screenshots, a vulnerability that current watermarking methods fail to\nadequately address.To tackle these challenges, we propose ScreenMark, a robust\nand practical watermarking method designed specifically for arbitrary VSC\nprotection. ScreenMark utilizes a three-stage progressive watermarking\nframework. Initially, inspired by diffusion principles, we initialize the\nmutual transformation between regular watermark information and irregular\nwatermark patterns. Subsequently, these patterns are integrated with screen\ncontent using a pre-multiplication alpha blending technique, supported by a\npre-trained screen decoder for accurate watermark retrieval. The progressively\ncomplex distorter enhances the robustness of the watermark in real-world\nscreenshot scenarios. Finally, the model undergoes fine-tuning guided by a\njoint-level distorter to ensure optimal performance.To validate the\neffectiveness of ScreenMark, we compiled a dataset comprising 100,000\nscreenshots from various devices and resolutions. Extensive experiments across\ndifferent datasets confirm the method's superior robustness, imperceptibility,\nand practical applicability.\n","authors":["Xiujian Liang","Gaozhi Liu","Yichao Si","Xiaoxiao Hu","Zhenxing Qian","Xinpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03470v1","updated":"2024-09-05T12:31:51Z","published":"2024-09-05T12:31:51Z","title":"Improving Uncertainty-Error Correspondence in Deep Bayesian Medical\n Image Segmentation","summary":" Increased usage of automated tools like deep learning in medical image\nsegmentation has alleviated the bottleneck of manual contouring. This has\nshifted manual labour to quality assessment (QA) of automated contours which\ninvolves detecting errors and correcting them. A potential solution to\nsemi-automated QA is to use deep Bayesian uncertainty to recommend potentially\nerroneous regions, thus reducing time spent on error detection. Previous work\nhas investigated the correspondence between uncertainty and error, however, no\nwork has been done on improving the \"utility\" of Bayesian uncertainty maps such\nthat it is only present in inaccurate regions and not in the accurate ones. Our\nwork trains the FlipOut model with the Accuracy-vs-Uncertainty (AvU) loss which\npromotes uncertainty to be present only in inaccurate regions. We apply this\nmethod on datasets of two radiotherapy body sites, c.f. 
head-and-neck CT and\nprostate MR scans. Uncertainty heatmaps (i.e. predictive entropy) are evaluated\nagainst voxel inaccuracies using Receiver Operating Characteristic (ROC) and\nPrecision-Recall (PR) curves. Numerical results show that when compared to the\nBayesian baseline the proposed method successfully suppresses uncertainty for\naccurate voxels, with similar presence of uncertainty for inaccurate voxels.\nCode to reproduce experiments is available at\nhttps://github.com/prerakmody/bayesuncertainty-error-correspondence\n","authors":["Prerak Mody","Nicolas F. Chaves-de-Plaza","Chinmay Rao","Eleftheria Astrenidou","Mischa de Ridder","Nienke Hoekstra","Klaus Hildebrandt","Marius Staring"],"pdf_url":"https://arxiv.org/pdf/2409.03470v1.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2024:018"},{"id":"http://arxiv.org/abs/2409.03460v1","updated":"2024-09-05T12:18:32Z","published":"2024-09-05T12:18:32Z","title":"LowFormer: Hardware Efficient Design for Convolutional Transformer\n Backbones","summary":" Research in efficient vision backbones is evolving into models that are a\nmixture of convolutions and transformer blocks. A smart combination of both,\narchitecture-wise and component-wise is mandatory to excel in the speed-accuracy\ntrade-off. Most publications focus on maximizing accuracy and utilize MACs\n(multiply accumulate operations) as an efficiency metric. The latter, however,\noften do not accurately measure how fast a model actually is due to factors\nlike memory access cost and degree of parallelism. We analyzed common modules\nand architectural design choices for backbones not in terms of MACs, but rather\nin actual throughput and latency, as the combination of the latter two is a\nbetter representation of the efficiency of models in real applications. We\napplied the conclusions taken from that analysis to create a recipe for\nincreasing hardware-efficiency in macro design. Additionally, we introduce a\nsimple slimmed-down version of MultiHead Self-Attention that aligns with our\nanalysis. We combine both macro and micro design to create a new family of\nhardware-efficient backbone networks called LowFormer. LowFormer achieves a\nremarkable speedup in terms of throughput and latency, while achieving similar\nor better accuracy than current state-of-the-art efficient backbones. In order\nto prove the generalizability of our hardware-efficient design, we evaluate our\nmethod on GPU, mobile GPU and ARM CPU. We further show that the downstream\ntasks object detection and semantic segmentation profit from our\nhardware-efficient architecture. Code and models are available at\nhttps://github.com/altair199797/LowFormer.\n","authors":["Moritz Nottebaum","Matteo Dunnhofer","Christian Micheloni"],"pdf_url":"https://arxiv.org/pdf/2409.03460v1.pdf","comment":"Accepted at WACV 2025. Features 11 pages in total"},{"id":"http://arxiv.org/abs/2409.03458v1","updated":"2024-09-05T12:14:33Z","published":"2024-09-05T12:14:33Z","title":"Non-Uniform Illumination Attack for Fooling Convolutional Neural\n Networks","summary":" Convolutional Neural Networks (CNNs) have made remarkable strides; however,\nthey remain susceptible to vulnerabilities, particularly in the face of minor\nimage perturbations that humans can easily recognize. This weakness, often\ntermed as 'attacks', underscores the limited robustness of CNNs and the need\nfor research into fortifying their resistance against such manipulations. 
This\nstudy introduces a novel Non-Uniform Illumination (NUI) attack technique, where\nimages are subtly altered using varying NUI masks. Extensive experiments are\nconducted on widely-accepted datasets including CIFAR10, TinyImageNet, and\nCalTech256, focusing on image classification with 12 different NUI attack\nmodels. The resilience of VGG, ResNet, MobilenetV3-small and InceptionV3 models\nagainst NUI attacks are evaluated. Our results show a substantial decline in\nthe CNN models' classification accuracy when subjected to NUI attacks,\nindicating their vulnerability under non-uniform illumination. To mitigate\nthis, a defense strategy is proposed, including NUI-attacked images, generated\nthrough the new NUI transformation, into the training set. The results\ndemonstrate a significant enhancement in CNN model performance when confronted\nwith perturbed images affected by NUI attacks. This strategy seeks to bolster\nCNN models' resilience against NUI attacks.\n","authors":["Akshay Jain","Shiv Ram Dubey","Satish Kumar Singh","KC Santosh","Bidyut Baran Chaudhuri"],"pdf_url":"https://arxiv.org/pdf/2409.03458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03456v1","updated":"2024-09-05T12:09:02Z","published":"2024-09-05T12:09:02Z","title":"LM-Gaussian: Boost Sparse-view 3D Gaussian Splatting with Large Model\n Priors","summary":" We aim to address sparse-view reconstruction of a 3D scene by leveraging\npriors from large-scale vision models. While recent advancements such as 3D\nGaussian Splatting (3DGS) have demonstrated remarkable successes in 3D\nreconstruction, these methods typically necessitate hundreds of input images\nthat densely capture the underlying scene, making them time-consuming and\nimpractical for real-world applications. However, sparse-view reconstruction is\ninherently ill-posed and under-constrained, often resulting in inferior and\nincomplete outcomes. This is due to issues such as failed initialization,\noverfitting on input images, and a lack of details. To mitigate these\nchallenges, we introduce LM-Gaussian, a method capable of generating\nhigh-quality reconstructions from a limited number of images. Specifically, we\npropose a robust initialization module that leverages stereo priors to aid in\nthe recovery of camera poses and the reliable point clouds. Additionally, a\ndiffusion-based refinement is iteratively applied to incorporate image\ndiffusion priors into the Gaussian optimization process to preserve intricate\nscene details. Finally, we utilize video diffusion priors to further enhance\nthe rendered images for realistic visual effects. Overall, our approach\nsignificantly reduces the data acquisition requirements compared to previous\n3DGS methods. We validate the effectiveness of our framework through\nexperiments on various public datasets, demonstrating its potential for\nhigh-quality 360-degree scene reconstruction. Visual results are on our\nwebsite.\n","authors":["Hanyang Yu","Xiaoxiao Long","Ping Tan"],"pdf_url":"https://arxiv.org/pdf/2409.03456v1.pdf","comment":"Project page: https://hanyangyu1021.github.io/lm-gaussian.github.io/"},{"id":"http://arxiv.org/abs/2409.03455v1","updated":"2024-09-05T12:07:17Z","published":"2024-09-05T12:07:17Z","title":"Data-free Distillation with Degradation-prompt Diffusion for\n Multi-weather Image Restoration","summary":" Multi-weather image restoration has witnessed incredible progress, while the\nincreasing model capacity and expensive data acquisition impair its\napplications in memory-limited devices. 
Data-free distillation provides an\nalternative for allowing to learn a lightweight student model from a\npre-trained teacher model without relying on the original training data. The\nexisting data-free learning methods mainly optimize the models with the pseudo\ndata generated by GANs or the real data collected from the Internet. However,\nthey inevitably suffer from the problems of unstable training or domain shifts\nwith the original data. In this paper, we propose a novel Data-free\nDistillation with Degradation-prompt Diffusion framework for multi-weather\nImage Restoration (D4IR). It replaces GANs with pre-trained diffusion models to\navoid model collapse and incorporates a degradation-aware prompt adapter to\nfacilitate content-driven conditional diffusion for generating domain-related\nimages. Specifically, a contrast-based degradation prompt adapter is firstly\ndesigned to capture degradation-aware prompts from web-collected degraded\nimages. Then, the collected unpaired clean images are perturbed to latent\nfeatures of stable diffusion, and conditioned with the degradation-aware\nprompts to synthesize new domain-related degraded images for knowledge\ndistillation. Experiments illustrate that our proposal achieves comparable\nperformance to the model distilled with original training data, and is even\nsuperior to other mainstream unsupervised methods.\n","authors":["Pei Wang","Xiaotong Luo","Yuan Xie","Yanyun Qu"],"pdf_url":"https://arxiv.org/pdf/2409.03455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00343v2","updated":"2024-09-05T11:58:51Z","published":"2024-08-31T04:19:02Z","title":"EgoHDM: An Online Egocentric-Inertial Human Motion Capture,\n Localization, and Dense Mapping System","summary":" We present EgoHDM, an online egocentric-inertial human motion capture\n(mocap), localization, and dense mapping system. Our system uses 6 inertial\nmeasurement units (IMUs) and a commodity head-mounted RGB camera. EgoHDM is the\nfirst human mocap system that offers dense scene mapping in near real-time.\nFurther, it is fast and robust to initialize and fully closes the loop between\nphysically plausible map-aware global human motion estimation and mocap-aware\n3D scene reconstruction. Our key idea is integrating camera localization and\nmapping information with inertial human motion capture bidirectionally in our\nsystem. To achieve this, we design a tightly coupled mocap-aware dense bundle\nadjustment and physics-based body pose correction module leveraging a local\nbody-centric elevation map. The latter introduces a novel terrain-aware contact\nPD controller, which enables characters to physically contact the given local\nelevation map thereby reducing human floating or penetration. We demonstrate\nthe performance of our system on established synthetic and real-world\nbenchmarks. The results show that our method reduces human localization, camera\npose, and mapping accuracy error by 41%, 71%, 46%, respectively, compared to\nthe state of the art. 
Our qualitative evaluations on newly captured data\nfurther demonstrate that EgoHDM can cover challenging scenarios in non-flat\nterrain including stepping over stairs and outdoor scenes in the wild.\n","authors":["Bonan Liu","Handi Yin","Manuel Kaufmann","Jinhao He","Sammy Christen","Jie Song","Pan Hui"],"pdf_url":"https://arxiv.org/pdf/2409.00343v2.pdf","comment":"Project Page: https://handiyin.github.io/EgoHDM/"},{"id":"http://arxiv.org/abs/2409.03451v1","updated":"2024-09-05T11:58:36Z","published":"2024-09-05T11:58:36Z","title":"Automatic occlusion removal from 3D maps for maritime situational\n awareness","summary":" We introduce a novel method for updating 3D geospatial models, specifically\ntargeting occlusion removal in large-scale maritime environments. Traditional\n3D reconstruction techniques often face problems with dynamic objects, like\ncars or vessels, that obscure the true environment, leading to inaccurate\nmodels or requiring extensive manual editing. Our approach leverages deep\nlearning techniques, including instance segmentation and generative inpainting,\nto directly modify both the texture and geometry of 3D meshes without the need\nfor costly reprocessing. By selectively targeting occluding objects and\npreserving static elements, the method enhances both geometric and visual\naccuracy. This approach not only preserves structural and textural details of\nmap data but also maintains compatibility with current geospatial standards,\nensuring robust performance across diverse datasets. The results demonstrate\nsignificant improvements in 3D model fidelity, making this method highly\napplicable for maritime situational awareness and the dynamic display of\nauxiliary information.\n","authors":["Felix Sattler","Borja Carrillo Perez","Maurice Stephan","Sarah Barnes"],"pdf_url":"https://arxiv.org/pdf/2409.03451v1.pdf","comment":"Preprint of SPIE Sensor + Imaging 2024 conference paper"},{"id":"http://arxiv.org/abs/2409.03438v1","updated":"2024-09-05T11:39:43Z","published":"2024-09-05T11:39:43Z","title":"Shuffle Vision Transformer: Lightweight, Fast and Efficient Recognition\n of Driver Facial Expression","summary":" Existing methods for driver facial expression recognition (DFER) are often\ncomputationally intensive, rendering them unsuitable for real-time\napplications. In this work, we introduce a novel transfer learning-based dual\narchitecture, named ShuffViT-DFER, which elegantly combines computational\nefficiency and accuracy. This is achieved by harnessing the strengths of two\nlightweight and efficient models using convolutional neural network (CNN) and\nvision transformers (ViT). We efficiently fuse the extracted features to\nenhance the performance of the model in accurately recognizing the facial\nexpressions of the driver. Our experimental results on two benchmarking and\npublic datasets, KMU-FED and KDEF, highlight the validity of our proposed\nmethod for real-time application with superior performance when compared to\nstate-of-the-art methods.\n","authors":["Ibtissam Saadi","Douglas W. 
Cunningham","Taleb-ahmed Abdelmalik","Abdenour Hadid","Yassin El Hillali"],"pdf_url":"https://arxiv.org/pdf/2409.03438v1.pdf","comment":"Accepted for publication in The 6th IEEE International Conference on\n Artificial Intelligence Circuits and Systems (IEEE AICAS 2024), 5 pages, 3\n figures"},{"id":"http://arxiv.org/abs/2407.12390v3","updated":"2024-09-05T11:35:21Z","published":"2024-07-17T08:11:37Z","title":"Enhancing Facial Expression Recognition through Dual-Direction Attention\n Mixed Feature Networks: Application to 7th ABAW Challenge","summary":" We present our contribution to the 7th ABAW challenge at ECCV 2024, by\nutilizing a Dual-Direction Attention Mixed Feature Network (DDAMFN) for\nmultitask facial expression recognition, we achieve results far beyond the\nproposed baseline for the Multi-Task ABAW challenge. Our proposal uses the\nwell-known DDAMFN architecture as base to effectively predict valence-arousal,\nemotion recognition, and facial action units. We demonstrate the architecture\nability to handle these tasks simultaneously, providing insights into its\narchitecture and the rationale behind its design. Additionally, we compare our\nresults for a multitask solution with independent single-task performance.\n","authors":["Josep Cabacas-Maso","Elena Ortega-Beltrán","Ismael Benito-Altamirano","Carles Ventura"],"pdf_url":"https://arxiv.org/pdf/2407.12390v3.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2409.03434v1","updated":"2024-09-05T11:35:16Z","published":"2024-09-05T11:35:16Z","title":"A Key-Driven Framework for Identity-Preserving Face Anonymization","summary":" Virtual faces are crucial content in the metaverse. Recently, attempts have\nbeen made to generate virtual faces for privacy protection. Nevertheless, these\nvirtual faces either permanently remove the identifiable information or map the\noriginal identity into a virtual one, which loses the original identity\nforever. In this study, we first attempt to address the conflict between\nprivacy and identifiability in virtual faces, where a key-driven face\nanonymization and authentication recognition (KFAAR) framework is proposed.\nConcretely, the KFAAR framework consists of a head posture-preserving virtual\nface generation (HPVFG) module and a key-controllable virtual face\nauthentication (KVFA) module. The HPVFG module uses a user key to project the\nlatent vector of the original face into a virtual one. Then it maps the virtual\nvectors to obtain an extended encoding, based on which the virtual face is\ngenerated. By simultaneously adding a head posture and facial expression\ncorrection module, the virtual face has the same head posture and facial\nexpression as the original face. During the authentication, we propose a KVFA\nmodule to directly recognize the virtual faces using the correct user key,\nwhich can obtain the original identity without exposing the original face\nimage. We also propose a multi-task learning objective to train HPVFG and KVFA.\nExtensive experiments demonstrate the advantages of the proposed HPVFG and KVFA\nmodules, which effectively achieve both facial anonymity and identifiability.\n","authors":["Miaomiao Wang","Guang Hua","Sheng Li","Guorui Feng"],"pdf_url":"https://arxiv.org/pdf/2409.03434v1.pdf","comment":"Accepted by NDSS Symposium 2025. Please cite this paper as \"Miaomiao\n Wang, Guang Hua, Sheng Li, and Guorui Feng. A Key-Driven Framework for\n Identity-Preserving Face Anonymization. 
In the 32nd Annual Network and\n Distributed System Security Symposium (NDSS 2025).\""},{"id":"http://arxiv.org/abs/2409.03431v1","updated":"2024-09-05T11:23:41Z","published":"2024-09-05T11:23:41Z","title":"UV-Mamba: A DCN-Enhanced State Space Model for Urban Village Boundary\n Identification in High-Resolution Remote Sensing Images","summary":" Owing to the diverse geographical environments, intricate landscapes, and\nhigh-density settlements, the automatic identification of urban village\nboundaries using remote sensing images is a highly challenging task. This paper\nproposes a novel and efficient neural network model called UV-Mamba for\naccurate boundary detection in high-resolution remote sensing images. UV-Mamba\nmitigates the memory loss problem in long sequence modeling, which arises in\nstate space model (SSM) with increasing image size, by incorporating deformable\nconvolutions (DCN). Its architecture utilizes an encoder-decoder framework,\nincludes an encoder with four deformable state space augmentation (DSSA) blocks\nfor efficient multi-level semantic extraction and a decoder to integrate the\nextracted semantic information. We conducted experiments on the Beijing and\nXi'an datasets, and the results show that UV-Mamba achieves state-of-the-art\nperformance. Specifically, our model achieves 73.3% and 78.1% IoU on the\nBeijing and Xi'an datasets, respectively, representing improvements of 1.2% and\n3.4% IoU over the previous best model, while also being 6x faster in inference\nspeed and 40x smaller in parameter count. Source code and pre-trained models\nare available in the supplementary material.\n","authors":["Lulin Li","Ben Chen","Xuechao Zou","Junliang Xing","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2409.03431v1.pdf","comment":"5 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.03424v1","updated":"2024-09-05T11:10:34Z","published":"2024-09-05T11:10:34Z","title":"Weight Conditioning for Smooth Optimization of Neural Networks","summary":" In this article, we introduce a novel normalization technique for neural\nnetwork weight matrices, which we term weight conditioning. This approach aims\nto narrow the gap between the smallest and largest singular values of the\nweight matrices, resulting in better-conditioned matrices. The inspiration for\nthis technique partially derives from numerical linear algebra, where\nwell-conditioned matrices are known to facilitate stronger convergence results\nfor iterative solvers. We provide a theoretical foundation demonstrating that\nour normalization technique smoothens the loss landscape, thereby enhancing\nconvergence of stochastic gradient descent algorithms. Empirically, we validate\nour normalization across various neural network architectures, including\nConvolutional Neural Networks (CNNs), Vision Transformers (ViT), Neural\nRadiance Fields (NeRF), and 3D shape modeling. Our findings indicate that our\nnormalization method is not only competitive but also outperforms existing\nweight normalization techniques from the literature.\n","authors":["Hemanth Saratchandran","Thomas X. 
Wang","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2409.03424v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2409.03420v1","updated":"2024-09-05T11:09:00Z","published":"2024-09-05T11:09:00Z","title":"mPLUG-DocOwl2: High-resolution Compressing for OCR-free Multi-page\n Document Understanding","summary":" Multimodel Large Language Models(MLLMs) have achieved promising OCR-free\nDocument Understanding performance by increasing the supported resolution of\ndocument images. However, this comes at the cost of generating thousands of\nvisual tokens for a single document image, leading to excessive GPU memory and\nslower inference times, particularly in multi-page document comprehension. In\nthis work, to address these challenges, we propose a High-resolution\nDocCompressor module to compress each high-resolution document image into 324\ntokens, guided by low-resolution global visual features. With this compression\nmodule, to strengthen multi-page document comprehension ability and balance\nboth token efficiency and question-answering performance, we develop the\nDocOwl2 under a three-stage training framework: Single-image Pretraining,\nMulti-image Continue-pretraining, and Multi-task Finetuning. DocOwl2 sets a new\nstate-of-the-art across multi-page document understanding benchmarks and\nreduces first token latency by more than 50%, demonstrating advanced\ncapabilities in multi-page questioning answering, explanation with evidence\npages, and cross-page structure understanding. Additionally, compared to\nsingle-image MLLMs trained on similar data, our DocOwl2 achieves comparable\nsingle-page understanding performance with less than 20% of the visual tokens.\nOur codes, models, and data are publicly available at\nhttps://github.com/X-PLUG/mPLUG-DocOwl/tree/main/DocOwl2.\n","authors":["Anwen Hu","Haiyang Xu","Liang Zhang","Jiabo Ye","Ming Yan","Ji Zhang","Qin Jin","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.03420v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.03412v1","updated":"2024-09-05T11:01:48Z","published":"2024-09-05T11:01:48Z","title":"TG-LMM: Enhancing Medical Image Segmentation Accuracy through\n Text-Guided Large Multi-Modal Model","summary":" We propose TG-LMM (Text-Guided Large Multi-Modal Model), a novel approach\nthat leverages textual descriptions of organs to enhance segmentation accuracy\nin medical images. Existing medical image segmentation methods face several\nchallenges: current medical automatic segmentation models do not effectively\nutilize prior knowledge, such as descriptions of organ locations; previous\ntext-visual models focus on identifying the target rather than improving the\nsegmentation accuracy; prior models attempt to use prior knowledge to enhance\naccuracy but do not incorporate pre-trained models. To address these issues,\nTG-LMM integrates prior knowledge, specifically expert descriptions of the\nspatial locations of organs, into the segmentation process. Our model utilizes\npre-trained image and text encoders to reduce the number of training parameters\nand accelerate the training process. Additionally, we designed a comprehensive\nimage-text information fusion structure to ensure thorough integration of the\ntwo modalities of data. We evaluated TG-LMM on three authoritative medical\nimage datasets, encompassing the segmentation of various parts of the human\nbody. 
Our method demonstrated superior performance compared to existing\napproaches, such as MedSAM, SAM and nnUnet.\n","authors":["Yihao Zhao","Enhao Zhong","Cuiyun Yuan","Yang Li","Man Zhao","Chunxia Li","Jun Hu","Chenbin Liu"],"pdf_url":"https://arxiv.org/pdf/2409.03412v1.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.03404v1","updated":"2024-09-05T10:41:17Z","published":"2024-09-05T10:41:17Z","title":"KAN See In the Dark","summary":" Existing low-light image enhancement methods are difficult to fit the complex\nnonlinear relationship between normal and low-light images due to uneven\nillumination and noise effects. The recently proposed Kolmogorov-Arnold\nnetworks (KANs) feature spline-based convolutional layers and learnable\nactivation functions, which can effectively capture nonlinear dependencies. In\nthis paper, we design a KAN-Block based on KANs and innovatively apply it to\nlow-light image enhancement. This method effectively alleviates the limitations\nof current methods constrained by linear network structures and lack of\ninterpretability, further demonstrating the potential of KANs in low-level\nvision tasks. Given the poor perception of current low-light image enhancement\nmethods and the stochastic nature of the inverse diffusion process, we further\nintroduce frequency-domain perception for visually oriented enhancement.\nExtensive experiments demonstrate the competitive performance of our method on\nbenchmark datasets. The code will be available at:\nhttps://github.com/AXNing/KSID}{https://github.com/AXNing/KSID.\n","authors":["Aoxiang Ning","Minglong Xue","Jinhong He","Chengyun Song"],"pdf_url":"https://arxiv.org/pdf/2409.03404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03385v1","updated":"2024-09-05T09:44:43Z","published":"2024-09-05T09:44:43Z","title":"Make Graph-based Referring Expression Comprehension Great Again through\n Expression-guided Dynamic Gating and Regression","summary":" One common belief is that with complex models and pre-training on large-scale\ndatasets, transformer-based methods for referring expression comprehension\n(REC) perform much better than existing graph-based methods. We observe that\nsince most graph-based methods adopt an off-the-shelf detector to locate\ncandidate objects (i.e., regions detected by the object detector), they face\ntwo challenges that result in subpar performance: (1) the presence of\nsignificant noise caused by numerous irrelevant objects during reasoning, and\n(2) inaccurate localization outcomes attributed to the provided detector. To\naddress these issues, we introduce a plug-and-adapt module guided by\nsub-expressions, called dynamic gate constraint (DGC), which can adaptively\ndisable irrelevant proposals and their connections in graphs during reasoning.\nWe further introduce an expression-guided regression strategy (EGR) to refine\nlocation prediction. Extensive experimental results on the RefCOCO, RefCOCO+,\nRefCOCOg, Flickr30K, RefClef, and Ref-reasoning datasets demonstrate the\neffectiveness of the DGC module and the EGR strategy in consistently boosting\nthe performances of various graph-based REC methods. 
Without any pretaining,\nthe proposed graph-based method achieves better performance than the\nstate-of-the-art (SOTA) transformer-based methods.\n","authors":["Jingcheng Ke","Dele Wang","Jun-Cheng Chen","I-Hong Jhuo","Chia-Wen Lin","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2409.03385v1.pdf","comment":"12 pages to appear in IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2409.02046v2","updated":"2024-09-05T09:41:37Z","published":"2024-09-03T16:48:07Z","title":"Human-AI Collaborative Multi-modal Multi-rater Learning for\n Endometriosis Diagnosis","summary":" Endometriosis, affecting about 10\\% of individuals assigned female at birth,\nis challenging to diagnose and manage. Diagnosis typically involves the\nidentification of various signs of the disease using either laparoscopic\nsurgery or the analysis of T1/T2 MRI images, with the latter being quicker and\ncheaper but less accurate. A key diagnostic sign of endometriosis is the\nobliteration of the Pouch of Douglas (POD). However, even experienced\nclinicians struggle with accurately classifying POD obliteration from MRI\nimages, which complicates the training of reliable AI models. In this paper, we\nintroduce the \\underline{H}uman-\\underline{AI} \\underline{Co}llaborative\n\\underline{M}ulti-modal \\underline{M}ulti-rater Learning (HAICOMM) methodology\nto address the challenge above. HAICOMM is the first method that explores three\nimportant aspects of this problem: 1) multi-rater learning to extract a cleaner\nlabel from the multiple ``noisy'' labels available per training sample; 2)\nmulti-modal learning to leverage the presence of T1/T2 MRI images for training\nand testing; and 3) human-AI collaboration to build a system that leverages the\npredictions from clinicians and the AI model to provide more accurate\nclassification than standalone clinicians and AI models. Presenting results on\nthe multi-rater T1/T2 MRI endometriosis dataset that we collected to validate\nour methodology, the proposed HAICOMM model outperforms an ensemble of\nclinicians, noisy-label learning models, and multi-rater learning methods.\n","authors":["Hu Wang","David Butler","Yuan Zhang","Jodie Avery","Steven Knox","Congbo Ma","Louise Hull","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2409.02046v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03048v2","updated":"2024-09-05T09:28:21Z","published":"2024-06-05T08:23:38Z","title":"Giving each task what it needs -- leveraging structured sparsity for\n tailored multi-task learning","summary":" In the Multi-task Learning (MTL) framework, every task demands distinct\nfeature representations, ranging from low-level to high-level attributes. It is\nvital to address the specific (feature/parameter) needs of each task,\nespecially in computationally constrained environments. This work, therefore,\nintroduces Layer-Optimized Multi-Task (LOMT) models that utilize structured\nsparsity to refine feature selection for individual tasks and enhance the\nperformance of all tasks in a multi-task scenario. Structured or group sparsity\nsystematically eliminates parameters from trivial channels and, sometimes,\neventually, entire layers within a convolution neural network during training.\nConsequently, the remaining layers provide the most optimal features for a\ngiven task. 
In this two-step approach, we subsequently leverage this\nsparsity-induced optimal layer information to build the LOMT models by\nconnecting task-specific decoders to these strategically identified layers,\ndeviating from conventional approaches that uniformly connect decoders at the\nend of the network. This tailored architecture optimizes the network, focusing\non essential features while reducing redundancy. We validate the efficacy of\nthe proposed approach on two datasets, i.e., NYU-v2 and CelebAMask-HD datasets,\nfor multiple heterogeneous tasks. A detailed performance analysis of the LOMT\nmodels, in contrast to the conventional MTL models, reveals that the LOMT\nmodels outperform for most task combinations. The excellent qualitative and\nquantitative outcomes highlight the effectiveness of employing structured\nsparsity for optimal layer (or feature) selection.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2406.03048v2.pdf","comment":"Accepted at ECCV 2024 workshop - Computational Aspects of Deep\n Learning"},{"id":"http://arxiv.org/abs/2104.09008v3","updated":"2024-09-05T09:15:34Z","published":"2021-04-19T01:51:21Z","title":"Kernel Adversarial Learning for Real-world Image Super-resolution","summary":" Current deep image super-resolution (SR) approaches aim to restore\nhigh-resolution images from down-sampled images or by assuming degradation from\nsimple Gaussian kernels and additive noises. However, these techniques only\nassume crude approximations of the real-world image degradation process, which\nshould involve complex kernels and noise patterns that are difficult to model\nusing simple assumptions. In this paper, we propose a more realistic process to\nsynthesise low-resolution images for real-world image SR by introducing a new\nKernel Adversarial Learning Super-resolution (KASR) framework. In the proposed\nframework, degradation kernels and noises are adaptively modelled rather than\nexplicitly specified. Moreover, we also propose a high-frequency selective\nobjective and an iterative supervision process to further boost the model SR\nreconstruction accuracy. Extensive experiments validate the effectiveness of\nthe proposed framework on real-world datasets.\n","authors":["Hu Wang","Congbo Ma","Jianpeng Zhang","Wei Emma Zhang","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2104.09008v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03367v1","updated":"2024-09-05T09:14:03Z","published":"2024-09-05T09:14:03Z","title":"TBConvL-Net: A Hybrid Deep Learning Architecture for Robust Medical\n Image Segmentation","summary":" Deep learning has shown great potential for automated medical image\nsegmentation to improve the precision and speed of disease diagnostics.\nHowever, the task presents significant difficulties due to variations in the\nscale, shape, texture, and contrast of the pathologies. Traditional\nconvolutional neural network (CNN) models have certain limitations when it\ncomes to effectively modelling multiscale context information and facilitating\ninformation interaction between skip connections across levels. To overcome\nthese limitations, a novel deep learning architecture is introduced for medical\nimage segmentation, taking advantage of CNNs and vision transformers. 
Our\nproposed model, named TBConvL-Net, involves a hybrid network that combines the\nlocal features of a CNN encoder-decoder architecture with long-range and\ntemporal dependencies using biconvolutional long-short-term memory (LSTM)\nnetworks and vision transformers (ViT). This enables the model to capture\ncontextual channel relationships in the data and account for the uncertainty of\nsegmentation over time. Additionally, we introduce a novel composite loss\nfunction that considers both the segmentation robustness and the boundary\nagreement of the predicted output with the gold standard. Our proposed model\nshows consistent improvement over the state of the art on ten publicly\navailable datasets of seven different medical imaging modalities.\n","authors":["Shahzaib Iqbal","Tariq M. Khan","Syed S. Naqvi","Asim Naveed","Erik Meijering"],"pdf_url":"https://arxiv.org/pdf/2409.03367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02634v2","updated":"2024-09-05T09:11:25Z","published":"2024-09-04T11:55:14Z","title":"Loopy: Taming Audio-Driven Portrait Avatar with Long-Term Motion\n Dependency","summary":" With the introduction of diffusion-based video generation techniques,\naudio-conditioned human video generation has recently achieved significant\nbreakthroughs in both the naturalness of motion and the synthesis of portrait\ndetails. Due to the limited control of audio signals in driving human motion,\nexisting methods often add auxiliary spatial signals to stabilize movements,\nwhich may compromise the naturalness and freedom of motion. In this paper, we\npropose an end-to-end audio-only conditioned video diffusion model named Loopy.\nSpecifically, we designed an inter- and intra-clip temporal module and an\naudio-to-latents module, enabling the model to leverage long-term motion\ninformation from the data to learn natural motion patterns and improving\naudio-portrait movement correlation. This method removes the need for manually\nspecified spatial motion templates used in existing methods to constrain motion\nduring inference. Extensive experiments show that Loopy outperforms recent\naudio-driven portrait diffusion models, delivering more lifelike and\nhigh-quality results across various scenarios.\n","authors":["Jianwen Jiang","Chao Liang","Jiaqi Yang","Gaojie Lin","Tianyun Zhong","Yanbo Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.02634v2.pdf","comment":"Homepage: https://loopyavatar.github.io/"},{"id":"http://arxiv.org/abs/2408.15695v2","updated":"2024-09-05T09:05:39Z","published":"2024-08-28T10:43:42Z","title":"G-Style: Stylized Gaussian Splatting","summary":" We introduce G-Style, a novel algorithm designed to transfer the style of an\nimage onto a 3D scene represented using Gaussian Splatting. Gaussian Splatting\nis a powerful 3D representation for novel view synthesis, as -- compared to\nother approaches based on Neural Radiance Fields -- it provides fast scene\nrenderings and user control over the scene. Recent pre-prints have demonstrated\nthat the style of Gaussian Splatting scenes can be modified using an image\nexemplar. However, since the scene geometry remains fixed during the\nstylization process, current solutions fall short of producing satisfactory\nresults. Our algorithm aims to address these limitations by following a\nthree-step process: In a pre-processing step, we remove undesirable Gaussians\nwith large projection areas or highly elongated shapes. 
Subsequently, we\ncombine several losses carefully designed to preserve different scales of the\nstyle in the image, while maintaining as much as possible the integrity of the\noriginal scene content. During the stylization process and following the\noriginal design of Gaussian Splatting, we split Gaussians where additional\ndetail is necessary within our scene by tracking the gradient of the stylized\ncolor. Our experiments demonstrate that G-Style generates high-quality\nstylizations within just a few minutes, outperforming existing methods both\nqualitatively and quantitatively.\n","authors":["Áron Samuel Kovács","Pedro Hermosilla","Renata G. Raidou"],"pdf_url":"https://arxiv.org/pdf/2408.15695v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03358v1","updated":"2024-09-05T09:01:11Z","published":"2024-09-05T09:01:11Z","title":"MouseSIS: A Frames-and-Events Dataset for Space-Time Instance\n Segmentation of Mice","summary":" Enabled by large annotated datasets, tracking and segmentation of objects in\nvideos has made remarkable progress in recent years. Despite these\nadvancements, algorithms still struggle under degraded conditions and during\nfast movements. Event cameras are novel sensors with high temporal resolution\nand high dynamic range that offer promising advantages to address these\nchallenges. However, annotated data for developing learning-based mask-level\ntracking algorithms with events is not available. To this end, we introduce:\n($i$) a new task termed \\emph{space-time instance segmentation}, similar to\nvideo instance segmentation, whose goal is to segment instances throughout the\nentire duration of the sensor input (here, the input are quasi-continuous\nevents and optionally aligned frames); and ($ii$) \\emph{\\dname}, a dataset for\nthe new task, containing aligned grayscale frames and events. It includes\nannotated ground-truth labels (pixel-level instance segmentation masks) of a\ngroup of up to seven freely moving and interacting mice. We also provide two\nreference methods, which show that leveraging event data can consistently\nimprove tracking performance, especially when used in combination with\nconventional cameras. The results highlight the potential of event-aided\ntracking in difficult scenarios. We hope our dataset opens the field of\nevent-based video instance segmentation and enables the development of robust\ntracking algorithms for challenging\nconditions.\\url{https://github.com/tub-rip/MouseSIS}\n","authors":["Friedhelm Hamann","Hanxiong Li","Paul Mieske","Lars Lewejohann","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2409.03358v1.pdf","comment":"18 pages, 5 figures, ECCV Workshops"},{"id":"http://arxiv.org/abs/2409.03354v1","updated":"2024-09-05T08:55:56Z","published":"2024-09-05T08:55:56Z","title":"Few-Shot Continual Learning for Activity Recognition in Classroom\n Surveillance Images","summary":" The application of activity recognition in the \"AI + Education\" field is\ngaining increasing attention. However, current work mainly focuses on the\nrecognition of activities in manually captured videos and a limited number of\nactivity types, with little attention given to recognizing activities in\nsurveillance images from real classrooms. In real classroom settings, normal\nteaching activities such as reading, account for a large proportion of samples,\nwhile rare non-teaching activities such as eating, continue to appear. 
This\nrequires a model that can learn non-teaching activities from few samples\nwithout forgetting the normal teaching activities, which necessitates fewshot\ncontinual learning (FSCL) capability. To address this gap, we constructed a\ncontinual learning dataset focused on classroom surveillance image activity\nrecognition called ARIC (Activity Recognition in Classroom). The dataset has\nadvantages such as multiple perspectives, a wide variety of activities, and\nreal-world scenarios, but it also presents challenges like similar activities\nand imbalanced sample distribution. To overcome these challenges, we designed a\nfew-shot continual learning method that combines supervised contrastive\nlearning (SCL) and an adaptive covariance classifier (ACC). During the base\nphase, we proposed a SCL approach based on feature augmentation to enhance the\nmodel's generalization ability. In the incremental phase, we employed an ACC to\nmore accurately describe the distribution of new classes. Experimental results\ndemonstrate that our method outperforms other existing methods on the ARIC\ndataset.\n","authors":["Yilei Qian","Kanglei Geng","Kailong Chen","Shaoxu Cheng","Linfeng Xu","Hongliang Li","Fanman Meng","Qingbo Wu"],"pdf_url":"https://arxiv.org/pdf/2409.03354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03336v1","updated":"2024-09-05T08:28:36Z","published":"2024-09-05T08:28:36Z","title":"Eetimating Indoor Scene Depth Maps from Ultrasonic Echoes","summary":" Measuring 3D geometric structures of indoor scenes requires dedicated depth\nsensors, which are not always available. Echo-based depth estimation has\nrecently been studied as a promising alternative solution. All previous studies\nhave assumed the use of echoes in the audible range. However, one major problem\nis that audible echoes cannot be used in quiet spaces or other situations where\nproducing audible sounds is prohibited. In this paper, we consider echo-based\ndepth estimation using inaudible ultrasonic echoes. While ultrasonic waves\nprovide high measurement accuracy in theory, the actual depth estimation\naccuracy when ultrasonic echoes are used has remained unclear, due to its\ndisadvantage of being sensitive to noise and susceptible to attenuation. We\nfirst investigate the depth estimation accuracy when the frequency of the sound\nsource is restricted to the high-frequency band, and found that the accuracy\ndecreased when the frequency was limited to ultrasonic ranges. Based on this\nobservation, we propose a novel deep learning method to improve the accuracy of\nultrasonic echo-based depth estimation by using audible echoes as auxiliary\ndata only during training. Experimental results with a public dataset\ndemonstrate that our method improves the estimation accuracy.\n","authors":["Junpei Honma","Akisato Kimura","Go Irie"],"pdf_url":"https://arxiv.org/pdf/2409.03336v1.pdf","comment":"ICIP 2024"},{"id":"http://arxiv.org/abs/2405.19921v2","updated":"2024-09-05T08:21:01Z","published":"2024-05-30T10:33:14Z","title":"MCDS-VSS: Moving Camera Dynamic Scene Video Semantic Segmentation by\n Filtering with Self-Supervised Geometry and Motion","summary":" Autonomous systems, such as self-driving cars, rely on reliable semantic\nenvironment perception for decision making. Despite great advances in video\nsemantic segmentation, existing approaches ignore important inductive biases\nand lack structured and interpretable internal representations. 
In this work,\nwe propose MCDS-VSS, a structured filter model that learns in a self-supervised\nmanner to estimate scene geometry and ego-motion of the camera, while also\nestimating the motion of external objects. Our model leverages these\nrepresentations to improve the temporal consistency of semantic segmentation\nwithout sacrificing segmentation accuracy. MCDS-VSS follows a prediction-fusion\napproach in which scene geometry and camera motion are first used to compensate\nfor ego-motion, then residual flow is used to compensate motion of dynamic\nobjects, and finally the predicted scene features are fused with the current\nfeatures to obtain a temporally consistent scene segmentation. Our model parses\nautomotive scenes into multiple decoupled interpretable representations such as\nscene geometry, ego-motion, and object motion. Quantitative evaluation shows\nthat MCDS-VSS achieves superior temporal consistency on video sequences while\nretaining competitive segmentation performance.\n","authors":["Angel Villar-Corrales","Moritz Austermann","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2405.19921v2.pdf","comment":"Accepted for publication at BMVC 2024"},{"id":"http://arxiv.org/abs/2409.03326v1","updated":"2024-09-05T07:55:55Z","published":"2024-09-05T07:55:55Z","title":"Enhancing User-Centric Privacy Protection: An Interactive Framework\n through Diffusion Models and Machine Unlearning","summary":" In the realm of multimedia data analysis, the extensive use of image datasets\nhas escalated concerns over privacy protection within such data. Current\nresearch predominantly focuses on privacy protection either in data sharing or\nupon the release of trained machine learning models. Our study pioneers a\ncomprehensive privacy protection framework that safeguards image data privacy\nconcurrently during data sharing and model publication. We propose an\ninteractive image privacy protection framework that utilizes generative machine\nlearning models to modify image information at the attribute level and employs\nmachine unlearning algorithms for the privacy preservation of model parameters.\nThis user-interactive framework allows for adjustments in privacy protection\nintensity based on user feedback on generated images, striking a balance\nbetween maximal privacy safeguarding and maintaining model performance. Within\nthis framework, we instantiate two modules: a differential privacy diffusion\nmodel for protecting attribute information in images and a feature unlearning\nalgorithm for efficient updates of the trained model on the revised image\ndataset. Our approach demonstrated superiority over existing methods on facial\ndatasets across various attribute classifications.\n","authors":["Huaxi Huang","Xin Yuan","Qiyu Liao","Dadong Wang","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2409.03326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03320v1","updated":"2024-09-05T07:49:21Z","published":"2024-09-05T07:49:21Z","title":"YOLO-PPA based Efficient Traffic Sign Detection for Cruise Control in\n Autonomous Driving","summary":" It is very important to detect traffic signs efficiently and accurately in\nautonomous driving systems. However, the farther the distance, the smaller the\ntraffic signs. 
Existing object detection algorithms can hardly detect these\nsmall scaled signs.In addition, the performance of embedded devices on vehicles\nlimits the scale of detection models.To address these challenges, a YOLO PPA\nbased traffic sign detection algorithm is proposed in this paper.The\nexperimental results on the GTSDB dataset show that compared to the original\nYOLO, the proposed method improves inference efficiency by 11.2%. The mAP 50 is\nalso improved by 93.2%, which demonstrates the effectiveness of the proposed\nYOLO PPA.\n","authors":["Jingyu Zhang","Wenqing Zhang","Chaoyi Tan","Xiangtian Li","Qianyi Sun"],"pdf_url":"https://arxiv.org/pdf/2409.03320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16322v2","updated":"2024-09-05T07:47:53Z","published":"2024-08-29T07:49:31Z","title":"BEVal: A Cross-dataset Evaluation Study of BEV Segmentation Models for\n Autononomous Driving","summary":" Current research in semantic bird's-eye view segmentation for autonomous\ndriving focuses solely on optimizing neural network models using a single\ndataset, typically nuScenes. This practice leads to the development of highly\nspecialized models that may fail when faced with different environments or\nsensor setups, a problem known as domain shift. In this paper, we conduct a\ncomprehensive cross-dataset evaluation of state-of-the-art BEV segmentation\nmodels to assess their performance across different training and testing\ndatasets and setups, as well as different semantic categories. We investigate\nthe influence of different sensors, such as cameras and LiDAR, on the models'\nability to generalize to diverse conditions and scenarios. Additionally, we\nconduct multi-dataset training experiments that improve models' BEV\nsegmentation performance compared to single-dataset training. Our work\naddresses the gap in evaluating BEV segmentation models under cross-dataset\nvalidation. And our findings underscore the importance of enhancing model\ngeneralizability and adaptability to ensure more robust and reliable BEV\nsegmentation approaches for autonomous driving applications. The code for this\npaper available at https://github.com/manueldiaz96/beval .\n","authors":["Manuel Alejandro Diaz-Zapata","Wenqian Liu","Robin Baruffa","Christian Laugier"],"pdf_url":"https://arxiv.org/pdf/2408.16322v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01679v2","updated":"2024-09-05T07:44:14Z","published":"2024-09-03T07:42:59Z","title":"Adaptive Explicit Knowledge Transfer for Knowledge Distillation","summary":" Logit-based knowledge distillation (KD) for classification is cost-efficient\ncompared to feature-based KD but often subject to inferior performance.\nRecently, it was shown that the performance of logit-based KD can be improved\nby effectively delivering the probability distribution for the non-target\nclasses from the teacher model, which is known as `implicit (dark) knowledge',\nto the student model. Through gradient analysis, we first show that this\nactually has an effect of adaptively controlling the learning of implicit\nknowledge. Then, we propose a new loss that enables the student to learn\nexplicit knowledge (i.e., the teacher's confidence about the target class)\nalong with implicit knowledge in an adaptive manner. Furthermore, we propose to\nseparate the classification and distillation tasks for effective distillation\nand inter-class relationship modeling. 
Experimental results demonstrate that\nthe proposed method, called adaptive explicit knowledge transfer (AEKT) method,\nachieves improved performance compared to the state-of-the-art KD methods on\nthe CIFAR-100 and ImageNet datasets.\n","authors":["Hyungkeun Park","Jong-Seok Lee"],"pdf_url":"https://arxiv.org/pdf/2409.01679v2.pdf","comment":"19 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.08572v2","updated":"2024-09-05T07:25:35Z","published":"2024-07-11T14:59:31Z","title":"Boosting Adversarial Transferability for Skeleton-based Action\n Recognition via Exploring the Model Posterior Space","summary":" Skeletal motion plays a pivotal role in human activity recognition (HAR).\nRecently, attack methods have been proposed to identify the universal\nvulnerability of skeleton-based HAR(S-HAR). However, the research of\nadversarial transferability on S-HAR is largely missing. More importantly,\nexisting attacks all struggle in transfer across unknown S-HAR models. We\nobserved that the key reason is that the loss landscape of the action\nrecognizers is rugged and sharp. Given the established correlation in prior\nstudies~\\cite{qin2022boosting,wu2020towards} between loss landscape and\nadversarial transferability, we assume and empirically validate that smoothing\nthe loss landscape could potentially improve adversarial transferability on\nS-HAR. This is achieved by proposing a new post-train Dual Bayesian strategy,\nwhich can effectively explore the model posterior space for a collection of\nsurrogates without the need for re-training. Furthermore, to craft adversarial\nexamples along the motion manifold, we incorporate the attack gradient with\ninformation of the motion dynamics in a Bayesian manner. Evaluated on benchmark\ndatasets, e.g. HDM05 and NTU 60, the average transfer success rate can reach as\nhigh as 35.9\\% and 45.5\\% respectively. In comparison, current state-of-the-art\nskeletal attacks achieve only 3.6\\% and 9.8\\%. The high adversarial\ntransferability remains consistent across various surrogate, victim, and even\ndefense models. Through a comprehensive analysis of the results, we provide\ninsights on what surrogates are more likely to exhibit transferability, to shed\nlight on future research.\n","authors":["Yunfeng Diao","Baiqi Wu","Ruixuan Zhang","Xun Yang","Meng Wang","He Wang"],"pdf_url":"https://arxiv.org/pdf/2407.08572v2.pdf","comment":"We have submitted a new version of our work at arXiv:2409.02483. This\n version, arXiv:2407.08572, is no longer valid. Any update for this work will\n be conducted in arXiv:2409.02483"},{"id":"http://arxiv.org/abs/2409.03303v1","updated":"2024-09-05T07:19:03Z","published":"2024-09-05T07:19:03Z","title":"Improving Robustness to Multiple Spurious Correlations by\n Multi-Objective Optimization","summary":" We study the problem of training an unbiased and accurate model given a\ndataset with multiple biases. This problem is challenging since the multiple\nbiases cause multiple undesirable shortcuts during training, and even worse,\nmitigating one may exacerbate the other. We propose a novel training method to\ntackle this challenge. Our method first groups training data so that different\ngroups induce different shortcuts, and then optimizes a linear combination of\ngroup-wise losses while adjusting their weights dynamically to alleviate\nconflicts between the groups in performance; this approach, rooted in the\nmulti-objective optimization theory, encourages to achieve the minimax Pareto\nsolution. 
We also present a new benchmark with multiple biases, dubbed\nMultiCelebA, for evaluating debiased training methods under realistic and\nchallenging scenarios. Our method achieved the best on three datasets with\nmultiple biases, and also showed superior performance on conventional\nsingle-bias datasets.\n","authors":["Nayeong Kim","Juwon Kang","Sungsoo Ahn","Jungseul Ok","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2409.03303v1.pdf","comment":"International Conference on Machine Learning 2024"},{"id":"http://arxiv.org/abs/2408.02049v2","updated":"2024-09-05T06:55:17Z","published":"2024-08-04T14:57:28Z","title":"3D Single-object Tracking in Point Clouds with High Temporal Variation","summary":" The high temporal variation of the point clouds is the key challenge of 3D\nsingle-object tracking (3D SOT). Existing approaches rely on the assumption\nthat the shape variation of the point clouds and the motion of the objects\nacross neighboring frames are smooth, failing to cope with high temporal\nvariation data. In this paper, we present a novel framework for 3D SOT in point\nclouds with high temporal variation, called HVTrack. HVTrack proposes three\nnovel components to tackle the challenges in the high temporal variation\nscenario: 1) A Relative-Pose-Aware Memory module to handle temporal point cloud\nshape variations; 2) a Base-Expansion Feature Cross-Attention module to deal\nwith similar object distractions in expanded search areas; 3) a Contextual\nPoint Guided Self-Attention module for suppressing heavy background noise. We\nconstruct a dataset with high temporal variation (KITTI-HV) by setting\ndifferent frame intervals for sampling in the KITTI dataset. On the KITTI-HV\nwith 5 frame intervals, our HVTrack surpasses the state-of-the-art tracker\nCXTracker by 11.3%/15.7% in Success/Precision.\n","authors":["Qiao Wu","Kun Sun","Pei An","Mathieu Salzmann","Yanning Zhang","Jiaqi Yang"],"pdf_url":"https://arxiv.org/pdf/2408.02049v2.pdf","comment":"Accepted by ECCV24"},{"id":"http://arxiv.org/abs/2409.03277v1","updated":"2024-09-05T06:41:02Z","published":"2024-09-05T06:41:02Z","title":"ChartMoE: Mixture of Expert Connector for Advanced Chart Understanding","summary":" Automatic chart understanding is crucial for content comprehension and\ndocument parsing. Multimodal large language models (MLLMs) have demonstrated\nremarkable capabilities in chart understanding through domain-specific\nalignment and fine-tuning. However, the application of alignment training\nwithin the chart domain is still underexplored. To address this, we propose\nChartMoE, which employs the mixture of expert (MoE) architecture to replace the\ntraditional linear projector to bridge the modality gap. Specifically, we train\nmultiple linear connectors through distinct alignment tasks, which are utilized\nas the foundational initialization parameters for different experts.\nAdditionally, we introduce ChartMoE-Align, a dataset with over 900K\nchart-table-JSON-code quadruples to conduct three alignment tasks\n(chart-table/JSON/code). Combined with the vanilla connector, we initialize\ndifferent experts in four distinct ways and adopt high-quality knowledge\nlearning to further refine the MoE connector and LLM parameters. 
Extensive\nexperiments demonstrate the effectiveness of the MoE connector and our\ninitialization strategy, e.g., ChartMoE improves the accuracy of the previous\nstate-of-the-art from 80.48% to 84.64% on the ChartQA benchmark.\n","authors":["Zhengzhuo Xu","Bowen Qu","Yiyan Qi","Sinan Du","Chengjin Xu","Chun Yuan","Jian Guo"],"pdf_url":"https://arxiv.org/pdf/2409.03277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15966v2","updated":"2024-09-05T06:33:31Z","published":"2024-08-28T17:38:44Z","title":"More Text, Less Point: Towards 3D Data-Efficient Point-Language\n Understanding","summary":" Enabling Large Language Models (LLMs) to comprehend the 3D physical world\nremains a significant challenge. Due to the lack of large-scale 3D-text pair\ndatasets, the success of LLMs has yet to be replicated in 3D understanding. In\nthis paper, we rethink this issue and propose a new task: 3D Data-Efficient\nPoint-Language Understanding. The goal is to enable LLMs to achieve robust 3D\nobject understanding with minimal 3D point cloud and text data pairs. To\naddress this task, we introduce GreenPLM, which leverages more text data to\ncompensate for the lack of 3D data. First, inspired by using CLIP to align\nimages and text, we utilize a pre-trained point cloud-text encoder to map the\n3D point cloud space to the text space. This mapping leaves us to seamlessly\nconnect the text space with LLMs. Once the point-text-LLM connection is\nestablished, we further enhance text-LLM alignment by expanding the\nintermediate text space, thereby reducing the reliance on 3D point cloud data.\nSpecifically, we generate 6M free-text descriptions of 3D objects, and design a\nthree-stage training strategy to help LLMs better explore the intrinsic\nconnections between different modalities. To achieve efficient modality\nalignment, we design a zero-parameter cross-attention module for token pooling.\nExtensive experimental results show that GreenPLM requires only 12% of the 3D\ntraining data used by existing state-of-the-art models to achieve superior 3D\nunderstanding. Remarkably, GreenPLM also achieves competitive performance using\ntext-only data. The code and weights are available at:\nhttps://github.com/TangYuan96/GreenPLM.\n","authors":["Yuan Tang","Xu Han","Xianzhi Li","Qiao Yu","Jinfeng Xu","Yixue Hao","Long Hu","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2408.15966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03272v1","updated":"2024-09-05T06:30:01Z","published":"2024-09-05T06:30:01Z","title":"OccLLaMA: An Occupancy-Language-Action Generative World Model for\n Autonomous Driving","summary":" The rise of multi-modal large language models(MLLMs) has spurred their\napplications in autonomous driving. Recent MLLM-based methods perform action by\nlearning a direct mapping from perception to action, neglecting the dynamics of\nthe world and the relations between action and world dynamics. In contrast,\nhuman beings possess world model that enables them to simulate the future\nstates based on 3D internal visual representation and plan actions accordingly.\nTo this end, we propose OccLLaMA, an occupancy-language-action generative world\nmodel, which uses semantic occupancy as a general visual representation and\nunifies vision-language-action(VLA) modalities through an autoregressive model.\nSpecifically, we introduce a novel VQVAE-like scene tokenizer to efficiently\ndiscretize and reconstruct semantic occupancy scenes, considering its sparsity\nand classes imbalance. 
Then, we build a unified multi-modal vocabulary for\nvision, language and action. Furthermore, we enhance LLM, specifically LLaMA,\nto perform the next token/scene prediction on the unified vocabulary to\ncomplete multiple tasks in autonomous driving. Extensive experiments\ndemonstrate that OccLLaMA achieves competitive performance across multiple\ntasks, including 4D occupancy forecasting, motion planning, and visual question\nanswering, showcasing its potential as a foundation model in autonomous\ndriving.\n","authors":["Julong Wei","Shanshuai Yuan","Pengfei Li","Qingda Hu","Zhongxue Gan","Wenchao Ding"],"pdf_url":"https://arxiv.org/pdf/2409.03272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03270v1","updated":"2024-09-05T06:27:32Z","published":"2024-09-05T06:27:32Z","title":"SVP: Style-Enhanced Vivid Portrait Talking Head Diffusion Model","summary":" Talking Head Generation (THG), typically driven by audio, is an important and\nchallenging task with broad application prospects in various fields such as\ndigital humans, film production, and virtual reality. While diffusion\nmodel-based THG methods present high quality and stable content generation,\nthey often overlook the intrinsic style which encompasses personalized features\nsuch as speaking habits and facial expressions of a video. As consequence, the\ngenerated video content lacks diversity and vividness, thus being limited in\nreal life scenarios. To address these issues, we propose a novel framework\nnamed Style-Enhanced Vivid Portrait (SVP) which fully leverages style-related\ninformation in THG. Specifically, we first introduce the novel probabilistic\nstyle prior learning to model the intrinsic style as a Gaussian distribution\nusing facial expressions and audio embedding. The distribution is learned\nthrough the 'bespoked' contrastive objective, effectively capturing the dynamic\nstyle information in each video. Then we finetune a pretrained Stable Diffusion\n(SD) model to inject the learned intrinsic style as a controlling signal via\ncross attention. Experiments show that our model generates diverse, vivid, and\nhigh-quality videos with flexible control over intrinsic styles, outperforming\nexisting state-of-the-art methods.\n","authors":["Weipeng Tan","Chuming Lin","Chengming Xu","Xiaozhong Ji","Junwei Zhu","Chengjie Wang","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2409.03270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03261v1","updated":"2024-09-05T06:03:52Z","published":"2024-09-05T06:03:52Z","title":"Bones Can't Be Triangles: Accurate and Efficient Vertebrae Keypoint\n Estimation through Collaborative Error Revision","summary":" Recent advances in interactive keypoint estimation methods have enhanced\naccuracy while minimizing user intervention. However, these methods require\nuser input for error correction, which can be costly in vertebrae keypoint\nestimation where inaccurate keypoints are densely clustered or overlap. We\nintroduce a novel approach, KeyBot, specifically designed to identify and\ncorrect significant and typical errors in existing models, akin to user\nrevision. By characterizing typical error types and using simulated errors for\ntraining, KeyBot effectively corrects these errors and significantly reduces\nuser workload. Comprehensive quantitative and qualitative evaluations on three\npublic datasets confirm that KeyBot significantly outperforms existing methods,\nachieving state-of-the-art performance in interactive vertebrae keypoint\nestimation. 
The source code and demo video are available at:\nhttps://ts-kim.github.io/KeyBot/\n","authors":["Jinhee Kim","Taesung Kim","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2409.03261v1.pdf","comment":"33 pages, ECCV 2024, Project Page: https://ts-kim.github.io/KeyBot/"},{"id":"http://arxiv.org/abs/2404.12415v2","updated":"2024-09-05T05:38:13Z","published":"2024-04-17T17:57:20Z","title":"Prediction of soil fertility parameters using USB-microscope imagery and\n portable X-ray fluorescence spectrometry","summary":" This study investigated the use of portable X-ray fluorescence (PXRF)\nspectrometry and soil image analysis for rapid soil fertility assessment, with\na focus on key indicators such as available boron (B), organic carbon (OC),\navailable manganese (Mn), available sulfur (S), and the sulfur availability\nindex (SAI). A total of 1,133 soil samples from diverse agro-climatic zones in\nEastern India were analyzed. The research integrated color and texture features\nfrom microscopic soil images, PXRF data, and auxiliary soil variables (AVs)\nusing a Random Forest model. Results showed that combining image features (IFs)\nwith AVs significantly improved prediction accuracy for available B (R2 = 0.80)\nand OC (R2 = 0.88). A data fusion approach, incorporating IFs, AVs, and PXRF\ndata, further enhanced predictions for available Mn and SAI, with R2 values of\n0.72 and 0.70, respectively. The study highlights the potential of integrating\nthese technologies to offer rapid, cost-effective soil testing methods, paving\nthe way for more advanced predictive models and a deeper understanding of soil\nfertility. Future work should explore the application of deep learning models\non a larger dataset, incorporating soils from a wider range of agro-climatic\nzones under field conditions.\n","authors":["Shubhadip Dasgupta","Satwik Pate","Divya Rathore","L. G. Divyanth","Ayan Das","Anshuman Nayak","Subhadip Dey","Asim Biswas","David C. Weindorf","Bin Li","Sergio Henrique Godinho Silva","Bruno Teixeira Ribeiro","Sanjay Srivastava","Somsubhra Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2404.12415v2.pdf","comment":"Published in 'Soil Advances'"},{"id":"http://arxiv.org/abs/2311.14189v4","updated":"2024-09-05T05:19:56Z","published":"2023-11-23T20:14:50Z","title":"D-SCo: Dual-Stream Conditional Diffusion for Monocular Hand-Held Object\n Reconstruction","summary":" Reconstructing hand-held objects from a single RGB image is a challenging\ntask in computer vision. In contrast to prior works that utilize deterministic\nmodeling paradigms, we employ a point cloud denoising diffusion model to\naccount for the probabilistic nature of this problem. In the core, we introduce\ncentroid-fixed dual-stream conditional diffusion for monocular hand-held object\nreconstruction (D-SCo), tackling two predominant challenges. First, to avoid\nthe object centroid from deviating, we utilize a novel hand-constrained\ncentroid fixing paradigm, enhancing the stability of diffusion and reverse\nprocesses and the precision of feature projection. 
Second, we introduce a\ndual-stream denoiser to semantically and geometrically model hand-object\ninteractions with a novel unified hand-object semantic embedding, enhancing the\nreconstruction performance of the hand-occluded region of the object.\nExperiments on the synthetic ObMan dataset and three real-world datasets HO3D,\nMOW and DexYCB demonstrate that our approach can surpass all other\nstate-of-the-art methods.\n","authors":["Bowen Fu","Gu Wang","Chenyangguang Zhang","Yan Di","Ziqin Huang","Zhiying Leng","Fabian Manhardt","Xiangyang Ji","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2311.14189v4.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2409.03254v1","updated":"2024-09-05T05:18:31Z","published":"2024-09-05T05:18:31Z","title":"Granular-ball Representation Learning for Deep CNN on Learning with\n Label Noise","summary":" In actual scenarios, whether manually or automatically annotated, label noise\nis inevitably generated in the training data, which can affect the\neffectiveness of deep CNN models. The popular solutions require data cleaning\nor designing additional optimizations to punish the data with mislabeled data,\nthereby enhancing the robustness of models. However, these methods come at the\ncost of weakening or even losing some data during the training process. As we\nknow, content is the inherent attribute of an image that does not change with\nchanges in annotations. In this study, we propose a general granular-ball\ncomputing (GBC) module that can be embedded into a CNN model, where the\nclassifier finally predicts the label of granular-ball ($gb$) samples instead\nof each individual samples. Specifically, considering the classification task:\n(1) in forward process, we split the input samples as $gb$ samples at\nfeature-level, each of which can correspond to multiple samples with varying\nnumbers and share one single label; (2) during the backpropagation process, we\nmodify the gradient allocation strategy of the GBC module to enable it to\npropagate normally; and (3) we develop an experience replay policy to ensure\nthe stability of the training process. Experiments demonstrate that the\nproposed method can improve the robustness of CNN models with no additional\ndata or optimization.\n","authors":["Dawei Dai","Hao Zhu","Shuyin Xia","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.03254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03252v1","updated":"2024-09-05T05:09:03Z","published":"2024-09-05T05:09:03Z","title":"Gr-IoU: Ground-Intersection over Union for Robust Multi-Object Tracking\n with 3D Geometric Constraints","summary":" We propose a Ground IoU (Gr-IoU) to address the data association problem in\nmulti-object tracking. When tracking objects detected by a camera, it often\noccurs that the same object is assigned different IDs in consecutive frames,\nespecially when objects are close to each other or overlapping. To address this\nissue, we introduce Gr-IoU, which takes into account the 3D structure of the\nscene. Gr-IoU transforms traditional bounding boxes from the image space to the\nground plane using the vanishing point geometry. The IoU calculated with these\ntransformed bounding boxes is more sensitive to the front-to-back relationships\nof objects, thereby improving data association accuracy and reducing ID\nswitches. We evaluated our Gr-IoU method on the MOT17 and MOT20 datasets, which\ncontain diverse tracking scenarios including crowded scenes and sequences with\nfrequent occlusions. 
Experimental results demonstrated that Gr-IoU outperforms\nconventional real-time methods without appearance features.\n","authors":["Keisuke Toida","Naoki Kato","Osamu Segawa","Takeshi Nakamura","Kazuhiro Hotta"],"pdf_url":"https://arxiv.org/pdf/2409.03252v1.pdf","comment":"Accepted for the ECCV 2024 Workshop on Affective Behavior Analysis\n in-the-wild(ABAW)"},{"id":"http://arxiv.org/abs/2402.11940v3","updated":"2024-09-05T05:06:57Z","published":"2024-02-19T08:27:23Z","title":"AICAttack: Adversarial Image Captioning Attack with Attention-Based\n Optimization","summary":" Recent advances in deep learning research have shown remarkable achievements\nacross many tasks in computer vision (CV) and natural language processing\n(NLP). At the intersection of CV and NLP is the problem of image captioning,\nwhere the related models' robustness against adversarial attacks has not been\nwell studied. This paper presents a novel adversarial attack strategy,\nAICAttack (Attention-based Image Captioning Attack), designed to attack image\ncaptioning models through subtle perturbations on images. Operating within a\nblack-box attack scenario, our algorithm requires no access to the target\nmodel's architecture, parameters, or gradient information. We introduce an\nattention-based candidate selection mechanism that identifies the optimal\npixels to attack, followed by a customised differential evolution method to\noptimise the perturbations of pixels' RGB values. We demonstrate AICAttack's\neffectiveness through extensive experiments on benchmark datasets against\nmultiple victim models. The experimental results demonstrate that our method\noutperforms current leading-edge techniques by achieving consistently higher\nattack success rates.\n","authors":["Jiyao Li","Mingze Ni","Yifei Dong","Tianqing Zhu","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2402.11940v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03249v1","updated":"2024-09-05T04:55:40Z","published":"2024-09-05T04:55:40Z","title":"Multiple weather images restoration using the task transformer and\n adaptive mixup strategy","summary":" The current state-of-the-art in severe weather removal predominantly focuses\non single-task applications, such as rain removal, haze removal, and snow\nremoval. However, real-world weather conditions often consist of a mixture of\nseveral weather types, and the degree of weather mixing in autonomous driving\nscenarios remains unknown. In the presence of complex and diverse weather\nconditions, a single weather removal model often encounters challenges in\nproducing clear images from severe weather images. Therefore, there is a need\nfor the development of multi-task severe weather removal models that can\neffectively handle mixed weather conditions and improve image quality in\nautonomous driving scenarios. In this paper, we introduce a novel multi-task\nsevere weather removal model that can effectively handle complex weather\nconditions in an adaptive manner. Our model incorporates a weather task\nsequence generator, enabling the self-attention mechanism to selectively focus\non features specific to different weather types. To tackle the challenge of\nrepairing large areas of weather degradation, we introduce Fast Fourier\nConvolution (FFC) to increase the receptive field. Additionally, we propose an\nadaptive upsampling technique that effectively processes both the weather task\ninformation and underlying image features by selectively retaining relevant\ninformation. 
Our proposed model has achieved state-of-the-art performance on\nthe publicly available dataset.\n","authors":["Yang Wen","Anyu Lai","Bo Qian","Hao Wang","Wuzhen Shi","Wenming Cao"],"pdf_url":"https://arxiv.org/pdf/2409.03249v1.pdf","comment":"10 pages, 5 figures and 2 table"},{"id":"http://arxiv.org/abs/2409.02097v2","updated":"2024-09-05T04:53:37Z","published":"2024-09-03T17:54:39Z","title":"LinFusion: 1 GPU, 1 Minute, 16K Image","summary":" Modern diffusion models, particularly those utilizing a Transformer-based\nUNet for denoising, rely heavily on self-attention operations to manage complex\nspatial relationships, thus achieving impressive generation performance.\nHowever, this existing paradigm faces significant challenges in generating\nhigh-resolution visual content due to its quadratic time and memory complexity\nwith respect to the number of spatial tokens. To address this limitation, we\naim at a novel linear attention mechanism as an alternative in this paper.\nSpecifically, we begin our exploration from recently introduced models with\nlinear complexity, e.g., Mamba2, RWKV6, Gated Linear Attention, etc, and\nidentify two key features-attention normalization and non-causal inference-that\nenhance high-resolution visual generation performance. Building on these\ninsights, we introduce a generalized linear attention paradigm, which serves as\na low-rank approximation of a wide spectrum of popular linear token mixers. To\nsave the training cost and better leverage pre-trained models, we initialize\nour models and distill the knowledge from pre-trained StableDiffusion (SD). We\nfind that the distilled model, termed LinFusion, achieves performance on par\nwith or superior to the original SD after only modest training, while\nsignificantly reducing time and memory complexity. Extensive experiments on\nSD-v1.5, SD-v2.1, and SD-XL demonstrate that LinFusion delivers satisfactory\nzero-shot cross-resolution generation performance, generating high-resolution\nimages like 16K resolution. Moreover, it is highly compatible with pre-trained\nSD components, such as ControlNet and IP-Adapter, requiring no adaptation\nefforts. Codes are available at https://github.com/Huage001/LinFusion.\n","authors":["Songhua Liu","Weihao Yu","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02097v2.pdf","comment":"Work in Progress. Codes are available at\n https://github.com/Huage001/LinFusion"},{"id":"http://arxiv.org/abs/2309.16126v2","updated":"2024-09-05T04:48:35Z","published":"2023-09-28T03:13:09Z","title":"UVL2: A Unified Framework for Video Tampering Localization","summary":" With the advancement of deep learning-driven video editing technology,\nsecurity risks have emerged. Malicious video tampering can lead to public\nmisunderstanding, property losses, and legal disputes. Currently, detection\nmethods are mostly limited to specific datasets, with limited detection\nperformance for unknown forgeries, and lack of robustness for processed data.\nThis paper proposes an effective video tampering localization network that\nsignificantly improves the detection performance of video inpainting and\nsplicing by extracting more generalized features of forgery traces. Considering\nthe inherent differences between tampered videos and original videos, such as\nedge artifacts, pixel distribution, texture features, and compress information,\nwe have specifically designed four modules to independently extract these\nfeatures. 
Furthermore, to seamlessly integrate these features, we employ a\ntwo-stage approach utilizing both a Convolutional Neural Network and a Vision\nTransformer, enabling us to learn these features in a local-to-global manner.\nExperimental results demonstrate that the method significantly outperforms the\nexisting state-of-the-art methods and exhibits robustness.\n","authors":["Pengfei Pei"],"pdf_url":"https://arxiv.org/pdf/2309.16126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03245v1","updated":"2024-09-05T04:47:36Z","published":"2024-09-05T04:47:36Z","title":"UAV (Unmanned Aerial Vehicles): Diverse Applications of UAV Datasets in\n Segmentation, Classification, Detection, and Tracking","summary":" Unmanned Aerial Vehicles (UAVs), have greatly revolutionized the process of\ngathering and analyzing data in diverse research domains, providing unmatched\nadaptability and effectiveness. This paper presents a thorough examination of\nUnmanned Aerial Vehicle (UAV) datasets, emphasizing their wide range of\napplications and progress. UAV datasets consist of various types of data, such\nas satellite imagery, images captured by drones, and videos. These datasets can\nbe categorized as either unimodal or multimodal, offering a wide range of\ndetailed and comprehensive information. These datasets play a crucial role in\ndisaster damage assessment, aerial surveillance, object recognition, and\ntracking. They facilitate the development of sophisticated models for tasks\nlike semantic segmentation, pose estimation, vehicle re-identification, and\ngesture recognition. By leveraging UAV datasets, researchers can significantly\nenhance the capabilities of computer vision models, thereby advancing\ntechnology and improving our understanding of complex, dynamic environments\nfrom an aerial perspective. This review aims to encapsulate the multifaceted\nutility of UAV datasets, emphasizing their pivotal role in driving innovation\nand practical applications in multiple domains.\n","authors":["Md. Mahfuzur Rahman","Sunzida Siddique","Marufa Kamal","Rakib Hossain Rifat","Kishor Datta Gupta"],"pdf_url":"https://arxiv.org/pdf/2409.03245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03236v1","updated":"2024-09-05T04:13:13Z","published":"2024-09-05T04:13:13Z","title":"Unveiling Context-Related Anomalies: Knowledge Graph Empowered\n Decoupling of Scene and Action for Human-Related Video Anomaly Detection","summary":" Detecting anomalies in human-related videos is crucial for surveillance\napplications. Current methods primarily include appearance-based and\naction-based techniques. Appearance-based methods rely on low-level visual\nfeatures such as color, texture, and shape. They learn a large number of pixel\npatterns and features related to known scenes during training, making them\neffective in detecting anomalies within these familiar contexts. However, when\nencountering new or significantly changed scenes, i.e., unknown scenes, they\noften fail because existing SOTA methods do not effectively capture the\nrelationship between actions and their surrounding scenes, resulting in low\ngeneralization. In contrast, action-based methods focus on detecting anomalies\nin human actions but are usually less informative because they tend to overlook\nthe relationship between actions and their scenes, leading to incorrect\ndetection. 
For instance, the normal event of running on the beach and the\nabnormal event of running on the street might both be considered normal due to\nthe lack of scene information. In short, current methods struggle to integrate\nlow-level visual and high-level action features, leading to poor anomaly\ndetection in varied and complex scenes. To address this challenge, we propose a\nnovel decoupling-based architecture for human-related video anomaly detection\n(DecoAD). DecoAD significantly improves the integration of visual and action\nfeatures through the decoupling and interweaving of scenes and actions, thereby\nenabling a more intuitive and accurate understanding of complex behaviors and\nscenes. DecoAD supports fully supervised, weakly supervised, and unsupervised\nsettings.\n","authors":["Chenglizhao Chen","Xinyu Liu","Mengke Song","Luming Li","Xu Yu","Shanchen Pang"],"pdf_url":"https://arxiv.org/pdf/2409.03236v1.pdf","comment":"13pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.06512v2","updated":"2024-09-05T03:59:05Z","published":"2024-07-09T02:47:58Z","title":"LuSNAR:A Lunar Segmentation, Navigation and Reconstruction Dataset based\n on Muti-sensor for Autonomous Exploration","summary":" With the complexity of lunar exploration missions, the moon needs to have a\nhigher level of autonomy. Environmental perception and navigation algorithms\nare the foundation for lunar rovers to achieve autonomous exploration. The\ndevelopment and verification of algorithms require highly reliable data\nsupport. Most of the existing lunar datasets are targeted at a single task,\nlacking diverse scenes and high-precision ground truth labels. To address this\nissue, we propose a multi-task, multi-scene, and multi-label lunar benchmark\ndataset LuSNAR. This dataset can be used for comprehensive evaluation of\nautonomous perception and navigation systems, including high-resolution stereo\nimage pairs, panoramic semantic labels, dense depth maps, LiDAR point clouds,\nand the position of rover. In order to provide richer scene data, we built 9\nlunar simulation scenes based on Unreal Engine. Each scene is divided according\nto topographic relief and the density of objects. To verify the usability of\nthe dataset, we evaluated and analyzed the algorithms of semantic segmentation,\n3D reconstruction, and autonomous navigation. The experiment results prove that\nthe dataset proposed in this paper can be used for ground verification of tasks\nsuch as autonomous environment perception and navigation, and provides a lunar\nbenchmark dataset for testing the accessibility of algorithm metrics. 
We make\nLuSNAR publicly available at: https://github.com/autumn999999/LuSNAR-dataset.\n","authors":["Jiayi Liu","Qianyu Zhang","Xue Wan","Shengyang Zhang","Yaolin Tian","Haodong Han","Yutao Zhao","Baichuan Liu","Zeyuan Zhao","Xubo Luo"],"pdf_url":"https://arxiv.org/pdf/2407.06512v2.pdf","comment":"19 pages, 13 figures, 11 tables"},{"id":"http://arxiv.org/abs/2409.03228v1","updated":"2024-09-05T03:55:37Z","published":"2024-09-05T03:55:37Z","title":"Labeled-to-Unlabeled Distribution Alignment for Partially-Supervised\n Multi-Organ Medical Image Segmentation","summary":" Partially-supervised multi-organ medical image segmentation aims to develop a\nunified semantic segmentation model by utilizing multiple partially-labeled\ndatasets, with each dataset providing labels for a single class of organs.\nHowever, the limited availability of labeled foreground organs and the absence\nof supervision to distinguish unlabeled foreground organs from the background\npose a significant challenge, which leads to a distribution mismatch between\nlabeled and unlabeled pixels. Although existing pseudo-labeling methods can be\nemployed to learn from both labeled and unlabeled pixels, they are prone to\nperformance degradation in this task, as they rely on the assumption that\nlabeled and unlabeled pixels have the same distribution. In this paper, to\naddress the problem of distribution mismatch, we propose a labeled-to-unlabeled\ndistribution alignment (LTUDA) framework that aligns feature distributions and\nenhances discriminative capability. Specifically, we introduce a cross-set data\naugmentation strategy, which performs region-level mixing between labeled and\nunlabeled organs to reduce distribution discrepancy and enrich the training\nset. Besides, we propose a prototype-based distribution alignment method that\nimplicitly reduces intra-class variation and increases the separation between\nthe unlabeled foreground and background. This can be achieved by encouraging\nconsistency between the outputs of two prototype classifiers and a linear\nclassifier. Extensive experimental results on the AbdomenCT-1K dataset and a\nunion of four benchmark datasets (including LiTS, MSD-Spleen, KiTS, and NIH82)\ndemonstrate that our method outperforms the state-of-the-art\npartially-supervised methods by a considerable margin, and even surpasses the\nfully-supervised methods. The source code is publicly available at\nhttps://github.com/xjiangmed/LTUDA.\n","authors":["Xixi Jiang","Dong Zhang","Xiang Li","Kangyi Liu","Kwang-Ting Cheng","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2409.03228v1.pdf","comment":"Accepted by Medical Image Analysis"},{"id":"http://arxiv.org/abs/2409.03223v1","updated":"2024-09-05T03:42:11Z","published":"2024-09-05T03:42:11Z","title":"Why mamba is effective? Exploit Linear Transformer-Mamba Network for\n Multi-Modality Image Fusion","summary":" Multi-modality image fusion aims to integrate the merits of images from\ndifferent sources and render high-quality fusion images. However, existing\nfeature extraction and fusion methods are either constrained by inherent local\nreduction bias and static parameters during inference (CNN) or limited by\nquadratic computational complexity (Transformers), and cannot effectively\nextract and fuse features. To solve this problem, we propose a dual-branch\nimage fusion network called Tmamba. It consists of linear Transformer and\nMamba, which has global modeling capabilities while maintaining linear\ncomplexity. 
Due to the difference between the Transformer and Mamba structures,\nthe features extracted by the two branches carry channel and position\ninformation respectively. T-M interaction structure is designed between the two\nbranches, using global learnable parameters and convolutional layers to\ntransfer position and channel information respectively. We further propose\ncross-modal interaction at the attention level to obtain cross-modal attention.\nExperiments show that our Tmamba achieves promising results in multiple fusion\ntasks, including infrared-visible image fusion and medical image fusion. Code\nwith checkpoints will be available after the peer-review process.\n","authors":["Chenguang Zhu","Shan Gao","Huafeng Chen","Guangqian Guo","Chaowei Wang","Yaoxing Wang","Chen Shu Lei","Quanjiang Fan"],"pdf_url":"https://arxiv.org/pdf/2409.03223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01105v4","updated":"2024-09-05T03:38:08Z","published":"2024-02-02T02:44:59Z","title":"A Survey for Foundation Models in Autonomous Driving","summary":" The advent of foundation models has revolutionized the fields of natural\nlanguage processing and computer vision, paving the way for their application\nin autonomous driving (AD). This survey presents a comprehensive review of more\nthan 40 research papers, demonstrating the role of foundation models in\nenhancing AD. Large language models contribute to planning and simulation in\nAD, particularly through their proficiency in reasoning, code generation and\ntranslation. In parallel, vision foundation models are increasingly adapted for\ncritical tasks such as 3D object detection and tracking, as well as creating\nrealistic driving scenarios for simulation and testing. Multi-modal foundation\nmodels, integrating diverse inputs, exhibit exceptional visual understanding\nand spatial reasoning, crucial for end-to-end AD. This survey not only provides\na structured taxonomy, categorizing foundation models based on their modalities\nand functionalities within the AD domain but also delves into the methods\nemployed in current research. It identifies the gaps between existing\nfoundation models and cutting-edge AD approaches, thereby charting future\nresearch directions and proposing a roadmap for bridging these gaps.\n","authors":["Haoxiang Gao","Zhongruo Wang","Yaqian Li","Kaiwen Long","Ming Yang","Yiqing Shen"],"pdf_url":"https://arxiv.org/pdf/2402.01105v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01876v2","updated":"2024-09-05T03:31:28Z","published":"2024-09-03T13:19:31Z","title":"CyberHost: Taming Audio-driven Avatar Diffusion Model with Region\n Codebook Attention","summary":" Diffusion-based video generation technology has advanced significantly,\ncatalyzing a proliferation of research in human animation. However, the\nmajority of these studies are confined to same-modality driving settings, with\ncross-modality human body animation remaining relatively underexplored. In this\npaper, we introduce, an end-to-end audio-driven human animation framework that\nensures hand integrity, identity consistency, and natural motion. The key\ndesign of CyberHost is the Region Codebook Attention mechanism, which improves\nthe generation quality of facial and hand animations by integrating\nfine-grained local features with learned motion pattern priors. 
Furthermore, we\nhave developed a suite of human-prior-guided training strategies, including\nbody movement map, hand clarity score, pose-aligned reference feature, and\nlocal enhancement supervision, to improve synthesis results. To our knowledge,\nCyberHost is the first end-to-end audio-driven human diffusion model capable of\nfacilitating zero-shot video generation within the scope of human body.\nExtensive experiments demonstrate that CyberHost surpasses previous works in\nboth quantitative and qualitative aspects.\n","authors":["Gaojie Lin","Jianwen Jiang","Chao Liang","Tianyun Zhong","Jiaqi Yang","Yanbo Zheng"],"pdf_url":"https://arxiv.org/pdf/2409.01876v2.pdf","comment":"Homepage: https://cyberhost.github.io/"},{"id":"http://arxiv.org/abs/2403.19386v2","updated":"2024-09-05T03:18:11Z","published":"2024-03-28T12:51:15Z","title":"PointCloud-Text Matching: Benchmark Datasets and a Baseline","summary":" In this paper, we present and study a new instance-level retrieval task:\nPointCloud-Text Matching~(PTM), which aims to find the exact cross-modal\ninstance that matches a given point-cloud query or text query. PTM could be\napplied to various scenarios, such as indoor/urban-canyon localization and\nscene retrieval. However, there exists no suitable and targeted dataset for PTM\nin practice. Therefore, we construct three new PTM benchmark datasets, namely\n3D2T-SR, 3D2T-NR, and 3D2T-QA. We observe that the data is challenging and with\nnoisy correspondence due to the sparsity, noise, or disorder of point clouds\nand the ambiguity, vagueness, or incompleteness of texts, which make existing\ncross-modal matching methods ineffective for PTM. To tackle these challenges,\nwe propose a PTM baseline, named Robust PointCloud-Text Matching method (RoMa).\nRoMa consists of two modules: a Dual Attention Perception module (DAP) and a\nRobust Negative Contrastive Learning module (RNCL). Specifically, DAP leverages\ntoken-level and feature-level attention to adaptively focus on useful local and\nglobal features, and aggregate them into common representations, thereby\nreducing the adverse impact of noise and ambiguity. To handle noisy\ncorrespondence, RNCL divides negative pairs, which are much less error-prone\nthan positive pairs, into clean and noisy subsets, and assigns them forward and\nreverse optimization directions respectively, thus enhancing robustness against\nnoisy correspondence. We conduct extensive experiments on our benchmarks and\ndemonstrate the superiority of our RoMa.\n","authors":["Yanglin Feng","Yang Qin","Dezhong Peng","Hongyuan Zhu","Xi Peng","Peng Hu"],"pdf_url":"https://arxiv.org/pdf/2403.19386v2.pdf","comment":"Upon further consideration, we have concluded that the current\n version requires significant revision and may not yet be ready for\n publication. We plan to conduct additional experiments and make the necessary\n improvements to ensure the paper meets the standards for future submission"},{"id":"http://arxiv.org/abs/2409.03213v1","updated":"2024-09-05T03:18:04Z","published":"2024-09-05T03:18:04Z","title":"Optimizing 3D Gaussian Splatting for Sparse Viewpoint Scene\n Reconstruction","summary":" 3D Gaussian Splatting (3DGS) has emerged as a promising approach for 3D scene\nrepresentation, offering a reduction in computational overhead compared to\nNeural Radiance Fields (NeRF). 
However, 3DGS is susceptible to high-frequency\nartifacts and demonstrates suboptimal performance under sparse viewpoint\nconditions, thereby limiting its applicability in robotics and computer vision.\nTo address these limitations, we introduce SVS-GS, a novel framework for Sparse\nViewpoint Scene reconstruction that integrates a 3D Gaussian smoothing filter\nto suppress artifacts. Furthermore, our approach incorporates a Depth Gradient\nProfile Prior (DGPP) loss with a dynamic depth mask to sharpen edges and 2D\ndiffusion with Score Distillation Sampling (SDS) loss to enhance geometric\nconsistency in novel view synthesis. Experimental evaluations on the\nMipNeRF-360 and SeaThru-NeRF datasets demonstrate that SVS-GS markedly improves\n3D reconstruction from sparse viewpoints, offering a robust and efficient\nsolution for scene understanding in robotics and computer vision applications.\n","authors":["Shen Chen","Jiale Zhou","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2409.03213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03212v1","updated":"2024-09-05T03:16:41Z","published":"2024-09-05T03:16:41Z","title":"Bi-capacity Choquet Integral for Sensor Fusion with Label Uncertainty","summary":" Sensor fusion combines data from multiple sensor sources to improve\nreliability, robustness, and accuracy of data interpretation. The Fuzzy\nIntegral (FI), in particular, the Choquet integral (ChI), is often used as a\npowerful nonlinear aggregator for fusion across multiple sensors. However,\nexisting supervised ChI learning algorithms typically require precise training\nlabels for each input data point, which can be difficult or impossible to\nobtain. Additionally, prior work on ChI fusion is often based only on the\nnormalized fuzzy measures, which bounds the fuzzy measure values between [0,\n1]. This can be limiting in cases where the underlying scales of input data\nsources are bipolar (i.e., between [-1, 1]). To address these challenges, this\npaper proposes a novel Choquet integral-based fusion framework, named Bi-MIChI\n(pronounced \"bi-mi-kee\"), which uses bi-capacities to represent the\ninteractions between pairs of subsets of the input sensor sources on a bi-polar\nscale. This allows for extended non-linear interactions between the sensor\nsources and can lead to interesting fusion results. Bi-MIChI also addresses\nlabel uncertainty through Multiple Instance Learning, where training labels are\napplied to \"bags\" (sets) of data instead of per-instance. Our proposed Bi-MIChI\nframework shows effective classification and detection performance on both\nsynthetic and real-world experiments for sensor fusion with label uncertainty.\nWe also provide detailed analyses on the behavior of the fuzzy measures to\ndemonstrate our fusion process.\n","authors":["Hersh Vakharia","Xiaoxiao Du"],"pdf_url":"https://arxiv.org/pdf/2409.03212v1.pdf","comment":"10 pages, 7 figures, 7 tables; Accepted to 2024 FUZZ-IEEE and\n presented at 2024 IEEE WCCI; Code available at\n https://github.com/hvak/Bi-MIChI"},{"id":"http://arxiv.org/abs/2409.03209v1","updated":"2024-09-05T03:07:26Z","published":"2024-09-05T03:07:26Z","title":"iSeg: An Iterative Refinement-based Framework for Training-free\n Segmentation","summary":" Stable diffusion has demonstrated strong image synthesis ability to given\ntext descriptions, suggesting it to contain strong semantic clue for grouping\nobjects. Inspired by this, researchers have explored employing stable diffusion\nfor trainingfree segmentation. 
Most existing approaches either simply employ\ncross-attention map or refine it by self-attention map, to generate\nsegmentation masks. We believe that iterative refinement with self-attention\nmap would lead to better results. However, we empirically demonstrate that such\na refinement is sub-optimal likely due to the self-attention map containing\nirrelevant global information which hampers accurately refining cross-attention\nmap with multiple iterations. To address this, we propose an iterative\nrefinement framework for training-free segmentation, named iSeg, having an\nentropy-reduced self-attention module which utilizes a gradient descent scheme\nto reduce the entropy of self-attention map, thereby suppressing the weak\nresponses corresponding to irrelevant global information. Leveraging the\nentropy-reduced self-attention module, our iSeg stably improves refined\ncross-attention map with iterative refinement. Further, we design a\ncategory-enhanced cross-attention module to generate accurate cross-attention\nmap, providing a better initial input for iterative refinement. Extensive\nexperiments across different datasets and diverse segmentation tasks reveal the\nmerits of proposed contributions, leading to promising performance on diverse\nsegmentation tasks. For unsupervised semantic segmentation on Cityscapes, our\niSeg achieves an absolute gain of 3.8% in terms of mIoU compared to the best\nexisting training-free approach in literature. Moreover, our proposed iSeg can\nsupport segmentation with different kinds of images and interactions.\n","authors":["Lin Sun","Jiale Cao","Jin Xie","Fahad Shahbaz Khan","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2409.03209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03206v1","updated":"2024-09-05T02:54:17Z","published":"2024-09-05T02:54:17Z","title":"TC-LLaVA: Rethinking the Transfer from Image to Video Understanding with\n Temporal Considerations","summary":" Multimodal Large Language Models (MLLMs) have significantly improved\nperformance across various image-language applications. Recently, there has\nbeen a growing interest in adapting image pre-trained MLLMs for video-related\ntasks. However, most efforts concentrate on enhancing the vision encoder and\nprojector components, while the core part, Large Language Models (LLMs),\nremains comparatively under-explored. In this paper, we propose two strategies\nto enhance the model's capability in video understanding tasks by improving\ninter-layer attention computation in LLMs. Specifically, the first approach\nfocuses on the enhancement of Rotary Position Embedding (RoPE) with\nTemporal-Aware Dual RoPE, which introduces temporal position information to\nstrengthen the MLLM's temporal modeling capabilities while preserving the\nrelative position relationships of both visual and text tokens. The second\napproach involves enhancing the Attention Mask with the Frame-wise Block Causal\nAttention Mask, a simple yet effective method that broadens visual token\ninteractions within and across video frames while maintaining the causal\ninference mechanism. Based on these proposed methods, we adapt LLaVA for video\nunderstanding tasks, naming it Temporal-Considered LLaVA (TC-LLaVA). 
Our\nTC-LLaVA achieves new state-of-the-art performance across various video\nunderstanding benchmarks with only supervised fine-tuning (SFT) on\nvideo-related datasets.\n","authors":["Mingze Gao","Jingyu Liu","Mingda Li","Jiangtao Xie","Qingbin Liu","Bo Zhao","Xi Chen","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2409.03206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03200v1","updated":"2024-09-05T02:46:36Z","published":"2024-09-05T02:46:36Z","title":"Active Fake: DeepFake Camouflage","summary":" DeepFake technology has gained significant attention due to its ability to\nmanipulate facial attributes with high realism, raising serious societal\nconcerns. Face-Swap DeepFake is the most harmful among these techniques, which\nfabricates behaviors by swapping original faces with synthesized ones. Existing\nforensic methods, primarily based on Deep Neural Networks (DNNs), effectively\nexpose these manipulations and have become important authenticity indicators.\nHowever, these methods mainly concentrate on capturing the blending\ninconsistency in DeepFake faces, raising a new security issue, termed Active\nFake, which emerges when individuals intentionally create blending inconsistency in\ntheir authentic videos to evade responsibility. This tactic is called DeepFake\nCamouflage. To achieve this, we introduce a new framework for creating DeepFake\ncamouflage that generates blending inconsistencies while ensuring\nimperceptibility, effectiveness, and transferability. This framework, optimized\nvia an adversarial learning strategy, crafts imperceptible yet effective\ninconsistencies to mislead forensic detectors. Extensive experiments\ndemonstrate the effectiveness and robustness of our method, highlighting the\nneed for further research in active fake detection.\n","authors":["Pu Sun","Honggang Qi","Yuezun Li"],"pdf_url":"https://arxiv.org/pdf/2409.03200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03198v1","updated":"2024-09-05T02:41:18Z","published":"2024-09-05T02:41:18Z","title":"RoomDiffusion: A Specialized Diffusion Model in the Interior Design\n Industry","summary":" Recent advancements in text-to-image diffusion models have significantly\ntransformed visual content generation, yet their application in specialized\nfields such as interior design remains underexplored. In this paper, we present\nRoomDiffusion, a pioneering diffusion model meticulously tailored for the\ninterior design industry. To begin with, we build from scratch a whole data\npipeline to update and evaluate data for iterative model optimization.\nSubsequently, techniques such as multi-aspect training, multi-stage fine-tuning\nand model fusion are applied to enhance both the visual appeal and precision of\nthe generated results. Lastly, leveraging the latent consistency distillation\nmethod, we distill and expedite the model for optimal efficiency. Unlike\nexisting models optimized for general scenarios, RoomDiffusion addresses\nspecific challenges in interior design, such as lack of fashion, high furniture\nduplication rate, and inaccurate style. 
Through our holistic human evaluation\nprotocol with more than 20 professional human evaluators, RoomDiffusion\ndemonstrates industry-leading performance in terms of aesthetics, accuracy, and\nefficiency, surpassing all existing open source models such as stable diffusion\nand SDXL.\n","authors":["Zhaowei Wang","Ying Hao","Hao Wei","Qing Xiao","Lulu Chen","Yulong Li","Yue Yang","Tianyi Li"],"pdf_url":"https://arxiv.org/pdf/2409.03198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03192v1","updated":"2024-09-05T02:32:07Z","published":"2024-09-05T02:32:07Z","title":"PEPL: Precision-Enhanced Pseudo-Labeling for Fine-Grained Image\n Classification in Semi-Supervised Learning","summary":" Fine-grained image classification has witnessed significant advancements with\nthe advent of deep learning and computer vision technologies. However, the\nscarcity of detailed annotations remains a major challenge, especially in\nscenarios where obtaining high-quality labeled data is costly or\ntime-consuming. To address this limitation, we introduce the Precision-Enhanced\nPseudo-Labeling (PEPL) approach specifically designed for fine-grained image\nclassification within a semi-supervised learning framework. Our method\nleverages the abundance of unlabeled data by generating high-quality\npseudo-labels that are progressively refined through two key phases: initial\npseudo-label generation and semantic-mixed pseudo-label generation. These\nphases utilize Class Activation Maps (CAMs) to accurately estimate the semantic\ncontent and generate refined labels that capture the essential details\nnecessary for fine-grained classification. By focusing on semantic-level\ninformation, our approach effectively addresses the limitations of standard\ndata augmentation and image-mixing techniques in preserving critical\nfine-grained features. We achieve state-of-the-art performance on benchmark\ndatasets, demonstrating significant improvements over existing semi-supervised\nstrategies, with notable boosts in accuracy and robustness. Our code has been\nopen sourced at https://github.com/TianSuya/SemiFG.\n","authors":["Bowen Tian","Songning Lai","Lujundong Li","Zhihao Shuai","Runwei Guan","Tian Wu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2409.03192v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2404.13993v4","updated":"2024-09-05T02:21:42Z","published":"2024-04-22T08:59:35Z","title":"Zero-Shot Character Identification and Speaker Prediction in Comics via\n Iterative Multimodal Fusion","summary":" Recognizing characters and predicting speakers of dialogue are critical for\ncomic processing tasks, such as voice generation or translation. However,\nbecause characters vary by comic title, supervised learning approaches like\ntraining character classifiers which require specific annotations for each\ncomic title are infeasible. This motivates us to propose a novel zero-shot\napproach, allowing machines to identify characters and predict speaker names\nbased solely on unannotated comic images. In spite of their importance in\nreal-world applications, these tasks have largely remained unexplored due to\nchallenges in story comprehension and multimodal integration. Recent large\nlanguage models (LLMs) have shown great capability for text understanding and\nreasoning, while their application to multimodal content analysis is still an\nopen problem. 
To address this problem, we propose an iterative multimodal\nframework, the first to employ multimodal information for both character\nidentification and speaker prediction tasks. Our experiments demonstrate the\neffectiveness of the proposed framework, establishing a robust baseline for\nthese tasks. Furthermore, since our method requires no training data or\nannotations, it can be used as-is on any comic series.\n","authors":["Yingxuan Li","Ryota Hinami","Kiyoharu Aizawa","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2404.13993v4.pdf","comment":"Accepted to ACM Multimedia 2024. Project page:\n https://liyingxuan1012.github.io/zeroshot-speaker-prediction ; Github repo:\n https://github.com/liyingxuan1012/zeroshot-speaker-prediction"},{"id":"http://arxiv.org/abs/2407.08150v3","updated":"2024-09-05T02:21:05Z","published":"2024-07-11T03:00:26Z","title":"Hypergraph Multi-modal Large Language Model: Exploiting EEG and\n Eye-tracking Modalities to Evaluate Heterogeneous Responses for Video\n Understanding","summary":" Understanding of video creativity and content often varies among individuals,\nwith differences in focal points and cognitive levels across different ages,\nexperiences, and genders. There is currently a lack of research in this area,\nand most existing benchmarks suffer from several drawbacks: 1) a limited number\nof modalities and answers with restrictive length; 2) the content and scenarios\nwithin the videos are excessively monotonous, transmitting allegories and\nemotions that are overly simplistic. To bridge the gap to real-world\napplications, we introduce a large-scale Subjective Response Indicators for\nAdvertisement Videos dataset, namely SRI-ADV. Specifically, we collected real\nchanges in Electroencephalographic (EEG) and eye-tracking regions from\ndifferent demographics while they viewed identical video content. Utilizing\nthis multi-modal dataset, we developed tasks and protocols to analyze and\nevaluate the extent of cognitive understanding of video content among different\nusers. Along with the dataset, we designed a Hypergraph Multi-modal Large\nLanguage Model (HMLLM) to explore the associations among different\ndemographics, video elements, EEG, and eye-tracking indicators. HMLLM could\nbridge semantic gaps across rich modalities and integrate information beyond\ndifferent modalities to perform logical reasoning. Extensive experimental\nevaluations on SRI-ADV and other additional video-based generative performance\nbenchmarks demonstrate the effectiveness of our method. The codes and dataset\nwill be released at https://github.com/mininglamp-MLLM/HMLLM.\n","authors":["Minghui Wu","Chenxu Zhao","Anyang Su","Donglin Di","Tianyu Fu","Da An","Min He","Ya Gao","Meng Ma","Kun Yan","Ping Wang"],"pdf_url":"https://arxiv.org/pdf/2407.08150v3.pdf","comment":"Accepted by ACM MULTIMEDIA 2024"},{"id":"http://arxiv.org/abs/2409.03179v1","updated":"2024-09-05T02:14:04Z","published":"2024-09-05T02:14:04Z","title":"Perceptual-Distortion Balanced Image Super-Resolution is a\n Multi-Objective Optimization Problem","summary":" Training Single-Image Super-Resolution (SISR) models using pixel-based\nregression losses can achieve high distortion metrics scores (e.g., PSNR and\nSSIM), but often results in blurry images due to insufficient recovery of\nhigh-frequency details. Conversely, using GAN or perceptual losses can produce\nsharp images with high perceptual metric scores (e.g., LPIPS), but may\nintroduce artifacts and incorrect textures. 
Balancing these two types of losses\ncan help achieve a trade-off between distortion and perception, but the\nchallenge lies in tuning the loss function weights. To address this issue, we\npropose a novel method that incorporates Multi-Objective Optimization (MOO)\ninto the training process of SISR models to balance perceptual quality and\ndistortion. We conceptualize the relationship between loss weights and image\nquality assessment (IQA) metrics as black-box objective functions to be\noptimized within our Multi-Objective Bayesian Optimization Super-Resolution\n(MOBOSR) framework. This approach automates the hyperparameter tuning process,\nreduces overall computational cost, and enables the use of numerous loss\nfunctions simultaneously. Extensive experiments demonstrate that MOBOSR\noutperforms state-of-the-art methods in terms of both perceptual quality and\ndistortion, significantly advancing the perception-distortion Pareto frontier.\nOur work points towards a new direction for future research on balancing\nperceptual quality and fidelity in nearly all image restoration tasks. The\nsource code and pretrained models are available at:\nhttps://github.com/ZhuKeven/MOBOSR.\n","authors":["Qiwen Zhu","Yanjie Wang","Shilv Cai","Liqun Chen","Jiahuan Zhou","Luxin Yan","Sheng Zhong","Xu Zou"],"pdf_url":"https://arxiv.org/pdf/2409.03179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12912v4","updated":"2024-09-05T01:57:51Z","published":"2023-11-21T17:27:20Z","title":"Q-Seg: Quantum Annealing-Based Unsupervised Image Segmentation","summary":" We present Q-Seg, a novel unsupervised image segmentation method based on\nquantum annealing, tailored for existing quantum hardware. We formulate the\npixel-wise segmentation problem, which assimilates spectral and spatial\ninformation of the image, as a graph-cut optimization task. Our method\nefficiently leverages the interconnected qubit topology of the D-Wave Advantage\ndevice, offering superior scalability over existing quantum approaches and\noutperforming several tested state-of-the-art classical methods. Empirical\nevaluations on synthetic datasets have shown that Q-Seg has better runtime\nperformance than the state-of-the-art classical optimizer Gurobi. The method\nhas also been tested on earth observation image segmentation, a critical area\nwith noisy and unreliable annotations. In the era of noisy intermediate-scale\nquantum, Q-Seg emerges as a reliable contender for real-world applications in\ncomparison to advanced techniques like Segment Anything. Consequently, Q-Seg\noffers a promising solution using available quantum hardware, especially in\nsituations constrained by limited labeled data and the need for efficient\ncomputational runtime.\n","authors":["Supreeth Mysore Venkatesh","Antonio Macaluso","Marlon Nuske","Matthias Klusch","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2311.12912v4.pdf","comment":"12 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2405.15580v3","updated":"2024-09-05T01:54:38Z","published":"2024-05-24T14:07:57Z","title":"Open-Vocabulary SAM3D: Towards Training-free Open-Vocabulary 3D Scene\n Understanding","summary":" Open-vocabulary 3D scene understanding presents a significant challenge in\nthe field. Recent works have sought to transfer knowledge embedded in\nvision-language models from 2D to 3D domains. However, these approaches often\nrequire prior knowledge from specific 3D scene datasets, limiting their\napplicability in open-world scenarios. 
The Segment Anything Model (SAM) has\ndemonstrated remarkable zero-shot segmentation capabilities, prompting us to\ninvestigate its potential for comprehending 3D scenes without training. In this\npaper, we introduce OV-SAM3D, a training-free method that contains a universal\nframework for understanding open-vocabulary 3D scenes. This framework is\ndesigned to perform understanding tasks for any 3D scene without requiring\nprior knowledge of the scene. Specifically, our method is composed of two key\nsub-modules: First, we initiate the process by generating superpoints as the\ninitial 3D prompts and refine these prompts using segment masks derived from\nSAM. Moreover, we then integrate a specially designed overlapping score table\nwith open tags from the Recognize Anything Model (RAM) to produce final 3D\ninstances with open-world labels. Empirical evaluations on the ScanNet200 and\nnuScenes datasets demonstrate that our approach surpasses existing\nopen-vocabulary methods in unknown open-world environments.\n","authors":["Hanchen Tai","Qingdong He","Jiangning Zhang","Yijie Qian","Zhenyu Zhang","Xiaobin Hu","Xiangtai Li","Yabiao Wang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2405.15580v3.pdf","comment":"Project page: https://hithqd.github.io/projects/OV-SAM3D"},{"id":"http://arxiv.org/abs/2401.08174v5","updated":"2024-09-05T00:59:53Z","published":"2024-01-16T07:33:22Z","title":"An Efficient Instance Segmentation Framework Using Segmentation\n Foundation Models with Oriented Bounding Box Prompts","summary":" Instance segmentation in unmanned aerial vehicle measurement is a\nlong-standing challenge. Since horizontal bounding boxes introduce many\ninterference objects, oriented bounding boxes (OBBs) are usually used for\ninstance identification. However, based on ``segmentation within bounding box''\nparadigm, current instance segmentation methods using OBBs are overly dependent\non bounding box detection performance. To tackle this, this paper proposes\nOBSeg, an efficient instance segmentation framework using OBBs. OBSeg is based\non box prompt-based segmentation foundation models (BSMs), e.g., Segment\nAnything Model. Specifically, OBSeg first detects OBBs to distinguish instances\nand provide coarse localization information. Then, it predicts OBB\nprompt-related masks for fine segmentation. Since OBBs only serve as prompts,\nOBSeg alleviates the over-dependence on bounding box detection performance of\ncurrent instance segmentation methods using OBBs. In addition, to enable BSMs\nto handle OBB prompts, we propose a novel OBB prompt encoder. To make OBSeg\nmore lightweight and further improve the performance of lightweight distilled\nBSMs, a Gaussian smoothing-based knowledge distillation method is introduced.\nExperiments demonstrate that OBSeg outperforms current instance segmentation\nmethods on multiple public datasets. The code is available at\nhttps://github.com/zhen6618/OBBInstanceSegmentation.\n","authors":["Zhen Zhou","Junfeng Fan","Yunkai Ma","Sihan Zhao","Fengshui Jing","Min Tan"],"pdf_url":"https://arxiv.org/pdf/2401.08174v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04370v3","updated":"2024-09-05T00:55:07Z","published":"2024-05-07T14:51:05Z","title":"Diff-IP2D: Diffusion-Based Hand-Object Interaction Prediction on\n Egocentric Videos","summary":" Understanding how humans would behave during hand-object interaction is vital\nfor applications in service robot manipulation and extended reality. 
To achieve\nthis, some recent works have been proposed to simultaneously forecast hand\ntrajectories and object affordances on human egocentric videos. The joint\nprediction serves as a comprehensive representation of future hand-object\ninteractions in 2D space, indicating potential human motion and motivation.\nHowever, the existing approaches mostly adopt the autoregressive paradigm for\nunidirectional prediction, which lacks mutual constraints within the holistic\nfuture sequence, and accumulates errors along the time axis. Meanwhile, these\nworks basically overlook the effect of camera egomotion on first-person view\npredictions. To address these limitations, we propose a novel diffusion-based\ninteraction prediction method, namely Diff-IP2D, to forecast future hand\ntrajectories and object affordances concurrently in an iterative\nnon-autoregressive manner. We transform the sequential 2D images into latent\nfeature space and design a denoising diffusion model to predict future latent\ninteraction features conditioned on past ones. Motion features are further\nintegrated into the conditional denoising process to enable Diff-IP2D aware of\nthe camera wearer's dynamics for more accurate interaction prediction.\nExtensive experiments demonstrate that our method significantly outperforms the\nstate-of-the-art baselines on both the off-the-shelf metrics and our newly\nproposed evaluation protocol. This highlights the efficacy of leveraging a\ngenerative paradigm for 2D hand-object interaction prediction. The code of\nDiff-IP2D will be released at https://github.com/IRMVLab/Diff-IP2D.\n","authors":["Junyi Ma","Jingyi Xu","Xieyuanli Chen","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2405.04370v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.13393v4","updated":"2024-09-05T00:18:40Z","published":"2022-02-27T16:34:10Z","title":"TransKD: Transformer Knowledge Distillation for Efficient Semantic\n Segmentation","summary":" Semantic segmentation benchmarks in the realm of autonomous driving are\ndominated by large pre-trained transformers, yet their widespread adoption is\nimpeded by substantial computational costs and prolonged training durations. To\nlift this constraint, we look at efficient semantic segmentation from a\nperspective of comprehensive knowledge distillation and aim to bridge the gap\nbetween multi-source knowledge extractions and transformer-specific patch\nembeddings. We put forward the Transformer-based Knowledge Distillation\n(TransKD) framework which learns compact student transformers by distilling\nboth feature maps and patch embeddings of large teacher transformers, bypassing\nthe long pre-training process and reducing the FLOPs by >85.0%. 
Specifically,\nwe propose two fundamental modules to realize feature map distillation and\npatch embedding distillation, respectively: (1) Cross Selective Fusion (CSF)\nenables knowledge transfer between cross-stage features via channel attention\nand feature map distillation within hierarchical transformers; (2) Patch\nEmbedding Alignment (PEA) performs dimensional transformation within the\npatchifying process to facilitate the patch embedding distillation.\nFurthermore, we introduce two optimization modules to enhance the patch\nembedding distillation from different perspectives: (1) Global-Local Context\nMixer (GL-Mixer) extracts both global and local information of a representative\nembedding; (2) Embedding Assistant (EA) acts as an embedding method to\nseamlessly bridge teacher and student models with the teacher's number of\nchannels. Experiments on Cityscapes, ACDC, NYUv2, and Pascal VOC2012 datasets\nshow that TransKD outperforms state-of-the-art distillation frameworks and\nrivals the time-consuming pre-training method. The source code is publicly\navailable at https://github.com/RuipingL/TransKD.\n","authors":["Ruiping Liu","Kailun Yang","Alina Roitberg","Jiaming Zhang","Kunyu Peng","Huayao Liu","Yaonan Wang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2202.13393v4.pdf","comment":"Accepted to IEEE Transactions on Intelligent Transportation Systems\n (T-ITS). The source code is publicly available at\n https://github.com/RuipingL/TransKD"},{"id":"http://arxiv.org/abs/2409.03945v1","updated":"2024-09-05T23:54:32Z","published":"2024-09-05T23:54:32Z","title":"TropNNC: Structured Neural Network Compression Using Tropical Geometry","summary":" We present TropNNC, a structured pruning framework for compressing neural\nnetworks with linear and convolutional layers and ReLU activations. Our\napproximation is based on a geometrical approach to machine/deep learning,\nusing tropical geometry and extending the work of Misiakos et al. (2022). We\nuse the Hausdorff distance of zonotopes in its standard continuous form to\nachieve a tighter approximation bound for tropical polynomials compared to\nMisiakos et al. (2022). This enhancement allows for superior functional\napproximations of neural networks, leading to a more effective compression\nalgorithm. Our method is significantly easier to implement compared to other\nframeworks, and does not depend on the availability of training data samples.\nWe validate our framework through extensive empirical evaluations on the MNIST,\nCIFAR, and ImageNet datasets. Our results demonstrate that TropNNC achieves\nperformance on par with the state-of-the-art method ThiNet, even surpassing it\nin compressing linear layers, and to the best of our knowledge, it is the first\nmethod that achieves this using tropical geometry.\n","authors":["Konstantinos Fotopoulos","Petros Maragos","Panagiotis Misiakos"],"pdf_url":"https://arxiv.org/pdf/2409.03945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03944v1","updated":"2024-09-05T23:50:57Z","published":"2024-09-05T23:50:57Z","title":"HUMOS: Human Motion Model Conditioned on Body Shape","summary":" Generating realistic human motion is essential for many computer vision and\ngraphics applications. The wide variety of human body shapes and sizes greatly\nimpacts how people move. However, most existing motion models ignore these\ndifferences, relying on a standardized, average body. 
This leads to uniform\nmotion across different body types, where movements don't match their physical\ncharacteristics, limiting diversity. To solve this, we introduce a new approach\nto develop a generative motion model based on body shape. We show that it's\npossible to train this model using unpaired data by applying cycle consistency,\nintuitive physics, and stability constraints, which capture the relationship\nbetween identity and movement. The resulting model generates diverse,\nphysically plausible, and dynamically stable human motions that are both\nquantitatively and qualitatively more realistic than current state-of-the-art\nmethods. More details are available on our project page\nhttps://CarstenEpic.github.io/humos/.\n","authors":["Shashank Tripathi","Omid Taheri","Christoph Lassner","Michael J. Black","Daniel Holden","Carsten Stoll"],"pdf_url":"https://arxiv.org/pdf/2409.03944v1.pdf","comment":"Accepted in ECCV'24. Project page:\n https://CarstenEpic.github.io/humos/"},{"id":"http://arxiv.org/abs/2409.03938v1","updated":"2024-09-05T23:07:21Z","published":"2024-09-05T23:07:21Z","title":"Deep Clustering of Remote Sensing Scenes through Heterogeneous Transfer\n Learning","summary":" This paper proposes a method for unsupervised whole-image clustering of a\ntarget dataset of remote sensing scenes with no labels. The method consists of\nthree main steps: (1) finetuning a pretrained deep neural network (DINOv2) on a\nlabelled source remote sensing imagery dataset and using it to extract a\nfeature vector from each image in the target dataset, (2) reducing the\ndimension of these deep features via manifold projection into a low-dimensional\nEuclidean space, and (3) clustering the embedded features using a Bayesian\nnonparametric technique to infer the number and membership of clusters\nsimultaneously. The method takes advantage of heterogeneous transfer learning\nto cluster unseen data with different feature and label distributions. We\ndemonstrate the performance of this approach outperforming state-of-the-art\nzero-shot classification methods on several remote sensing scene classification\ndatasets.\n","authors":["Isaac Ray","Alexei Skurikhin"],"pdf_url":"https://arxiv.org/pdf/2409.03938v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03929v1","updated":"2024-09-05T22:31:53Z","published":"2024-09-05T22:31:53Z","title":"Data-Efficient Generation for Dataset Distillation","summary":" While deep learning techniques have proven successful in image-related tasks,\nthe exponentially increased data storage and computation costs become a\nsignificant challenge. Dataset distillation addresses these challenges by\nsynthesizing only a few images for each class that encapsulate all essential\ninformation. Most current methods focus on matching. The problems lie in the\nsynthetic images not being human-readable and the dataset performance being\ninsufficient for downstream learning tasks. Moreover, the distillation time can\nquickly get out of bounds when the number of synthetic images per class\nincreases even slightly. To address this, we train a class conditional latent\ndiffusion model capable of generating realistic synthetic images with labels.\nThe sampling time can be reduced to several tens of images per seconds. We\ndemonstrate that models can be effectively trained using only a small set of\nsynthetic images and evaluated on a large real test set. 
Our approach achieved\nrank \\(1\\) in The First Dataset Distillation Challenge at ECCV 2024 on the\nCIFAR100 and TinyImageNet datasets.\n","authors":["Zhe Li","Weitong Zhang","Sarah Cechnicka","Bernhard Kainz"],"pdf_url":"https://arxiv.org/pdf/2409.03929v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.03913v1","updated":"2024-09-05T21:22:48Z","published":"2024-09-05T21:22:48Z","title":"Image Recognition for Garbage Classification Based on Pixel Distribution\n Learning","summary":" The exponential growth in waste production due to rapid economic and\nindustrial development necessitates efficient waste management strategies to\nmitigate environmental pollution and resource depletion. Leveraging\nadvancements in computer vision, this study proposes a novel approach inspired\nby pixel distribution learning techniques to enhance automated garbage\nclassification. The method aims to address limitations of conventional\nconvolutional neural network (CNN)-based approaches, including computational\ncomplexity and vulnerability to image variations. We will conduct experiments\nusing the Kaggle Garbage Classification dataset, comparing our approach with\nexisting models to demonstrate the strength and efficiency of pixel\ndistribution learning in automated garbage classification technologies.\n","authors":["Jenil Kanani"],"pdf_url":"https://arxiv.org/pdf/2409.03913v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09050v2","updated":"2024-09-05T21:17:13Z","published":"2024-07-12T07:18:05Z","title":"Refusing Safe Prompts for Multi-modal Large Language Models","summary":" Multimodal large language models (MLLMs) have become the cornerstone of\ntoday's generative AI ecosystem, sparking intense competition among tech giants\nand startups. In particular, an MLLM generates a text response given a prompt\nconsisting of an image and a question. While state-of-the-art MLLMs use safety\nfilters and alignment techniques to refuse unsafe prompts, in this work, we\nintroduce MLLM-Refusal, the first method that induces refusals for safe\nprompts. In particular, our MLLM-Refusal optimizes a nearly-imperceptible\nrefusal perturbation and adds it to an image, causing target MLLMs to likely\nrefuse a safe prompt containing the perturbed image and a safe question.\nSpecifically, we formulate MLLM-Refusal as a constrained optimization problem\nand propose an algorithm to solve it. Our method offers competitive advantages\nfor MLLM model providers by potentially disrupting user experiences of\ncompeting MLLMs, since competing MLLM's users will receive unexpected refusals\nwhen they unwittingly use these perturbed images in their prompts. We evaluate\nMLLM-Refusal on four MLLMs across four datasets, demonstrating its\neffectiveness in causing competing MLLMs to refuse safe prompts while not\naffecting non-competing MLLMs. Furthermore, we explore three potential\ncountermeasures-adding Gaussian noise, DiffPure, and adversarial training. Our\nresults show that though they can mitigate MLLM-Refusal's effectiveness, they\nalso sacrifice the accuracy and/or efficiency of the competing MLLM. 
The code\nis available at https://github.com/Sadcardation/MLLM-Refusal.\n","authors":["Zedian Shao","Hongbin Liu","Yuepeng Hu","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2407.09050v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03911v1","updated":"2024-09-05T21:08:25Z","published":"2024-09-05T21:08:25Z","title":"The Role of Generative Systems in Historical Photography Management: A\n Case Study on Catalan Archives","summary":" The use of image analysis in automated photography management is an\nincreasing trend in heritage institutions. Such tools alleviate the human cost\nassociated with the manual and expensive annotation of new data sources while\nfacilitating fast access to the citizenship through online indexes and search\nengines. However, available tagging and description tools are usually designed\naround modern photographs in English, neglecting historical corpora in\nminoritized languages, each of which exhibits intrinsic particularities. The\nprimary objective of this research is to study the quantitative contribution of\ngenerative systems in the description of historical sources. This is done by\ncontextualizing the task of captioning historical photographs from the Catalan\narchives as a case study. Our findings provide practitioners with tools and\ndirections on transfer learning for captioning models based on visual\nadaptation and linguistic proximity.\n","authors":["Èric Śanchez","Adrià Molina","Oriol Ramos Terrades"],"pdf_url":"https://arxiv.org/pdf/2409.03911v1.pdf","comment":"Accepted at ECCV workshop AI4DH"},{"id":"http://arxiv.org/abs/2312.01397v3","updated":"2024-09-05T20:29:23Z","published":"2023-12-03T13:50:24Z","title":"Visual Prompting Upgrades Neural Network Sparsification: A Data-Model\n Perspective","summary":" The rapid development of large-scale deep learning models questions the\naffordability of hardware platforms, which necessitates the pruning to reduce\ntheir computational and memory footprints. Sparse neural networks as the\nproduct, have demonstrated numerous favorable benefits like low complexity,\nundamaged generalization, etc. Most of the prominent pruning strategies are\ninvented from a model-centric perspective, focusing on searching and preserving\ncrucial weights by analyzing network topologies. However, the role of data and\nits interplay with model-centric pruning has remained relatively unexplored. In\nthis research, we introduce a novel data-model co-design perspective: to\npromote superior weight sparsity by learning important model topology and\nadequate input data in a synergetic manner. Specifically, customized Visual\nPrompts are mounted to upgrade neural Network sparsification in our proposed\nVPNs framework. As a pioneering effort, this paper conducts systematic\ninvestigations about the impact of different visual prompts on model pruning\nand suggests an effective joint optimization approach. Extensive experiments\nwith 3 network architectures and 8 datasets evidence the substantial\nperformance improvements from VPNs over existing state-of-the-art pruning\nalgorithms. Furthermore, we find that subnetworks discovered by VPNs from\npre-trained models enjoy better transferability across diverse downstream\nscenarios. 
These insights shed light on new promising possibilities of\ndata-model co-designs for vision model sparsification.\n","authors":["Can Jin","Tianjin Huang","Yihua Zhang","Mykola Pechenizkiy","Sijia Liu","Shiwei Liu","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.01397v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03486v2","updated":"2024-09-05T20:23:19Z","published":"2024-05-06T13:57:03Z","title":"UnsafeBench: Benchmarking Image Safety Classifiers on Real-World and\n AI-Generated Images","summary":" With the advent of text-to-image models and concerns about their misuse,\ndevelopers are increasingly relying on image safety classifiers to moderate\ntheir generated unsafe images. Yet, the performance of current image safety\nclassifiers remains unknown for both real-world and AI-generated images. In\nthis work, we propose UnsafeBench, a benchmarking framework that evaluates the\neffectiveness and robustness of image safety classifiers, with a particular\nfocus on the impact of AI-generated images on their performance. First, we\ncurate a large dataset of 10K real-world and AI-generated images that are\nannotated as safe or unsafe based on a set of 11 unsafe categories of images\n(sexual, violent, hateful, etc.). Then, we evaluate the effectiveness and\nrobustness of five popular image safety classifiers, as well as three\nclassifiers that are powered by general-purpose visual language models. Our\nassessment indicates that existing image safety classifiers are not\ncomprehensive and effective enough to mitigate the multifaceted problem of\nunsafe images. Also, there exists a distribution shift between real-world and\nAI-generated images in image qualities, styles, and layouts, leading to\ndegraded effectiveness and robustness. Motivated by these findings, we build a\ncomprehensive image moderation tool called PerspectiveVision, which addresses\nthe main drawbacks of existing classifiers with improved effectiveness and\nrobustness, especially on AI-generated images. UnsafeBench and\nPerspectiveVision can aid the research community in better understanding the\nlandscape of image safety classification in the era of generative AI.\n","authors":["Yiting Qu","Xinyue Shen","Yixin Wu","Michael Backes","Savvas Zannettou","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.03486v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03901v1","updated":"2024-09-05T20:21:49Z","published":"2024-09-05T20:21:49Z","title":"On-board Satellite Image Classification for Earth Observation: A\n Comparative Study of Pre-Trained Vision Transformer Models","summary":" Remote sensing image classification is a critical component of Earth\nobservation (EO) systems, traditionally dominated by convolutional neural\nnetworks (CNNs) and other deep learning techniques. However, the advent of\nTransformer-based architectures and large-scale pre-trained models has\nsignificantly shifted, offering enhanced performance and efficiency. This study\nfocuses on identifying the most effective pre-trained model for land use\nclassification in onboard satellite processing, emphasizing achieving high\naccuracy, computational efficiency, and robustness against noisy data\nconditions commonly encountered during satellite-based inference. Through\nextensive experimentation, we compared traditional CNN-based models,\nResNet-based models, and various pre-trained vision Transformer models. 
Our\nfindings demonstrate that pre-trained Transformer models, particularly\nMobileViTV2 and EfficientViT-M2, outperform models trained from scratch in\naccuracy and efficiency. These models achieve high performance with reduced\ncomputational requirements and exhibit greater resilience during inference\nunder noisy conditions. While MobileViTV2 excelled on clean validation data,\nEfficientViT-M2 proved more robust when handling noise, making it the most\nsuitable model for onboard satellite Earth observation tasks. In conclusion,\nEfficientViT-M2 is the optimal choice for reliable and efficient remote sensing\nimage classification in satellite operations, achieving 98.76\\% accuracy,\nprecision, and recall. Specifically, EfficientViT-M2 delivered the highest\nperformance across all metrics, excelled in training efficiency (1,000s) and\ninference time (10s), and demonstrated greater robustness (overall robustness\nscore at 0.79).\n","authors":["Thanh-Dung Le","Vu Nguyen Ha","Ti Ti Nguyen","Geoffrey Eappen","Prabhu Thiruvasagam","Luis M. Garces-Socarras","Hong-fu Chou","Jorge L. Gonzalez-Rios","Juan Carlos Merlano-Duncan","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2409.03901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09164v3","updated":"2024-09-05T19:56:09Z","published":"2024-02-14T13:30:02Z","title":"Less is More: Fewer Interpretable Region via Submodular Subset Selection","summary":" Image attribution algorithms aim to identify important regions that are\nhighly relevant to model decisions. Although existing attribution solutions can\neffectively assign importance to target elements, they still face the following\nchallenges: 1) existing attribution methods generate inaccurate small regions\nthus misleading the direction of correct attribution, and 2) the model cannot\nproduce good attribution results for samples with wrong predictions. To address\nthe above challenges, this paper re-models the above image attribution problem\nas a submodular subset selection problem, aiming to enhance model\ninterpretability using fewer regions. To address the lack of attention to local\nregions, we construct a novel submodular function to discover more accurate\nsmall interpretation regions. To enhance the attribution effect for all\nsamples, we also impose four different constraints on the selection of\nsub-regions, i.e., confidence, effectiveness, consistency, and collaboration\nscores, to assess the importance of various subsets. Moreover, our theoretical\nanalysis substantiates that the proposed function is in fact submodular.\nExtensive experiments show that the proposed method outperforms SOTA methods on\ntwo face datasets (Celeb-A and VGG-Face2) and one fine-grained dataset\n(CUB-200-2011). For correctly predicted samples, the proposed method improves\nthe Deletion and Insertion scores with an average of 4.9% and 2.5% gain\nrelative to HSIC-Attribution. For incorrectly predicted samples, our method\nachieves gains of 81.0% and 18.4% compared to the HSIC-Attribution algorithm in\nthe average highest confidence and Insertion score respectively. 
The code is\nreleased at https://github.com/RuoyuChen10/SMDL-Attribution.\n","authors":["Ruoyu Chen","Hua Zhang","Siyuan Liang","Jingzhi Li","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2402.09164v3.pdf","comment":"Accepted to ICLR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2409.03890v1","updated":"2024-09-05T19:55:38Z","published":"2024-09-05T19:55:38Z","title":"MVTN: A Multiscale Video Transformer Network for Hand Gesture\n Recognition","summary":" In this paper, we introduce a novel Multiscale Video Transformer Network\n(MVTN) for dynamic hand gesture recognition, since multiscale features can\nextract features with variable size, pose, and shape of hand which is a\nchallenge in hand gesture recognition. The proposed model incorporates a\nmultiscale feature hierarchy to capture diverse levels of detail and context\nwithin hand gestures which enhances the model's ability. This multiscale\nhierarchy is obtained by extracting different dimensions of attention in\ndifferent transformer stages with initial stages to model high-resolution\nfeatures and later stages to model low-resolution features. Our approach also\nleverages multimodal data, utilizing depth maps, infrared data, and surface\nnormals along with RGB images from NVGesture and Briareo datasets. Experiments\nshow that the proposed MVTN achieves state-of-the-art results with less\ncomputational complexity and parameters. The source code is available at\nhttps://github.com/mallikagarg/MVTN.\n","authors":["Mallika Garg","Debashis Ghosh","Pyari Mohan Pradhan"],"pdf_url":"https://arxiv.org/pdf/2409.03890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03889v1","updated":"2024-09-05T19:52:09Z","published":"2024-09-05T19:52:09Z","title":"Recon-all-clinical: Cortical surface reconstruction and analysis of\n heterogeneous clinical brain MRI","summary":" Surface-based analysis of the cerebral cortex is ubiquitous in human\nneuroimaging with MRI. It is crucial for cortical registration, parcellation,\nand thickness estimation. Traditionally, these analyses require\nhigh-resolution, isotropic scans with good gray-white matter contrast,\ntypically a 1mm T1-weighted scan. This excludes most clinical MRI scans, which\nare often anisotropic and lack the necessary T1 contrast. To enable large-scale\nneuroimaging studies using vast clinical data, we introduce recon-all-clinical,\na novel method for cortical reconstruction, registration, parcellation, and\nthickness estimation in brain MRI scans of any resolution and contrast. Our\napproach employs a hybrid analysis method that combines a convolutional neural\nnetwork (CNN) trained with domain randomization to predict signed distance\nfunctions (SDFs) and classical geometry processing for accurate surface\nplacement while maintaining topological and geometric constraints. The method\ndoes not require retraining for different acquisitions, thus simplifying the\nanalysis of heterogeneous clinical datasets. We tested recon-all-clinical on\nmultiple datasets, including over 19,000 clinical scans. The method\nconsistently produced precise cortical reconstructions and high parcellation\naccuracy across varied MRI contrasts and resolutions. Cortical thickness\nestimates are precise enough to capture aging effects independently of MRI\ncontrast, although accuracy varies with slice thickness. 
Our method is publicly\navailable at https://surfer.nmr.mgh.harvard.edu/fswiki/recon-all-clinical,\nenabling researchers to perform detailed cortical analysis on the huge amounts\nof already existing clinical MRI scans. This advancement may be particularly\nvaluable for studying rare diseases and underrepresented populations where\nresearch-grade MRI data is scarce.\n","authors":["Karthik Gopinath","Douglas N. Greve","Colin Magdamo","Steve Arnold","Sudeshna Das","Oula Puonti","Juan Eugenio Iglesias"],"pdf_url":"https://arxiv.org/pdf/2409.03889v1.pdf","comment":"16 pages in the manuscript with 11 page supplementary material"},{"id":"http://arxiv.org/abs/2409.03887v1","updated":"2024-09-05T19:50:26Z","published":"2024-09-05T19:50:26Z","title":"The Influence of Faulty Labels in Data Sets on Human Pose Estimation","summary":" In this study we provide empirical evidence demonstrating that the quality of\ntraining data impacts model performance in Human Pose Estimation (HPE).\nInaccurate labels in widely used data sets, ranging from minor errors to severe\nmislabeling, can negatively influence learning and distort performance metrics.\nWe perform an in-depth analysis of popular HPE data sets to show the extent and\nnature of label inaccuracies. Our findings suggest that accounting for the\nimpact of faulty labels will facilitate the development of more robust and\naccurate HPE models for a variety of real-world applications. We show improved\nperformance with cleansed data.\n","authors":["Arnold Schwarz","Levente Hernadi","Felix Bießmann","Kristian Hildebrand"],"pdf_url":"https://arxiv.org/pdf/2409.03887v1.pdf","comment":"15 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2409.03879v1","updated":"2024-09-05T19:36:02Z","published":"2024-09-05T19:36:02Z","title":"Multi-Camera Industrial Open-Set Person Re-Identification and Tracking","summary":" In recent years, the development of deep learning approaches for the task of\nperson re-identification led to impressive results. However, this comes with a\nlimitation for industrial and practical real-world applications. Firstly, most\nof the existing works operate on closed-world scenarios, in which the people to\nre-identify (probes) are compared to a closed-set (gallery). Real-world\nscenarios often are open-set problems in which the gallery is not known a\npriori, but the number of open-set approaches in the literature is\nsignificantly lower. Secondly, challenges such as multi-camera setups,\nocclusions, real-time requirements, etc., further constrain the applicability\nof off-the-shelf methods. This work presents MICRO-TRACK, a Modular Industrial\nmulti-Camera Re_identification and Open-set Tracking system that is real-time,\nscalable, and easy to integrate into existing industrial surveillance\nscenarios. Furthermore, we release a novel Re-ID and tracking dataset acquired\nin an industrial manufacturing facility, dubbed Facility-ReID, consisting of\n18-minute videos captured by 8 surveillance cameras.\n","authors":["Federico Cunico","Marco Cristani"],"pdf_url":"https://arxiv.org/pdf/2409.03879v1.pdf","comment":"Accepted at T-CAP workshop at ECCV 2024"},{"id":"http://arxiv.org/abs/2409.03878v1","updated":"2024-09-05T19:34:21Z","published":"2024-09-05T19:34:21Z","title":"Ground-roll Separation From Land Seismic Records Based on Convolutional\n Neural Network","summary":" Ground-roll wave is a common coherent noise in land field seismic data. 
This\nRayleigh-type surface wave usually has low frequency, low apparent velocity,\nand high amplitude, therefore obscures the reflection events of seismic shot\ngathers. Commonly used techniques focus on the differences of ground-roll and\nreflection in transformed domain such as $f-k$ domain, wavelet domain, or\ncurvelet domain. These approaches use a series of fixed atoms or bases to\ntransform the data in time-space domain into transformed domain to separate\ndifferent waveforms, thus tend to suffer from the complexity for a delicate\ndesign of the parameters of the transform domain filter. To deal with these\nproblems, a novel way is proposed to separate ground-roll from reflections\nusing convolutional neural network (CNN) model based method to learn to extract\nthe features of ground-roll and reflections automatically based on training\ndata. In the proposed method, low-pass filtered seismic data which is\ncontaminated by ground-roll wave is used as input of CNN, and then outputs both\nground-roll component and low-frequency part of reflection component\nsimultaneously. Discriminative loss is applied together with similarity loss in\nthe training process to enhance the similarity to their train labels as well as\nthe difference between the two outputs. Experiments are conducted on both\nsynthetic and real data, showing that CNN based method can separate ground roll\nfrom reflections effectively, and has generalization ability to a certain\nextent.\n","authors":["Zhuang Jia","Wenkai Lu","Meng Zhang","Yongkang Miao"],"pdf_url":"https://arxiv.org/pdf/2409.03878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03868v1","updated":"2024-09-05T19:10:29Z","published":"2024-09-05T19:10:29Z","title":"Few-shot Adaptation of Medical Vision-Language Models","summary":" Integrating image and text data through multi-modal learning has emerged as a\nnew approach in medical imaging research, following its successful deployment\nin computer vision. While considerable efforts have been dedicated to\nestablishing medical foundation models and their zero-shot transfer to\ndownstream tasks, the popular few-shot setting remains relatively unexplored.\nFollowing on from the currently strong emergence of this setting in computer\nvision, we introduce the first structured benchmark for adapting medical\nvision-language models (VLMs) in a strict few-shot regime and investigate\nvarious adaptation strategies commonly used in the context of natural images.\nFurthermore, we evaluate a simple generalization of the linear-probe adaptation\nbaseline, which seeks an optimal blending of the visual prototypes and text\nembeddings via learnable class-wise multipliers. Surprisingly, such a\ntext-informed linear probe yields competitive performances in comparison to\nconvoluted prompt-learning and adapter-based strategies, while running\nconsiderably faster and accommodating the black-box setting. Our extensive\nexperiments span three different medical modalities and specialized foundation\nmodels, nine downstream tasks, and several state-of-the-art few-shot adaptation\nmethods. 
We made our benchmark and code publicly available to trigger further\ndevelopments in this emergent subject:\n\\url{https://github.com/FereshteShakeri/few-shot-MedVLMs}.\n","authors":["Fereshteh Shakeri","Yunshi Huang","Julio Silva-Rodríguez","Houda Bahig","An Tang","Jose Dolz","Ismail Ben Ayed"],"pdf_url":"https://arxiv.org/pdf/2409.03868v1.pdf","comment":"MICCAI 2024 (Spotlight) - Code is available at\n https://github.com/FereshteShakeri/few-shot-MedVLMs.git"},{"id":"http://arxiv.org/abs/2212.11192v2","updated":"2024-09-05T18:17:51Z","published":"2022-12-21T17:08:58Z","title":"Continual Learning Approaches for Anomaly Detection","summary":" Anomaly Detection is a relevant problem that arises in numerous real-world\napplications, especially when dealing with images. However, there has been\nlittle research for this task in the Continual Learning setting. In this work,\nwe introduce a novel approach called SCALE (SCALing is Enough) to perform\nCompressed Replay in a framework for Anomaly Detection in Continual Learning\nsetting. The proposed technique scales and compresses the original images using\na Super Resolution model which, to the best of our knowledge, is studied for\nthe first time in the Continual Learning setting. SCALE can achieve a high\nlevel of compression while maintaining a high level of image reconstruction\nquality. In conjunction with other Anomaly Detection approaches, it can achieve\noptimal results. To validate the proposed approach, we use a real-world dataset\nof images with pixel-based anomalies, with the scope to provide a reliable\nbenchmark for Anomaly Detection in the context of Continual Learning, serving\nas a foundation for further advancements in the field.\n","authors":["Davide Dalle Pezze","Eugenia Anello","Chiara Masiero","Gian Antonio Susto"],"pdf_url":"https://arxiv.org/pdf/2212.11192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09948v2","updated":"2024-09-05T18:13:16Z","published":"2024-03-15T01:18:08Z","title":"RadCLIP: Enhancing Radiologic Image Analysis through Contrastive\n Language-Image Pre-training","summary":" The integration of artificial intelligence (AI) with radiology marks a\ntransformative era in medicine. Vision foundation models have been adopted to\nenhance radiologic imaging analysis. However, the distinct complexities of\nradiologic 2D and 3D radiologic data pose unique challenges that existing\nmodels, pre-trained on general non-medical images, fail to address adequately.\nTo bridge this gap and capitalize on the diagnostic precision required in\nradiologic imaging, we introduce Radiologic Contrastive Language-Image\nPre-training (RadCLIP): a cross-modal vision-language foundational model that\nharnesses Vision Language Pre-training (VLP) framework to improve radiologic\nimage analysis. Building upon Contrastive Language-Image Pre-training (CLIP),\nRadCLIP incorporates a slice pooling mechanism tailored for volumetric image\nanalysis and is pre-trained using a large and diverse dataset of radiologic\nimage-text pairs. The RadCLIP was pre-trained to effectively align radiologic\nimages with their corresponding text annotations, creating a robust vision\nbackbone for radiologic images. Extensive experiments demonstrate RadCLIP's\nsuperior performance in both uni-modal radiologic image classification and\ncross-modal image-text matching, highlighting its significant promise for\nimproving diagnostic accuracy and efficiency in clinical settings. 
Our Key\ncontributions include curating a large dataset with diverse radiologic 2D/3D\nradiologic image-text pairs, a slice pooling adapter using an attention\nmechanism for integrating 2D images, and comprehensive evaluations of RadCLIP\non various radiologic downstream tasks.\n","authors":["Zhixiu Lu","Hailong Li","Nehal A. Parikh","Jonathan R. Dillman","Lili He"],"pdf_url":"https://arxiv.org/pdf/2403.09948v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.03753v1","updated":"2024-09-05T17:59:15Z","published":"2024-09-05T17:59:15Z","title":"WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild","summary":" The increasing availability of real-world conversation data offers exciting\nopportunities for researchers to study user-chatbot interactions. However, the\nsheer volume of this data makes manually examining individual conversations\nimpractical. To overcome this challenge, we introduce WildVis, an interactive\ntool that enables fast, versatile, and large-scale conversation analysis.\nWildVis provides search and visualization capabilities in the text and\nembedding spaces based on a list of criteria. To manage million-scale datasets,\nwe implemented optimizations including search index construction, embedding\nprecomputation and compression, and caching to ensure responsive user\ninteractions within seconds. We demonstrate WildVis's utility through three\ncase studies: facilitating chatbot misuse research, visualizing and comparing\ntopic distributions across datasets, and characterizing user-specific\nconversation patterns. WildVis is open-source and designed to be extendable,\nsupporting additional datasets and customized search and visualization\nfunctionalities.\n","authors":["Yuntian Deng","Wenting Zhao","Jack Hessel","Xiang Ren","Claire Cardie","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2409.03753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03708v1","updated":"2024-09-05T17:14:23Z","published":"2024-09-05T17:14:23Z","title":"RAG based Question-Answering for Contextual Response Prediction System","summary":" Large Language Models (LLMs) have shown versatility in various Natural\nLanguage Processing (NLP) tasks, including their potential as effective\nquestion-answering systems. However, to provide precise and relevant\ninformation in response to specific customer queries in industry settings, LLMs\nrequire access to a comprehensive knowledge base to avoid hallucinations.\nRetrieval Augmented Generation (RAG) emerges as a promising technique to\naddress this challenge. Yet, developing an accurate question-answering\nframework for real-world applications using RAG entails several challenges: 1)\ndata availability issues, 2) evaluating the quality of generated content, and\n3) the costly nature of human evaluation. In this paper, we introduce an\nend-to-end framework that employs LLMs with RAG capabilities for industry use\ncases. Given a customer query, the proposed system retrieves relevant knowledge\ndocuments and leverages them, along with previous chat history, to generate\nresponse suggestions for customer service agents in the contact centers of a\nmajor retail company. Through comprehensive automated and human evaluations, we\nshow that this solution outperforms the current BERT-based algorithms in\naccuracy and relevance. 
Our findings suggest that RAG-based LLMs can be an\nexcellent support to human customer service representatives by lightening their\nworkload.\n","authors":["Sriram Veturi","Saurabh Vaichal","Nafis Irtiza Tripto","Reshma Lal Jagadheesh","Nian Yan"],"pdf_url":"https://arxiv.org/pdf/2409.03708v1.pdf","comment":"Accepted at the 1st Workshop on GenAI and RAG Systems for Enterprise,\n CIKM'24. 6 pages"},{"id":"http://arxiv.org/abs/2409.03504v1","updated":"2024-09-05T13:18:01Z","published":"2024-09-05T13:18:01Z","title":"HGAMN: Heterogeneous Graph Attention Matching Network for Multilingual\n POI Retrieval at Baidu Maps","summary":" The increasing interest in international travel has raised the demand of\nretrieving point of interests in multiple languages. This is even superior to\nfind local venues such as restaurants and scenic spots in unfamiliar languages\nwhen traveling abroad. Multilingual POI retrieval, enabling users to find\ndesired POIs in a demanded language using queries in numerous languages, has\nbecome an indispensable feature of today's global map applications such as\nBaidu Maps. This task is non-trivial because of two key challenges: (1)\nvisiting sparsity and (2) multilingual query-POI matching. To this end, we\npropose a Heterogeneous Graph Attention Matching Network (HGAMN) to\nconcurrently address both challenges. Specifically, we construct a\nheterogeneous graph that contains two types of nodes: POI node and query node\nusing the search logs of Baidu Maps. To alleviate challenge \\#1, we construct\nedges between different POI nodes to link the low-frequency POIs with the\nhigh-frequency ones, which enables the transfer of knowledge from the latter to\nthe former. To mitigate challenge \\#2, we construct edges between POI and query\nnodes based on the co-occurrences between queries and POIs, where queries in\ndifferent languages and formulations can be aggregated for individual POIs.\nMoreover, we develop an attention-based network to jointly learn node\nrepresentations of the heterogeneous graph and further design a cross-attention\nmodule to fuse the representations of both types of nodes for query-POI\nrelevance scoring. Extensive experiments conducted on large-scale real-world\ndatasets from Baidu Maps demonstrate the superiority and effectiveness of\nHGAMN. In addition, HGAMN has already been deployed in production at Baidu\nMaps, and it successfully keeps serving hundreds of millions of requests every\nday.\n","authors":["Jizhou Huang","Haifeng Wang","Yibo Sun","Miao Fan","Zhengjie Huang","Chunyuan Yuan","Yawen Li"],"pdf_url":"https://arxiv.org/pdf/2409.03504v1.pdf","comment":"Accepted by KDD'21"},{"id":"http://arxiv.org/abs/2409.03449v1","updated":"2024-09-05T11:56:40Z","published":"2024-09-05T11:56:40Z","title":"MOBIUS: Towards the Next Generation of Query-Ad Matching in Baidu's\n Sponsored Search","summary":" Baidu runs the largest commercial web search engine in China, serving\nhundreds of millions of online users every day in response to a great variety\nof queries. In order to build a high-efficiency sponsored search engine, we\nused to adopt a three-layer funnel-shaped structure to screen and sort hundreds\nof ads from billions of ad candidates subject to the requirement of low\nresponse latency and the restraints of computing resources. 
Given a user query,\nthe top matching layer is responsible for providing semantically relevant ad\ncandidates to the next layer, while the ranking layer at the bottom concerns\nmore about business indicators (e.g., CPM, ROI, etc.) of those ads. The clear\nseparation between the matching and ranking objectives results in a lower\ncommercial return. The Mobius project has been established to address this\nserious issue. It is our first attempt to train the matching layer to consider\nCPM as an additional optimization objective besides the query-ad relevance, via\ndirectly predicting CTR (click-through rate) from billions of query-ad pairs.\nSpecifically, this paper will elaborate on how we adopt active learning to\novercome the insufficiency of click history at the matching layer when training\nour neural click networks offline, and how we use the SOTA ANN search technique\nfor retrieving ads more efficiently (Here ``ANN'' stands for approximate\nnearest neighbor search). We contribute the solutions to Mobius-V1 as the first\nversion of our next generation query-ad matching system.\n","authors":["Miao Fan","Jiacheng Guo","Shuai Zhu","Shuo Miao","Mingming Sun","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2409.03449v1.pdf","comment":"Accepted by KDD'19"},{"id":"http://arxiv.org/abs/2409.02727v2","updated":"2024-09-05T07:17:59Z","published":"2024-09-04T14:01:48Z","title":"Pooling And Attention: What Are Effective Designs For LLM-Based\n Embedding Models?","summary":" The significant advancements of Large Language Models (LLMs) in generative\ntasks have led to a growing body of work exploring LLM-based embedding models.\nWhile these models, employing different pooling and attention strategies, have\nachieved state-of-the-art performance on public embedding benchmarks, questions\nstill arise about what constitutes an effective design for LLM-based embedding\nmodels. However, these models are often trained on different datasets, using\ndifferent LLM base models or training settings. Moreover, evaluations on public\nembedding benchmarks often fail to report statistical significance, making it\ndifficult to determine which designs truly contribute to final performance.\nThis complicates the process for practitioners seeking optimal training recipes\nfor LLM-based embedding models. In this study, we conduct a large-scale\nexperiment by training a series of LLM-based embedding models using the same\ntraining data and base model but differing in their pooling and attention\nstrategies. The results show that there is no one-size-fits-all solution: while\nbidirectional attention and an additional trainable pooling layer outperform in\ntext similarity and information retrieval tasks, they do not significantly\nsurpass simpler designs like EOS-last token pooling and default causal\nattention in clustering and classification tasks. Furthermore, we propose a new\npooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs\nof all hidden layers, rather than just the last layer, using a cross-attention\nnetwork. This method proves to be statistically superior in text similarity and\nretrieval tasks compared to existing pooling methods. 
Overall, this paper sheds\nlight on effective training strategies for LLM-based embedding models.\n","authors":["Yixuan Tang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2409.02727v2.pdf","comment":"https://github.com/yixuantt/PoolingAndAttn"},{"id":"http://arxiv.org/abs/2409.03294v1","updated":"2024-09-05T06:59:56Z","published":"2024-09-05T06:59:56Z","title":"Federated Prototype-based Contrastive Learning for Privacy-Preserving\n Cross-domain Recommendation","summary":" Cross-domain recommendation (CDR) aims to improve recommendation accuracy in\nsparse domains by transferring knowledge from data-rich domains. However,\nexisting CDR methods often assume the availability of user-item interaction\ndata across domains, overlooking user privacy concerns. Furthermore, these\nmethods suffer from performance degradation in scenarios with sparse\noverlapping users, as they typically depend on a large number of fully shared\nusers for effective knowledge transfer. To address these challenges, we propose\na Federated Prototype-based Contrastive Learning (CL) method for\nPrivacy-Preserving CDR, named FedPCL-CDR. This approach utilizes\nnon-overlapping user information and prototypes to improve multi-domain\nperformance while protecting user privacy. FedPCL-CDR comprises two modules:\nlocal domain (client) learning and global server aggregation. In the local\ndomain, FedPCL-CDR clusters all user data to learn representative prototypes,\neffectively utilizing non-overlapping user information and addressing the\nsparse overlapping user issue. It then facilitates knowledge transfer by\nemploying both local and global prototypes returned from the server in a CL\nmanner. Simultaneously, the global server aggregates representative prototypes\nfrom local domains to learn both local and global prototypes. The combination\nof prototypes and federated learning (FL) ensures that sensitive user data\nremains decentralized, with only prototypes being shared across domains,\nthereby protecting user privacy. Extensive experiments on four CDR tasks using\ntwo real-world datasets demonstrate that FedPCL-CDR outperforms the\nstate-of-the-art baselines.\n","authors":["Li Wang","Quangui Zhang","Lei Sang","Qiang Wu","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2409.03294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03284v1","updated":"2024-09-05T06:49:14Z","published":"2024-09-05T06:49:14Z","title":"iText2KG: Incremental Knowledge Graphs Construction Using Large Language\n Models","summary":" Most available data is unstructured, making it challenging to access valuable\ninformation. Automatically building Knowledge Graphs (KGs) is crucial for\nstructuring data and making it accessible, allowing users to search for\ninformation effectively. KGs also facilitate insights, inference, and\nreasoning. Traditional NLP methods, such as named entity recognition and\nrelation extraction, are key in information retrieval but face limitations,\nincluding the use of predefined entity types and the need for supervised\nlearning. Current research leverages large language models' capabilities, such\nas zero- or few-shot learning. However, unresolved and semantically duplicated\nentities and relations still pose challenges, leading to inconsistent graphs\nand requiring extensive post-processing. Additionally, most approaches are\ntopic-dependent. In this paper, we propose iText2KG, a method for incremental,\ntopic-independent KG construction without post-processing. 
This plug-and-play,\nzero-shot method is applicable across a wide range of KG construction scenarios\nand comprises four modules: Document Distiller, Incremental Entity Extractor,\nIncremental Relation Extractor, and Graph Integrator and Visualization. Our\nmethod demonstrates superior performance compared to baseline methods across\nthree scenarios: converting scientific papers to graphs, websites to graphs,\nand CVs to graphs.\n","authors":["Yassir Lairgi","Ludovic Moncla","Rémy Cazabet","Khalid Benabdeslem","Pierre Cléau"],"pdf_url":"https://arxiv.org/pdf/2409.03284v1.pdf","comment":"Accepted at The International Web Information Systems Engineering\n conference (the WISE conference) 2024"},{"id":"http://arxiv.org/abs/2409.03140v1","updated":"2024-09-05T00:25:37Z","published":"2024-09-05T00:25:37Z","title":"GraphEx: A Graph-based Extraction Method for Advertiser Keyphrase\n Recommendation","summary":" Online sellers and advertisers are recommended keyphrases for their listed\nproducts, which they bid on to enhance their sales. One popular paradigm that\ngenerates such recommendations is Extreme Multi-Label Classification (XMC),\nwhich involves tagging/mapping keyphrases to items. We outline the limitations\nof using traditional item-query based tagging or mapping techniques for\nkeyphrase recommendations on E-Commerce platforms. We introduce GraphEx, an\ninnovative graph-based approach that recommends keyphrases to sellers using\nextraction of token permutations from item titles. Additionally, we demonstrate\nthat relying on traditional metrics such as precision/recall can be misleading\nin practical applications, thereby necessitating a combination of metrics to\nevaluate performance in real-world scenarios. These metrics are designed to\nassess the relevance of keyphrases to items and the potential for buyer\noutreach. GraphEx outperforms production models at eBay, achieving the\nobjectives mentioned above. It supports near real-time inferencing in\nresource-constrained production environments and scales effectively for\nbillions of items.\n","authors":["Ashirbad Mishra","Soumik Dey","Marshall Wu","Jinyu Zhao","He Yu","Kaichen Ni","Binbin Li","Kamesh Madduri"],"pdf_url":"https://arxiv.org/pdf/2409.03140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03928v1","updated":"2024-09-05T22:22:57Z","published":"2024-09-05T22:22:57Z","title":"RETAIN: Interactive Tool for Regression Testing Guided LLM Migration","summary":" Large Language Models (LLMs) are increasingly integrated into diverse\napplications. The rapid evolution of LLMs presents opportunities for developers\nto enhance applications continuously. However, this constant adaptation can\nalso lead to performance regressions during model migrations. While several\ninteractive tools have been proposed to streamline the complexity of prompt\nengineering, few address the specific requirements of regression testing for\nLLM Migrations. To bridge this gap, we introduce RETAIN (REgression Testing\nguided LLM migrAtIoN), a tool designed explicitly for regression testing in LLM\nMigrations. RETAIN comprises two key components: an interactive interface\ntailored to regression testing needs during LLM migrations, and an error\ndiscovery module that facilitates understanding of differences in model\nbehaviors. The error discovery module generates textual descriptions of various\nerrors or differences between model outputs, providing actionable insights for\nprompt refinement. 
Our automatic evaluation and empirical user studies\ndemonstrate that RETAIN, when compared to manual evaluation, enabled\nparticipants to identify twice as many errors, facilitated experimentation with\n75% more prompts, and achieved 12% higher metric scores in a given time frame.\n","authors":["Tanay Dixit","Daniel Lee","Sally Fang","Sai Sree Harsha","Anirudh Sureshan","Akash Maharaj","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2409.03928v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2406.12580v2","updated":"2024-09-05T21:29:32Z","published":"2024-06-18T13:06:58Z","title":"Behavior-Dependent Linear Recurrent Units for Efficient Sequential\n Recommendation","summary":" Sequential recommender systems aim to predict the users' next interaction\nthrough user behavior modeling with various operators like RNNs and attentions.\nHowever, existing models generally fail to achieve the three golden principles\nfor sequential recommendation simultaneously, i.e., training efficiency,\nlow-cost inference, and strong performance. To this end, we propose RecBLR, an\nEfficient Sequential Recommendation Model based on Behavior-Dependent Linear\nRecurrent Units to accomplish the impossible triangle of the three principles.\nBy incorporating gating mechanisms and behavior-dependent designs into linear\nrecurrent units, our model significantly enhances user behavior modeling and\nrecommendation performance. Furthermore, we unlock the parallelizable training\nas well as inference efficiency for our model by designing a hardware-aware\nscanning acceleration algorithm with a customized CUDA kernel. Extensive\nexperiments on real-world datasets with varying lengths of user behavior\nsequences demonstrate RecBLR's remarkable effectiveness in simultaneously\nachieving all three golden principles - strong recommendation performance,\ntraining efficiency, and low-cost inference, while exhibiting excellent\nscalability to datasets with long user interaction histories.\n","authors":["Chengkai Liu","Jianghao Lin","Hanzhou Liu","Jianling Wang","James Caverlee"],"pdf_url":"https://arxiv.org/pdf/2406.12580v2.pdf","comment":"Accepted to CIKM 2024"},{"id":"http://arxiv.org/abs/2409.03893v1","updated":"2024-09-05T19:59:42Z","published":"2024-09-05T19:59:42Z","title":"Understanding Fairness Metrics in Recommender Systems: A Healthcare\n Perspective","summary":" Fairness in AI-driven decision-making systems has become a critical concern,\nespecially when these systems directly affect human lives. This paper explores\nthe public's comprehension of fairness in healthcare recommendations. We\nconducted a survey where participants selected from four fairness metrics --\nDemographic Parity, Equal Accuracy, Equalized Odds, and Positive Predictive\nValue -- across different healthcare scenarios to assess their understanding of\nthese concepts. Our findings reveal that fairness is a complex and often\nmisunderstood concept, with a generally low level of public understanding\nregarding fairness metrics in recommender systems. This study highlights the\nneed for enhanced information and education on algorithmic fairness to support\ninformed decision-making in using these systems. 
Furthermore, the results\nsuggest that a one-size-fits-all approach to fairness may be insufficient,\npointing to the importance of context-sensitive designs in developing equitable\nAI systems.\n","authors":["Veronica Kecki","Alan Said"],"pdf_url":"https://arxiv.org/pdf/2409.03893v1.pdf","comment":"Accepted to the 18th ACM Conference on Recommender Systems"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.03757v1","updated":"2024-09-05T17:59:56Z","published":"2024-09-05T17:59:56Z","title":"Lexicon3D: Probing Visual Foundation Models for Complex 3D Scene\n Understanding","summary":" Complex 3D scene understanding has gained increasing attention, with scene\nencoding strategies playing a crucial role in this success. However, the\noptimal scene encoding strategies for various scenarios remain unclear,\nparticularly compared to their image-based counterparts. To address this issue,\nwe present a comprehensive study that probes various visual encoding models for\n3D scene understanding, identifying the strengths and limitations of each model\nacross different scenarios. Our evaluation spans seven vision foundation\nencoders, including image-based, video-based, and 3D foundation models. We\nevaluate these models in four tasks: Vision-Language Scene Reasoning, Visual\nGrounding, Segmentation, and Registration, each focusing on different aspects\nof scene understanding. Our evaluations yield key findings: DINOv2 demonstrates\nsuperior performance, video models excel in object-level tasks, diffusion\nmodels benefit geometric tasks, and language-pretrained models show unexpected\nlimitations in language-related tasks. These insights challenge some\nconventional understandings, provide novel perspectives on leveraging visual\nfoundation models, and highlight the need for more flexible encoder selection\nin future vision-language and scene-understanding tasks.\n","authors":["Yunze Man","Shuhong Zheng","Zhipeng Bao","Martial Hebert","Liang-Yan Gui","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2409.03757v1.pdf","comment":"Project page: https://yunzeman.github.io/lexicon3d , Github:\n https://github.com/YunzeMan/Lexicon3D"},{"id":"http://arxiv.org/abs/2409.03753v1","updated":"2024-09-05T17:59:15Z","published":"2024-09-05T17:59:15Z","title":"WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild","summary":" The increasing availability of real-world conversation data offers exciting\nopportunities for researchers to study user-chatbot interactions. However, the\nsheer volume of this data makes manually examining individual conversations\nimpractical. To overcome this challenge, we introduce WildVis, an interactive\ntool that enables fast, versatile, and large-scale conversation analysis.\nWildVis provides search and visualization capabilities in the text and\nembedding spaces based on a list of criteria. To manage million-scale datasets,\nwe implemented optimizations including search index construction, embedding\nprecomputation and compression, and caching to ensure responsive user\ninteractions within seconds. We demonstrate WildVis's utility through three\ncase studies: facilitating chatbot misuse research, visualizing and comparing\ntopic distributions across datasets, and characterizing user-specific\nconversation patterns. 
WildVis is open-source and designed to be extendable,\nsupporting additional datasets and customized search and visualization\nfunctionalities.\n","authors":["Yuntian Deng","Wenting Zhao","Jack Hessel","Xiang Ren","Claire Cardie","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2409.03753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03749v1","updated":"2024-09-05T17:58:28Z","published":"2024-09-05T17:58:28Z","title":"Dynamics of Supervised and Reinforcement Learning in the Non-Linear\n Perceptron","summary":" The ability of a brain or a neural network to efficiently learn depends\ncrucially on both the task structure and the learning rule. Previous works have\nanalyzed the dynamical equations describing learning in the relatively\nsimplified context of the perceptron under assumptions of a student-teacher\nframework or a linearized output. While these assumptions have facilitated\ntheoretical understanding, they have precluded a detailed understanding of the\nroles of the nonlinearity and input-data distribution in determining the\nlearning dynamics, limiting the applicability of the theories to real\nbiological or artificial neural networks. Here, we use a stochastic-process\napproach to derive flow equations describing learning, applying this framework\nto the case of a nonlinear perceptron performing binary classification. We\ncharacterize the effects of the learning rule (supervised or reinforcement\nlearning, SL/RL) and input-data distribution on the perceptron's learning curve\nand the forgetting curve as subsequent tasks are learned. In particular, we\nfind that the input-data noise differently affects the learning speed under SL\nvs. RL, as well as determines how quickly learning of a task is overwritten by\nsubsequent learning. Additionally, we verify our approach with real data using\nthe MNIST dataset. This approach points a way toward analyzing learning\ndynamics for more-complex circuit architectures.\n","authors":["Christian Schmid","James M. Murray"],"pdf_url":"https://arxiv.org/pdf/2409.03749v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03741v1","updated":"2024-09-05T17:54:26Z","published":"2024-09-05T17:54:26Z","title":"Understanding Data Importance in Machine Learning Attacks: Does Valuable\n Data Pose Greater Harm?","summary":" Machine learning has revolutionized numerous domains, playing a crucial role\nin driving advancements and enabling data-centric processes. The significance\nof data in training models and shaping their performance cannot be overstated.\nRecent research has highlighted the heterogeneous impact of individual data\nsamples, particularly the presence of valuable data that significantly\ncontributes to the utility and effectiveness of machine learning models.\nHowever, a critical question remains unanswered: are these valuable data\nsamples more vulnerable to machine learning attacks? In this work, we\ninvestigate the relationship between data importance and machine learning\nattacks by analyzing five distinct attack types. Our findings reveal notable\ninsights. For example, we observe that high importance data samples exhibit\nincreased vulnerability in certain attacks, such as membership inference and\nmodel stealing. By analyzing the linkage between membership inference\nvulnerability and data importance, we demonstrate that sample characteristics\ncan be integrated into membership metrics by introducing sample-specific\ncriteria, therefore enhancing the membership inference performance. 
These\nfindings emphasize the urgent need for innovative defense mechanisms that\nstrike a balance between maximizing utility and safeguarding valuable data\nagainst potential exploitation.\n","authors":["Rui Wen","Michael Backes","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03741v1.pdf","comment":"To Appear in Network and Distributed System Security (NDSS) Symposium\n 2025"},{"id":"http://arxiv.org/abs/2409.03740v1","updated":"2024-09-05T17:53:54Z","published":"2024-09-05T17:53:54Z","title":"Differentiable Discrete Event Simulation for Queuing Network Control","summary":" Queuing network control is essential for managing congestion in\njob-processing systems such as service systems, communication networks, and\nmanufacturing processes. Despite growing interest in applying reinforcement\nlearning (RL) techniques, queueing network control poses distinct challenges,\nincluding high stochasticity, large state and action spaces, and lack of\nstability. To tackle these challenges, we propose a scalable framework for\npolicy optimization based on differentiable discrete event simulation. Our main\ninsight is that by implementing a well-designed smoothing technique for\ndiscrete event dynamics, we can compute pathwise policy gradients for\nlarge-scale queueing networks using auto-differentiation software (e.g.,\nTensorflow, PyTorch) and GPU parallelization. Through extensive empirical\nexperiments, we observe that our policy gradient estimators are several orders\nof magnitude more accurate than typical REINFORCE-based estimators. In\naddition, We propose a new policy architecture, which drastically improves\nstability while maintaining the flexibility of neural-network policies. In a\nwide variety of scheduling and admission control tasks, we demonstrate that\ntraining control policies with pathwise gradients leads to a 50-1000x\nimprovement in sample efficiency over state-of-the-art RL methods. Unlike prior\ntailored approaches to queueing, our methods can flexibly handle realistic\nscenarios, including systems operating in non-stationary environments and those\nwith non-exponential interarrival/service times.\n","authors":["Ethan Che","Jing Dong","Hongseok Namkoong"],"pdf_url":"https://arxiv.org/pdf/2409.03740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03735v1","updated":"2024-09-05T17:50:31Z","published":"2024-09-05T17:50:31Z","title":"LLM-CI: Assessing Contextual Integrity Norms in Language Models","summary":" Large language models (LLMs), while memorizing parts of their training data\nscraped from the Internet, may also inadvertently encode societal preferences\nand norms. As these models are integrated into sociotechnical systems, it is\ncrucial that the norms they encode align with societal expectations. These\nnorms could vary across models, hyperparameters, optimization techniques, and\ndatasets. This is especially challenging due to prompt sensitivity$-$small\nvariations in prompts yield different responses, rendering existing assessment\nmethodologies unreliable. There is a need for a comprehensive framework\ncovering various models, optimization, and datasets, along with a reliable\nmethodology to assess encoded norms.\n We present LLM-CI, the first open-sourced framework to assess privacy norms\nencoded in LLMs. LLM-CI uses a Contextual Integrity-based factorial vignette\nmethodology to assess the encoded norms across different contexts and LLMs. 
We\npropose the multi-prompt assessment methodology to address prompt sensitivity\nby assessing the norms from only the prompts that yield consistent responses\nacross multiple variants. Using LLM-CI and our proposed methodology, we\ncomprehensively evaluate LLMs using IoT and COPPA vignettes datasets from prior\nwork, examining the impact of model properties (e.g., hyperparameters,\ncapacity) and optimization strategies (e.g., alignment, quantization).\n","authors":["Yan Shvartzshnaider","Vasisht Duddu","John Lacalamita"],"pdf_url":"https://arxiv.org/pdf/2409.03735v1.pdf","comment":"20 pages, 8 Figures, 4 Tables"},{"id":"http://arxiv.org/abs/2409.03734v1","updated":"2024-09-05T17:45:01Z","published":"2024-09-05T17:45:01Z","title":"Safety vs. Performance: How Multi-Objective Learning Reduces Barriers to\n Market Entry","summary":" Emerging marketplaces for large language models and other large-scale machine\nlearning (ML) models appear to exhibit market concentration, which has raised\nconcerns about whether there are insurmountable barriers to entry in such\nmarkets. In this work, we study this issue from both an economic and an\nalgorithmic point of view, focusing on a phenomenon that reduces barriers to\nentry. Specifically, an incumbent company risks reputational damage unless its\nmodel is sufficiently aligned with safety objectives, whereas a new company can\nmore easily avoid reputational damage. To study this issue formally, we define\na multi-objective high-dimensional regression framework that captures\nreputational damage, and we characterize the number of data points that a new\ncompany needs to enter the market. Our results demonstrate how multi-objective\nconsiderations can fundamentally reduce barriers to entry -- the required\nnumber of data points can be significantly smaller than the incumbent company's\ndataset size. En route to proving these results, we develop scaling laws for\nhigh-dimensional linear regression in multi-objective environments, showing\nthat the scaling rate becomes slower when the dataset size is large, which\ncould be of independent interest.\n","authors":["Meena Jagadeesan","Michael I. Jordan","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2409.03734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03733v1","updated":"2024-09-05T17:44:49Z","published":"2024-09-05T17:44:49Z","title":"Planning In Natural Language Improves LLM Search For Code Generation","summary":" While scaling training compute has led to remarkable improvements in large\nlanguage models (LLMs), scaling inference compute has not yet yielded analogous\ngains. We hypothesize that a core missing component is a lack of diverse LLM\noutputs, leading to inefficient search due to models repeatedly sampling highly\nsimilar, yet incorrect generations. We empirically demonstrate that this lack\nof diversity can be mitigated by searching over candidate plans for solving a\nproblem in natural language. Based on this insight, we propose PLANSEARCH, a\nnovel search algorithm which shows strong results across HumanEval+, MBPP+, and\nLiveCodeBench (a contamination-free benchmark for competitive coding).\nPLANSEARCH generates a diverse set of observations about the problem and then\nuses these observations to construct plans for solving the problem. By\nsearching over plans in natural language rather than directly over code\nsolutions, PLANSEARCH explores a significantly more diverse range of potential\nsolutions compared to baseline search methods. 
Using PLANSEARCH on top of\nClaude 3.5 Sonnet achieves a state-of-the-art pass@200 of 77.0% on\nLiveCodeBench, outperforming both the best score achieved without search\n(pass@1 = 41.4%) and using standard repeated sampling (pass@200 = 60.6%).\nFinally, we show that, across all models, search algorithms, and benchmarks\nanalyzed, we can accurately predict performance gains due to search as a direct\nfunction of the diversity over generated ideas.\n","authors":["Evan Wang","Federico Cassano","Catherine Wu","Yunfeng Bai","Will Song","Vaskar Nath","Ziwen Han","Sean Hendryx","Summer Yue","Hugh Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03731v1","updated":"2024-09-05T17:42:19Z","published":"2024-09-05T17:42:19Z","title":"A Deep Generative Learning Approach for Two-stage Adaptive Robust\n Optimization","summary":" Two-stage adaptive robust optimization is a powerful approach for planning\nunder uncertainty that aims to balance costs of \"here-and-now\" first-stage\ndecisions with those of \"wait-and-see\" recourse decisions made after\nuncertainty is realized. To embed robustness against uncertainty, modelers\ntypically assume a simple polyhedral or ellipsoidal set over which\ncontingencies may be realized. However, these simple uncertainty sets tend to\nyield highly conservative decision-making when uncertainties are\nhigh-dimensional. In this work, we introduce AGRO, a column-and-constraint\ngeneration algorithm that performs adversarial generation for two-stage\nadaptive robust optimization using a variational autoencoder. AGRO identifies\nrealistic and cost-maximizing contingencies by optimizing over spherical\nuncertainty sets in a latent space using a projected gradient ascent approach\nthat differentiates the optimal recourse cost with respect to the latent\nvariable. To demonstrate the cost- and time-efficiency of our approach\nexperimentally, we apply AGRO to an adaptive robust capacity expansion problem\nfor a regional power system and show that AGRO is able to reduce costs by up to\n7.8% and runtimes by up to 77% in comparison to the conventional\ncolumn-and-constraint generation algorithm.\n","authors":["Aron Brenner","Rahman Khorramfar","Jennifer Sun","Saurabh Amin"],"pdf_url":"https://arxiv.org/pdf/2409.03731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12113v2","updated":"2024-09-05T17:01:33Z","published":"2024-07-16T18:51:24Z","title":"A Graph-based Adversarial Imitation Learning Framework for Reliable &\n Realtime Fleet Scheduling in Urban Air Mobility","summary":" The advent of Urban Air Mobility (UAM) presents the scope for a\ntransformative shift in the domain of urban transportation. However, its\nwidespread adoption and economic viability depends in part on the ability to\noptimally schedule the fleet of aircraft across vertiports in a UAM network,\nunder uncertainties attributed to airspace congestion, changing weather\nconditions, and varying demands. This paper presents a comprehensive\noptimization formulation of the fleet scheduling problem, while also\nidentifying the need for alternate solution approaches, since directly solving\nthe resulting integer nonlinear programming problem is computationally\nprohibitive for daily fleet scheduling. Previous work has shown the\neffectiveness of using (graph) reinforcement learning (RL) approaches to train\nreal-time executable policy models for fleet scheduling. 
However, such policies\ncan often be brittle on out-of-distribution scenarios or edge cases. Moreover,\ntraining performance also deteriorates as the complexity (e.g., number of\nconstraints) of the problem increases. To address these issues, this paper\npresents an imitation learning approach where the RL-based policy exploits\nexpert demonstrations yielded by solving the exact optimization using a Genetic\nAlgorithm. The policy model comprises Graph Neural Network (GNN) based encoders\nthat embed the space of vertiports and aircraft, Transformer networks to encode\ndemand, passenger fare, and transport cost profiles, and a Multi-head attention\n(MHA) based decoder. Expert demonstrations are used through the Generative\nAdversarial Imitation Learning (GAIL) algorithm. Interfaced with a UAM\nsimulation environment involving 8 vertiports and 40 aircrafts, in terms of the\ndaily profits earned reward, the new imitative approach achieves better mean\nperformance and remarkable improvement in the case of unseen worst-case\nscenarios, compared to pure RL results.\n","authors":["Prithvi Poddar","Steve Paul","Souma Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2407.12113v2.pdf","comment":"Presented at the AIAA Aviation Forum 2024"},{"id":"http://arxiv.org/abs/2409.03703v1","updated":"2024-09-05T16:59:56Z","published":"2024-09-05T16:59:56Z","title":"Iterative thresholding for non-linear learning in the strong\n $\\varepsilon$-contamination model","summary":" We derive approximation bounds for learning single neuron models using\nthresholded gradient descent when both the labels and the covariates are\npossibly corrupted adversarially. We assume the data follows the model $y =\n\\sigma(\\mathbf{w}^{*} \\cdot \\mathbf{x}) + \\xi,$ where $\\sigma$ is a nonlinear\nactivation function, the noise $\\xi$ is Gaussian, and the covariate vector\n$\\mathbf{x}$ is sampled from a sub-Gaussian distribution. We study sigmoidal,\nleaky-ReLU, and ReLU activation functions and derive a\n$O(\\nu\\sqrt{\\epsilon\\log(1/\\epsilon)})$ approximation bound in $\\ell_{2}$-norm,\nwith sample complexity $O(d/\\epsilon)$ and failure probability\n$e^{-\\Omega(d)}$.\n We also study the linear regression problem, where $\\sigma(\\mathbf{x}) =\n\\mathbf{x}$. We derive a $O(\\nu\\epsilon\\log(1/\\epsilon))$ approximation bound,\nimproving upon the previous $O(\\nu)$ approximation bounds for the\ngradient-descent based iterative thresholding algorithms of Bhatia et al.\n(NeurIPS 2015) and Shen and Sanghavi (ICML 2019). Our algorithm has a\n$O(\\textrm{polylog}(N,d)\\log(R/\\epsilon))$ runtime complexity when\n$\\|\\mathbf{w}^{*}\\|_2 \\leq R$, improving upon the\n$O(\\text{polylog}(N,d)/\\epsilon^2)$ runtime complexity of Awasthi et al.\n(NeurIPS 2022).\n","authors":["Arvind Rathnashyam","Alex Gittens"],"pdf_url":"https://arxiv.org/pdf/2409.03703v1.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2407.06099v2","updated":"2024-09-05T16:59:17Z","published":"2024-07-08T16:38:52Z","title":"Physics-Informed Machine Learning Towards A Real-Time Spacecraft Thermal\n Simulator","summary":" Modeling thermal states for complex space missions, such as the surface\nexploration of airless bodies, requires high computation, whether used in\nground-based analysis for spacecraft design or during onboard reasoning for\nautonomous operations. 
For example, a finite-element thermal model with\nhundreds of elements can take significant time to simulate, which makes it\nunsuitable for onboard reasoning during time-sensitive scenarios such as\ndescent and landing, proximity operations, or in-space assembly. Further, the\nlack of fast and accurate thermal modeling drives thermal designs to be more\nconservative and leads to spacecraft with larger mass and higher power budgets.\nThe emerging paradigm of physics-informed machine learning (PIML) presents a\nclass of hybrid modeling architectures that address this challenge by combining\nsimplified physics models with machine learning (ML) models resulting in models\nwhich maintain both interpretability and robustness. Such techniques enable\ndesigns with reduced mass and power through onboard thermal-state estimation\nand control and may lead to improved onboard handling of off-nominal states,\nincluding unplanned down-time. The PIML model or hybrid model presented here\nconsists of a neural network which predicts reduced nodalizations (distribution\nand size of coarse mesh) given on-orbit thermal load conditions, and\nsubsequently a (relatively coarse) finite-difference model operates on this\nmesh to predict thermal states. We compare the computational performance and\naccuracy of the hybrid model to a data-driven neural net model, and a\nhigh-fidelity finite-difference model of a prototype Earth-orbiting small\nspacecraft. The PIML based active nodalization approach provides significantly\nbetter generalization than the neural net model and coarse mesh model, while\nreducing computing cost by up to 1.7x compared to the high-fidelity model.\n","authors":["Manaswin Oddiraju","Zaki Hasnain","Saptarshi Bandyopadhyay","Eric Sunada","Souma Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2407.06099v2.pdf","comment":"Presented at the AIAA Aviation 2024 Forum"},{"id":"http://arxiv.org/abs/2409.03697v1","updated":"2024-09-05T16:52:20Z","published":"2024-09-05T16:52:20Z","title":"Classification and Prediction of Heart Diseases using Machine Learning\n Algorithms","summary":" Heart disease is a serious worldwide health issue because it claims the lives\nof many people who might have been treated if the disease had been identified\nearlier. The leading cause of death in the world is cardiovascular disease,\nusually referred to as heart disease. Creating reliable, effective, and precise\npredictions for these diseases is one of the biggest issues facing the medical\nworld today. Although there are tools for predicting heart diseases, they are\neither expensive or challenging to apply for determining a patient's risk. The\nbest classifier for foretelling and spotting heart disease was the aim of this\nresearch. This experiment examined a range of machine learning approaches,\nincluding Logistic Regression, K-Nearest Neighbor, Support Vector Machine, and\nArtificial Neural Networks, to determine which machine learning algorithm was\nmost effective at predicting heart diseases. One of the most often utilized\ndata sets for this purpose, the UCI heart disease repository provided the data\nset for this study. The K-Nearest Neighbor technique was shown to be the most\neffective machine learning algorithm for determining whether a patient has\nheart disease. 
It will be beneficial to conduct further studies on the\napplication of additional machine learning algorithms for heart disease\nprediction.\n","authors":["Akua Sekyiwaa Osei-Nkwantabisa","Redeemer Ntumy"],"pdf_url":"https://arxiv.org/pdf/2409.03697v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.02115v2","updated":"2024-09-05T16:47:30Z","published":"2024-08-30T06:27:25Z","title":"Deep Neural Implicit Representation of Accessibility for Multi-Axis\n Manufacturing","summary":" One of the main concerns in design and process planning for multi-axis\nadditive and subtractive manufacturing is collision avoidance between moving\nobjects (e.g., tool assemblies) and stationary objects (e.g., a part unified\nwith fixtures). The collision measure for various pairs of relative rigid\ntranslations and rotations between the two pointsets can be conceptualized by a\ncompactly supported scalar field over the 6D non-Euclidean configuration space.\nExplicit representation and computation of this field is costly in both time\nand space. If we fix $O(m)$ sparsely sampled rotations (e.g., tool\norientations), computation of the collision measure field as a convolution of\nindicator functions of the 3D pointsets over a uniform grid (i.e., voxelized\ngeometry) of resolution $O(n^3)$ via fast Fourier transforms (FFTs) scales as\nin $O(mn^3 \\log n)$ in time and $O(mn^3)$ in space. In this paper, we develop\nan implicit representation of the collision measure field via deep neural\nnetworks (DNNs). We show that our approach is able to accurately interpolate\nthe collision measure from a sparse sampling of rotations, and can represent\nthe collision measure field with a small memory footprint. Moreover, we show\nthat this representation can be efficiently updated through fine-tuning to more\nefficiently train the network on multi-resolution data, as well as accommodate\nincremental changes to the geometry (such as might occur in iterative processes\nsuch as topology optimization of the part subject to CNC tool accessibility\nconstraints).\n","authors":["George P. Harabin","Amir Mirzendehdel","Morad Behandish"],"pdf_url":"https://arxiv.org/pdf/2409.02115v2.pdf","comment":"Special Issue on symposium on Solid and Physical Modeling (SPM 2023)"},{"id":"http://arxiv.org/abs/2407.16607v3","updated":"2024-09-05T16:39:44Z","published":"2024-07-23T16:13:22Z","title":"Data Mixture Inference: What do BPE Tokenizers Reveal about their\n Training Data?","summary":" The pretraining data of today's strongest language models is opaque; in\nparticular, little is known about the proportions of various domains or\nlanguages represented. In this work, we tackle a task which we call data\nmixture inference, which aims to uncover the distributional make-up of training\ndata. We introduce a novel attack based on a previously overlooked source of\ninformation: byte-pair encoding (BPE) tokenizers, used by the vast majority of\nmodern language models. Our key insight is that the ordered list of merge rules\nlearned by a BPE tokenizer naturally reveals information about the token\nfrequencies in its training data. Given a tokenizer's merge list along with\nexample data for each category of interest, we formulate a linear program that\nsolves for the proportion of each category in the tokenizer's training set. In\ncontrolled experiments, we show that our attack recovers mixture ratios with\nhigh precision for tokenizers trained on known mixtures of natural languages,\nprogramming languages, and data sources. 
We then apply our approach to\noff-the-shelf tokenizers released with recent LMs. We confirm much publicly\ndisclosed information about these models, and also make several new inferences:\nGPT-4o and Mistral NeMo's tokenizers are much more multilingual than their\npredecessors, training on 39% and 47% non-English language data, respectively;\nLlama 3 extends GPT-3.5's tokenizer primarily for multilingual (48%) use;\nGPT-3.5's and Claude's tokenizers are trained on predominantly code (~60%). We\nhope our work sheds light on current design practices for pretraining data, and\ninspires continued research into data mixture inference for LMs.\n","authors":["Jonathan Hayase","Alisa Liu","Yejin Choi","Sewoong Oh","Noah A. Smith"],"pdf_url":"https://arxiv.org/pdf/2407.16607v3.pdf","comment":"new robustness experiments; new baselines; include Mistral,\n Mistral-Nemo and GPT-NeoX; link to code"},{"id":"http://arxiv.org/abs/2409.03685v1","updated":"2024-09-05T16:39:21Z","published":"2024-09-05T16:39:21Z","title":"View-Invariant Policy Learning via Zero-Shot Novel View Synthesis","summary":" Large-scale visuomotor policy learning is a promising approach toward\ndeveloping generalizable manipulation systems. Yet, policies that can be\ndeployed on diverse embodiments, environments, and observational modalities\nremain elusive. In this work, we investigate how knowledge from large-scale\nvisual data of the world may be used to address one axis of variation for\ngeneralizable manipulation: observational viewpoint. Specifically, we study\nsingle-image novel view synthesis models, which learn 3D-aware scene-level\npriors by rendering images of the same scene from alternate camera viewpoints\ngiven a single input image. For practical application to diverse robotic data,\nthese models must operate zero-shot, performing view synthesis on unseen tasks\nand environments. We empirically analyze view synthesis models within a simple\ndata-augmentation scheme that we call View Synthesis Augmentation (VISTA) to\nunderstand their capabilities for learning viewpoint-invariant policies from\nsingle-viewpoint demonstration data. Upon evaluating the robustness of policies\ntrained with our method to out-of-distribution camera viewpoints, we find that\nthey outperform baselines in both simulated and real-world manipulation tasks.\nVideos and additional visualizations are available at\nhttps://s-tian.github.io/projects/vista.\n","authors":["Stephen Tian","Blake Wulfe","Kyle Sargent","Katherine Liu","Sergey Zakharov","Vitor Guizilini","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2409.03685v1.pdf","comment":"Accepted to CoRL 2024"},{"id":"http://arxiv.org/abs/2409.03684v1","updated":"2024-09-05T16:39:13Z","published":"2024-09-05T16:39:13Z","title":"Predicting quantum channels over general product distributions","summary":" We investigate the problem of predicting the output behavior of unknown\nquantum channels. Given query access to an $n$-qubit channel $E$ and an\nobservable $O$, we aim to learn the mapping \\begin{equation*}\n \\rho \\mapsto \\mathrm{Tr}(O E[\\rho]) \\end{equation*} to within a small error\nfor most $\\rho$ sampled from a distribution $D$. Previously, Huang, Chen, and\nPreskill proved a surprising result that even if $E$ is arbitrary, this task\ncan be solved in time roughly $n^{O(\\log(1/\\epsilon))}$, where $\\epsilon$ is\nthe target prediction error. 
However, their guarantee applied only to input\ndistributions $D$ invariant under all single-qubit Clifford gates, and their\nalgorithm fails for important cases such as general product distributions over\nproduct states $\\rho$.\n In this work, we propose a new approach that achieves accurate prediction\nover essentially any product distribution $D$, provided it is not \"classical\"\nin which case there is a trivial exponential lower bound. Our method employs a\n\"biased Pauli analysis,\" analogous to classical biased Fourier analysis.\nImplementing this approach requires overcoming several challenges unique to the\nquantum setting, including the lack of a basis with appropriate orthogonality\nproperties. The techniques we develop to address these issues may have broader\napplications in quantum information.\n","authors":["Sitan Chen","Jaume de Dios Pont","Jun-Ting Hsieh","Hsin-Yuan Huang","Jane Lange","Jerry Li"],"pdf_url":"https://arxiv.org/pdf/2409.03684v1.pdf","comment":"20 pages, comments welcome"},{"id":"http://arxiv.org/abs/2409.03682v1","updated":"2024-09-05T16:37:26Z","published":"2024-09-05T16:37:26Z","title":"A New First-Order Meta-Learning Algorithm with Convergence Guarantees","summary":" Learning new tasks by drawing on prior experience gathered from other\n(related) tasks is a core property of any intelligent system. Gradient-based\nmeta-learning, especially MAML and its variants, has emerged as a viable\nsolution to accomplish this goal. One problem MAML encounters is its\ncomputational and memory burdens needed to compute the meta-gradients. We\npropose a new first-order variant of MAML that we prove converges to a\nstationary point of the MAML objective, unlike other first-order variants. We\nalso show that the MAML objective does not satisfy the smoothness assumption\nassumed in previous works; we show instead that its smoothness constant grows\nwith the norm of the meta-gradient, which theoretically suggests the use of\nnormalized or clipped-gradient methods compared to the plain gradient method\nused in previous works. We validate our theory on a synthetic experiment.\n","authors":["El Mahdi Chayti","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2409.03682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17399v2","updated":"2024-09-05T16:34:02Z","published":"2024-04-26T13:21:30Z","title":"Evaluations of Machine Learning Privacy Defenses are Misleading","summary":" Empirical defenses for machine learning privacy forgo the provable guarantees\nof differential privacy in the hope of achieving higher utility while resisting\nrealistic adversaries. We identify severe pitfalls in existing empirical\nprivacy evaluations (based on membership inference attacks) that result in\nmisleading conclusions. In particular, we show that prior evaluations fail to\ncharacterize the privacy leakage of the most vulnerable samples, use weak\nattacks, and avoid comparisons with practical differential privacy baselines.\nIn 5 case studies of empirical privacy defenses, we find that prior evaluations\nunderestimate privacy leakage by an order of magnitude. 
Under our stronger\nevaluation, none of the empirical defenses we study are competitive with a\nproperly tuned, high-utility DP-SGD baseline (with vacuous provable\nguarantees).\n","authors":["Michael Aerni","Jie Zhang","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2404.17399v2.pdf","comment":"Accepted at ACM CCS 2024"},{"id":"http://arxiv.org/abs/2409.03674v1","updated":"2024-09-05T16:27:16Z","published":"2024-09-05T16:27:16Z","title":"Practical Forecasting of Cryptocoins Timeseries using Correlation\n Patterns","summary":" Cryptocoins (i.e., Bitcoin, Ether, Litecoin) are tradable digital assets.\nOwnerships of cryptocoins are registered on distributed ledgers (i.e.,\nblockchains). Secure encryption techniques guarantee the security of the\ntransactions (transfers of coins among owners), registered into the ledger.\nCryptocoins are exchanged for specific trading prices. The extreme volatility\nof such trading prices across all different sets of crypto-assets remains\nundisputed. However, the relations between the trading prices across different\ncryptocoins remains largely unexplored. Major coin exchanges indicate trend\ncorrelation to advise for sells or buys. However, price correlations remain\nlargely unexplored. We shed some light on the trend correlations across a large\nvariety of cryptocoins, by investigating their coin/price correlation trends\nover the past two years. We study the causality between the trends, and exploit\nthe derived correlations to understand the accuracy of state-of-the-art\nforecasting techniques for time series modeling (e.g., GBMs, LSTM and GRU) of\ncorrelated cryptocoins. Our evaluation shows (i) strong correlation patterns\nbetween the most traded coins (e.g., Bitcoin and Ether) and other types of\ncryptocurrencies, and (ii) state-of-the-art time series forecasting algorithms\ncan be used to forecast cryptocoins price trends. We released datasets and code\nto reproduce our analysis to the research community.\n","authors":["Pasquale De Rosa","Pascal Felber","Valerio Schiavoni"],"pdf_url":"https://arxiv.org/pdf/2409.03674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03672v1","updated":"2024-09-05T16:25:30Z","published":"2024-09-05T16:25:30Z","title":"Wind turbine condition monitoring based on intra- and inter-farm\n federated learning","summary":" As wind energy adoption is growing, ensuring the efficient operation and\nmaintenance of wind turbines becomes essential for maximizing energy production\nand minimizing costs and downtime. Many AI applications in wind energy, such as\nin condition monitoring and power forecasting, may benefit from using\noperational data not only from individual wind turbines but from multiple\nturbines and multiple wind farms. Collaborative distributed AI which preserves\ndata privacy holds a strong potential for these applications. Federated\nlearning has emerged as a privacy-preserving distributed machine learning\napproach in this context. We explore federated learning in wind turbine\ncondition monitoring, specifically for fault detection using normal behaviour\nmodels. We investigate various federated learning strategies, including\ncollaboration across different wind farms and turbine models, as well as\ncollaboration restricted to the same wind farm and turbine model. Our case\nstudy results indicate that federated learning across multiple wind turbines\nconsistently outperforms models trained on a single turbine, especially when\ntraining data is scarce. 
Moreover, the amount of historical data necessary to\ntrain an effective model can be significantly reduced by employing a\ncollaborative federated learning strategy. Finally, our findings show that\nextending the collaboration to multiple wind farms may result in inferior\nperformance compared to restricting learning within a farm, specifically when\nfaced with statistical heterogeneity and imbalanced datasets.\n","authors":["Albin Grataloup","Stefan Jonas","Angela Meyer"],"pdf_url":"https://arxiv.org/pdf/2409.03672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03669v1","updated":"2024-09-05T16:23:07Z","published":"2024-09-05T16:23:07Z","title":"A method to benchmark high-dimensional process drift detection","summary":" Process curves are multi-variate finite time series data coming from\nmanufacturing processes. This paper studies machine learning methods for detecting drifts\nin process curves. A theoretical framework to synthetically generate process\ncurves in a controlled way is introduced in order to benchmark machine learning\nalgorithms for process drift detection. An evaluation score, called the temporal\narea under the curve, is introduced, which allows one to quantify how well machine\nlearning models unveil curves belonging to drift segments. Finally, a benchmark\nstudy comparing popular machine learning approaches on synthetic data generated\nwith the introduced framework is shown.\n","authors":["Edgar Wolf","Tobias Windisch"],"pdf_url":"https://arxiv.org/pdf/2409.03669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03668v1","updated":"2024-09-05T16:22:31Z","published":"2024-09-05T16:22:31Z","title":"A Fused Large Language Model for Predicting Startup Success","summary":" Investors are continuously seeking profitable investment opportunities in\nstartups and, hence, for effective decision-making, need to predict a startup's\nprobability of success. Nowadays, investors can use not only various\nfundamental information about a startup (e.g., the age of the startup, the\nnumber of founders, and the business sector) but also textual description of a\nstartup's innovation and business model, which is widely available through\nonline venture capital (VC) platforms such as Crunchbase. To support the\ndecision-making of investors, we develop a machine learning approach with the\naim of locating successful startups on VC platforms. Specifically, we develop,\ntrain, and evaluate a tailored, fused large language model to predict startup\nsuccess. Thereby, we assess to what extent self-descriptions on VC platforms\nare predictive of startup success. Using 20,172 online profiles from\nCrunchbase, we find that our fused large language model can predict startup\nsuccess, with textual self-descriptions being responsible for a significant\npart of the predictive power. Our work provides a decision support tool for\ninvestors to find profitable investment opportunities.\n","authors":["Abdurahman Maarouf","Stefan Feuerriegel","Nicolas Pröllochs"],"pdf_url":"https://arxiv.org/pdf/2409.03668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03667v1","updated":"2024-09-05T16:21:20Z","published":"2024-09-05T16:21:20Z","title":"Threat Classification on Deployed Optical Networks Using MIMO Digital\n Fiber Sensing, Wavelets, and Machine Learning","summary":" We demonstrate mechanical threat classification, including jackhammers and\nexcavators, leveraging wavelet transform of MIMO-DFS output data across a 57-km\noperational network link. 
Our machine learning framework incorporates transfer\nlearning and shows 93% classification accuracy from field data, with benefits\nfor optical network supervision.\n","authors":["Khouloud Abdelli","Henrique Pavani","Christian Dorize","Sterenn Guerrier","Haik Mardoyan","Patricia Layec","Jeremie Renaudier"],"pdf_url":"https://arxiv.org/pdf/2409.03667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03663v1","updated":"2024-09-05T16:15:52Z","published":"2024-09-05T16:15:52Z","title":"Weather-Adaptive Multi-Step Forecasting of State of Polarization Changes\n in Aerial Fibers Using Wavelet Neural Networks","summary":" We introduce a novel weather-adaptive approach for multi-step forecasting of\nmulti-scale SOP changes in aerial fiber links. By harnessing the discrete\nwavelet transform and incorporating weather data, our approach improves\nforecasting accuracy by over 65% in RMSE and 63% in MAPE compared to baselines.\n","authors":["Khouloud Abdelli","Matteo Lonardi","Jurgen Gripp","Samuel Olsson Fabien Boitier","Patricia Layec"],"pdf_url":"https://arxiv.org/pdf/2409.03663v1.pdf","comment":"ECOC 2024"},{"id":"http://arxiv.org/abs/2409.03662v1","updated":"2024-09-05T16:15:12Z","published":"2024-09-05T16:15:12Z","title":"The representation landscape of few-shot learning and fine-tuning in\n large language models","summary":" In-context learning (ICL) and supervised fine-tuning (SFT) are two common\nstrategies for improving the performance of modern large language models (LLMs)\non specific tasks. Despite their different natures, these strategies often lead\nto comparable performance gains. However, little is known about whether they\ninduce similar representations inside LLMs. We approach this problem by\nanalyzing the probability landscape of their hidden representations in the two\ncases. More specifically, we compare how LLMs solve the same question-answering\ntask, finding that ICL and SFT create very different internal structures, in\nboth cases undergoing a sharp transition in the middle of the network. In the\nfirst half of the network, ICL shapes interpretable representations\nhierarchically organized according to their semantic content. In contrast, the\nprobability landscape obtained with SFT is fuzzier and semantically mixed. In\nthe second half of the model, the fine-tuned representations develop\nprobability modes that better encode the identity of answers, while the\nlandscape of ICL representations is characterized by less defined peaks. Our\napproach reveals the diverse computational strategies developed inside LLMs to\nsolve the same task across different conditions, allowing us to make a step\ntowards designing optimal methods to extract information from language models.\n","authors":["Diego Doimo","Alessandro Serra","Alessio Ansuini","Alberto Cazzaniga"],"pdf_url":"https://arxiv.org/pdf/2409.03662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03658v1","updated":"2024-09-05T16:11:40Z","published":"2024-09-05T16:11:40Z","title":"A DNN Biophysics Model with Topological and Electrostatic Features","summary":" In this project, we provide a deep-learning neural network (DNN) based\nbiophysics model to predict protein properties. 
The model uses multi-scale and\nuniform topological and electrostatic features generated with protein\nstructural information and force field, which governs the molecular mechanics.\nThe topological features are generated using the element-specified persistent\nhomology (ESPH) while the electrostatic features are rapidly computed using a\nCartesian treecode. These features are uniform in number for proteins of\nvarious sizes, so the broadly available protein structure database can be used\nin training the network. These features are also multi-scale, so the\nresolution and computational cost can be balanced by the users. The machine\nlearning simulation on over 4000 protein structures shows the efficiency and\nfidelity of these features in representing the protein structure and force\nfield for the prediction of their biophysical properties such as electrostatic\nsolvation energy. Tests on topological or electrostatic features alone and the\ncombination of both showed the optimal performance when both features are used.\nThis model shows its potential as a general tool in assisting biophysical\nproperty and function prediction for a broad range of biomolecules using data from\nboth theoretical computing and experiments.\n","authors":["Elyssa Sliheet","Md Abu Talha","Weihua Geng"],"pdf_url":"https://arxiv.org/pdf/2409.03658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03657v1","updated":"2024-09-05T16:11:36Z","published":"2024-09-05T16:11:36Z","title":"Unsupervised Anomaly Detection and Localization with Generative\n Adversarial Networks","summary":" We propose a novel unsupervised anomaly detection approach using generative\nadversarial networks and SOP-derived spectrograms. Demonstrating remarkable\nefficacy, our method achieves over 97% accuracy on SOP datasets from both\nsubmarine and terrestrial fiber links, all achieved without the need for\nlabelled data.\n","authors":["Khouloud Abdelli","Matteo Lonardi","Jurgen Gripp","Samuel Olsson","Fabien Boitier","Patricia Layec"],"pdf_url":"https://arxiv.org/pdf/2409.03657v1.pdf","comment":"ECOC 2024"},{"id":"http://arxiv.org/abs/2409.03655v1","updated":"2024-09-05T16:10:31Z","published":"2024-09-05T16:10:31Z","title":"Privacy versus Emotion Preservation Trade-offs in Emotion-Preserving\n Speaker Anonymization","summary":" Advances in speech technology now allow unprecedented access to personally\nidentifiable information through speech. To protect such information, the\ndifferential privacy field has explored ways to anonymize speech while\npreserving its utility, including linguistic and paralinguistic aspects.\nHowever, anonymizing speech while maintaining emotional state remains\nchallenging. We explore this problem in the context of the VoicePrivacy 2024\nchallenge. Specifically, we developed various speaker anonymization pipelines\nand find that approaches either excel at anonymization or preserving emotion\nstate, but not both simultaneously. Achieving both would require an in-domain\nemotion recognizer. 
Additionally, we found that it is feasible to train a\nsemi-effective speaker verification system using only emotion representations,\ndemonstrating the challenge of separating these two modalities.\n","authors":["Zexin Cai","Henry Li Xinyuan","Ashi Garg","Leibny Paola García-Perera","Kevin Duh","Sanjeev Khudanpur","Nicholas Andrews","Matthew Wiesner"],"pdf_url":"https://arxiv.org/pdf/2409.03655v1.pdf","comment":"accepted by 2024 IEEE Spoken Language Technology Workshop"},{"id":"http://arxiv.org/abs/2409.03650v1","updated":"2024-09-05T16:08:19Z","published":"2024-09-05T16:08:19Z","title":"On the Limited Generalization Capability of the Implicit Reward Model\n Induced by Direct Preference Optimization","summary":" Reinforcement Learning from Human Feedback (RLHF) is an effective approach\nfor aligning language models to human preferences. Central to RLHF is learning\na reward function for scoring human preferences. Two main approaches for\nlearning a reward model are 1) training an EXplicit Reward Model (EXRM) as in\nRLHF, and 2) using an implicit reward learned from preference data through\nmethods such as Direct Preference Optimization (DPO). Prior work has shown that\nthe implicit reward model of DPO (denoted as DPORM) can approximate an EXRM in\nthe limit. DPORM's effectiveness directly implies the optimality of the learned\npolicy, and also has practical implication for LLM alignment methods including\niterative DPO. However, it is unclear how well DPORM empirically matches the\nperformance of EXRM. This work studies the accuracy at distinguishing preferred\nand rejected answers for both DPORM and EXRM. Our findings indicate that even\nthough DPORM fits the training dataset comparably, it generalizes less\neffectively than EXRM, especially when the validation datasets contain\ndistribution shifts. Across five out-of-distribution settings, DPORM has a mean\ndrop in accuracy of 3% and a maximum drop of 7%. These findings highlight that\nDPORM has limited generalization ability and substantiates the integration of\nan explicit reward model in iterative DPO approaches.\n","authors":["Yong Lin","Skyler Seto","Maartje ter Hoeve","Katherine Metcalf","Barry-John Theobald","Xuan Wang","Yizhe Zhang","Chen Huang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03650v1.pdf","comment":"12 pages, 8 tables, 2 figures"},{"id":"http://arxiv.org/abs/2409.03646v1","updated":"2024-09-05T16:04:57Z","published":"2024-09-05T16:04:57Z","title":"Limited but consistent gains in adversarial robustness by co-training\n object recognition models with human EEG","summary":" In contrast to human vision, artificial neural networks (ANNs) remain\nrelatively susceptible to adversarial attacks. To address this vulnerability,\nefforts have been made to transfer inductive bias from human brains to ANNs,\noften by training the ANN representations to match their biological\ncounterparts. Previous works relied on brain data acquired in rodents or\nprimates using invasive techniques, from specific regions of the brain, under\nnon-natural conditions (anesthetized animals), and with stimulus datasets\nlacking diversity and naturalness. In this work, we explored whether aligning\nmodel representations to human EEG responses to a rich set of real-world images\nincreases robustness to ANNs. Specifically, we trained ResNet50-backbone models\non a dual task of classification and EEG prediction; and evaluated their EEG\nprediction accuracy and robustness to adversarial attacks. 
We observed\nsignificant correlation between the networks' EEG prediction accuracy, often\nhighest around 100 ms post stimulus onset, and their gains in adversarial\nrobustness. Although effect size was limited, effects were consistent across\ndifferent random initializations and robust for architectural variants. We\nfurther teased apart the data from individual EEG channels and observed\nstrongest contribution from electrodes in the parieto-occipital regions. The\ndemonstrated utility of human EEG for such tasks opens up avenues for future\nefforts that scale to larger datasets under diverse stimuli conditions with the\npromise of stronger effects.\n","authors":["Manshan Guo","Bhavin Choksi","Sari Sadiya","Alessandro T. Gifford","Martina G. Vilas","Radoslaw M. Cichy","Gemma Roig"],"pdf_url":"https://arxiv.org/pdf/2409.03646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17622v2","updated":"2024-09-05T15:52:47Z","published":"2024-07-24T20:28:03Z","title":"Towards Neural Network based Cognitive Models of Dynamic Decision-Making\n by Humans","summary":" Modeling human cognitive processes in dynamic decision-making tasks has been\nan endeavor in AI for a long time because such models can help make AI systems\nmore intuitive, personalized, mitigate any human biases, and enhance training\nin simulation. Some initial work has attempted to utilize neural networks (and\nlarge language models) but often assumes one common model for all humans and\naims to emulate human behavior in aggregate. However, the behavior of each\nhuman is distinct, heterogeneous, and relies on specific past experiences in\ncertain tasks. For instance, consider two individuals responding to a phishing\nemail: one who has previously encountered and identified similar threats may\nrecognize it quickly, while another without such experience might fall for the\nscam. In this work, we build on Instance Based Learning (IBL) that posits that\nhuman decisions are based on similar situations encountered in the past.\nHowever, IBL relies on simple fixed form functions to capture the mapping from\npast situations to current decisions. To that end, we propose two new\nattention-based neural network models to have open form non-linear functions to\nmodel distinct and heterogeneous human decision-making in dynamic settings. We\nexperiment with two distinct datasets gathered from human subject experiment\ndata, one focusing on detection of phishing email by humans and another where\nhumans act as attackers in a cybersecurity setting and decide on an attack\noption. We conducted extensive experiments with our two neural network models,\nIBL, and GPT3.5, and demonstrate that the neural network models outperform IBL\nsignificantly in representing human decision-making, while providing similar\ninterpretability of human decisions as IBL. 
Overall, our work yields promising\nresults for further use of neural networks in cognitive modeling of human\ndecision making.\n","authors":["Changyu Chen","Shashank Reddy Chirra","Maria José Ferreira","Cleotilde Gonzalez","Arunesh Sinha","Pradeep Varakantham"],"pdf_url":"https://arxiv.org/pdf/2407.17622v2.pdf","comment":"Our code is available at https://github.com/shshnkreddy/NCM-HDM"},{"id":"http://arxiv.org/abs/2408.10468v4","updated":"2024-09-05T15:47:45Z","published":"2024-08-20T00:40:49Z","title":"Tracing Privacy Leakage of Language Models to Training Data via Adjusted\n Influence Functions","summary":" The responses generated by Large Language Models (LLMs) can include sensitive\ninformation from individuals and organizations, leading to potential privacy\nleakage. This work implements Influence Functions (IFs) to trace privacy\nleakage back to the training data, thereby mitigating privacy concerns of\nLanguage Models (LMs). However, we notice that current IFs struggle to\naccurately estimate the influence of tokens with large gradient norms,\npotentially overestimating their influence. When tracing the most influential\nsamples, this leads to frequently tracing back to samples with large gradient\nnorm tokens, overshadowing the actual most influential samples even if their\ninfluences are well estimated. To address this issue, we propose Heuristically\nAdjusted IF (HAIF), which reduces the weight of tokens with large gradient\nnorms, thereby significantly improving the accuracy of tracing the most\ninfluential samples. To establish easily obtained ground truth for tracing\nprivacy leakage, we construct two datasets, PII-E and PII-CR, representing two\ndistinct scenarios: one with identical text in the model outputs and\npre-training data, and the other where models leverage their reasoning\nabilities to generate text divergent from pre-training data. HAIF significantly\nimproves tracing accuracy, enhancing it by 20.96% to 73.71% on the PII-E\ndataset and 3.21% to 45.93% on the PII-CR dataset, compared to the best SOTA\nIFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs\non real-world pretraining data CLUECorpus2020, demonstrating strong robustness\nregardless of prompt and response lengths.\n","authors":["Jinxin Liu","Zao Yang"],"pdf_url":"https://arxiv.org/pdf/2408.10468v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03632v1","updated":"2024-09-05T15:47:04Z","published":"2024-09-05T15:47:04Z","title":"Beyond Model Interpretability: Socio-Structural Explanations in Machine\n Learning","summary":" What is it to interpret the outputs of an opaque machine learning model? One\napproach is to develop interpretable machine learning techniques. These\ntechniques aim to show how machine learning models function by providing either\nmodel-centric local or global explanations, which can be based on mechanistic\ninterpretations revealing the inner working mechanisms of models or\nnonmechanistic approximations showing input feature output data relationships.\nIn this paper, we draw on social philosophy to argue that interpreting machine\nlearning outputs in certain normatively salient domains could require appealing\nto a third type of explanation that we call sociostructural explanation. The\nrelevance of this explanation type is motivated by the fact that machine\nlearning models are not isolated entities but are embedded within and shaped by\nsocial structures. 
Sociostructural explanations aim to illustrate how social\nstructures contribute to and partially explain the outputs of machine learning\nmodels. We demonstrate the importance of sociostructural explanations by\nexamining a racially biased healthcare allocation algorithm. Our proposal\nhighlights the need for transparency beyond model interpretability,\nunderstanding the outputs of machine learning systems could require a broader\nanalysis that extends beyond the understanding of the machine learning model\nitself.\n","authors":["Andrew Smart","Atoosa Kasirzadeh"],"pdf_url":"https://arxiv.org/pdf/2409.03632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02066v2","updated":"2024-09-05T15:35:53Z","published":"2024-09-03T17:13:55Z","title":"Robust Clustering on High-Dimensional Data with Stochastic Quantization","summary":" This paper addresses the limitations of traditional vector quantization\n(clustering) algorithms, particularly K-Means and its variant K-Means++, and\nexplores the Stochastic Quantization (SQ) algorithm as a scalable alternative\nfor high-dimensional unsupervised and semi-supervised learning problems. Some\ntraditional clustering algorithms suffer from inefficient memory utilization\nduring computation, necessitating the loading of all data samples into memory,\nwhich becomes impractical for large-scale datasets. While variants such as\nMini-Batch K-Means partially mitigate this issue by reducing memory usage, they\nlack robust theoretical convergence guarantees due to the non-convex nature of\nclustering problems. In contrast, the Stochastic Quantization algorithm\nprovides strong theoretical convergence guarantees, making it a robust\nalternative for clustering tasks. We demonstrate the computational efficiency\nand rapid convergence of the algorithm on an image classification problem with\npartially labeled data, comparing model accuracy across various ratios of\nlabeled to unlabeled data. To address the challenge of high dimensionality, we\ntrained Triplet Network to encode images into low-dimensional representations\nin a latent space, which serve as a basis for comparing the efficiency of both\nthe Stochastic Quantization algorithm and traditional quantization algorithms.\nFurthermore, we enhance the algorithm's convergence speed by introducing\nmodifications with an adaptive learning rate.\n","authors":["Anton Kozyriev","Vladimir Norkin"],"pdf_url":"https://arxiv.org/pdf/2409.02066v2.pdf","comment":"20 pages, 5 figures, to be published in the International Scientific\n Technical Journal \"Problems of Control and Informatics\""},{"id":"http://arxiv.org/abs/2409.00046v2","updated":"2024-09-05T15:24:24Z","published":"2024-08-19T11:50:23Z","title":"Rethinking Molecular Design: Integrating Latent Variable and\n Auto-Regressive Models for Goal Directed Generation","summary":" De novo molecule design has become a highly active research area, advanced\nsignificantly through the use of state-of-the-art generative models. Despite\nthese advances, several fundamental questions remain unanswered as the field\nincreasingly focuses on more complex generative models and sophisticated\nmolecular representations as an answer to the challenges of drug design. In\nthis paper, we return to the simplest representation of molecules, and\ninvestigate overlooked limitations of classical generative approaches,\nparticularly Variational Autoencoders (VAEs) and auto-regressive models. 
We\npropose a hybrid model in the form of a novel regularizer that leverages the\nstrengths of both to improve validity, conditional generation, and style\ntransfer of molecular sequences. Additionally, we provide an in depth\ndiscussion of overlooked assumptions of these models' behaviour.\n","authors":["Heath Arthur-Loui","Amina Mollaysa","Michael Krauthammer"],"pdf_url":"https://arxiv.org/pdf/2409.00046v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04823v5","updated":"2024-09-05T15:22:55Z","published":"2023-02-09T18:21:29Z","title":"Hierarchical Generative Adversarial Imitation Learning with Mid-level\n Input Generation for Autonomous Driving on Urban Environments","summary":" Deriving robust control policies for realistic urban navigation scenarios is\nnot a trivial task. In an end-to-end approach, these policies must map\nhigh-dimensional images from the vehicle's cameras to low-level actions such as\nsteering and throttle. While pure Reinforcement Learning (RL) approaches are\nbased exclusively on engineered rewards, Generative Adversarial Imitation\nLearning (GAIL) agents learn from expert demonstrations while interacting with\nthe environment, which favors GAIL on tasks for which a reward signal is\ndifficult to derive, such as autonomous driving. However, training deep\nnetworks directly from raw images on RL tasks is known to be unstable and\ntroublesome. To deal with that, this work proposes a hierarchical GAIL-based\narchitecture (hGAIL) which decouples representation learning from the driving\ntask to solve the autonomous navigation of a vehicle. The proposed architecture\nconsists of two modules: a GAN (Generative Adversarial Net) which generates an\nabstract mid-level input representation, which is the Bird's-Eye View (BEV)\nfrom the surroundings of the vehicle; and the GAIL which learns to control the\nvehicle based on the BEV predictions from the GAN as input. hGAIL is able to\nlearn both the policy and the mid-level representation simultaneously as the\nagent interacts with the environment. Our experiments made in the CARLA\nsimulation environment have shown that GAIL exclusively from cameras (without\nBEV) fails to even learn the task, while hGAIL, after training exclusively on\none city, was able to autonomously navigate successfully in 98% of the\nintersections of a new city not used in training phase. Videos and code\navailable at: https://sites.google.com/view/hgail\n","authors":["Gustavo Claudio Karl Couto","Eric Aislan Antonelo"],"pdf_url":"https://arxiv.org/pdf/2302.04823v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03618v1","updated":"2024-09-05T15:22:39Z","published":"2024-09-05T15:22:39Z","title":"DART2: a robust multiple testing method to smartly leverage helpful or\n misleading ancillary information","summary":" In many applications of multiple testing, ancillary information is available,\nreflecting the hypothesis null or alternative status. Several methods have been\ndeveloped to leverage this ancillary information to enhance testing power,\ntypically requiring the ancillary information is helpful enough to ensure\nfavorable performance. In this paper, we develop a robust and effective\ndistance-assisted multiple testing procedure named DART2, designed to be\npowerful and robust regardless of the quality of ancillary information. 
When\nthe ancillary information is helpful, DART2 can asymptotically control FDR\nwhile improving power; otherwise, DART2 can still control FDR and maintain\npower at least as high as ignoring the ancillary information. We demonstrated\nDART2's superior performance compared to existing methods through numerical\nstudies under various settings. In addition, DART2 has been applied to a gene\nassociation study where we have shown its superior accuracy and robustness\nunder two different types of ancillary information.\n","authors":["Xuechan Li","Jichun Xie"],"pdf_url":"https://arxiv.org/pdf/2409.03618v1.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.03614v1","updated":"2024-09-05T15:18:44Z","published":"2024-09-05T15:18:44Z","title":"1 Modular Parallel Manipulator for Long-Term Soft Robotic Data\n Collection","summary":" Performing long-term experimentation or large-scale data collection for\nmachine learning in the field of soft robotics is challenging, due to the\nhardware robustness and experimental flexibility required. In this work, we\npropose a modular parallel robotic manipulation platform suitable for such\nlarge-scale data collection and compatible with various soft-robotic\nfabrication methods. Considering the computational and theoretical difficulty\nof replicating the high-fidelity, faster-than-real-time simulations that enable\nlarge-scale data collection in rigid robotic systems, a robust soft-robotic\nhardware platform becomes a high priority development task for the field.\n The platform's modules consist of a pair of off-the-shelf electrical motors\nwhich actuate a customizable finger consisting of a compliant parallel\nstructure. The parallel mechanism of the finger can be as simple as a single\n3D-printed urethane or molded silicone bulk structure, due to the motors being\nable to fully actuate a passive structure. This design flexibility allows\nexperimentation with soft mechanism varied geometries, bulk properties and\nsurface properties. Additionally, while the parallel mechanism does not require\nseparate electronics or additional parts, these can be included, and it can be\nconstructed using multi-functional soft materials to study compatible soft\nsensors and actuators in the learning process. In this work, we validate the\nplatform's ability to be used for policy gradient reinforcement learning\ndirectly on hardware in a benchmark 2D manipulation task. We additionally\ndemonstrate compatibility with multiple fingers and characterize the design\nconstraints for compatible extensions.\n","authors":["Kiyn Chin","Carmel Majidi","Abhinav Gupta"],"pdf_url":"https://arxiv.org/pdf/2409.03614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03612v1","updated":"2024-09-05T15:17:26Z","published":"2024-09-05T15:17:26Z","title":"VFLGAN-TS: Vertical Federated Learning-based Generative Adversarial\n Networks for Publication of Vertically Partitioned Time-Series Data","summary":" In the current artificial intelligence (AI) era, the scale and quality of the\ndataset play a crucial role in training a high-quality AI model. However, often\noriginal data cannot be shared due to privacy concerns and regulations. A\npotential solution is to release a synthetic dataset with a similar\ndistribution to the private dataset. Nevertheless, in some scenarios, the\nattributes required to train an AI model are distributed among different\nparties, and the parties cannot share the local data for synthetic data\nconstruction due to privacy regulations. 
In PETS 2024, we recently introduced\nthe first Vertical Federated Learning-based Generative Adversarial Network\n(VFLGAN) for publishing vertically partitioned static data. However, VFLGAN\ncannot effectively handle time-series data, presenting both temporal and\nattribute dimensions. In this article, we proposed VFLGAN-TS, which combines\nthe ideas of attribute discriminator and vertical federated learning to\ngenerate synthetic time-series data in the vertically partitioned scenario. The\nperformance of VFLGAN-TS is close to that of its counterpart, which is trained\nin a centralized manner and represents the upper limit for VFLGAN-TS. To\nfurther protect privacy, we apply a Gaussian mechanism to make VFLGAN-TS\nsatisfy an $(\\epsilon,\\delta)$-differential privacy. Besides, we develop an\nenhanced privacy auditing scheme to evaluate the potential privacy breach\nthrough the framework of VFLGAN-TS and synthetic datasets.\n","authors":["Xun Yuan","Zilong Zhao","Prosanta Gope","Biplab Sikdar"],"pdf_url":"https://arxiv.org/pdf/2409.03612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16917v3","updated":"2024-09-05T15:14:36Z","published":"2023-10-25T18:34:06Z","title":"MimicTouch: Leveraging Multi-modal Human Tactile Demonstrations for\n Contact-rich Manipulation","summary":" Tactile sensing is critical to fine-grained, contact-rich manipulation tasks,\nsuch as insertion and assembly. Prior research has shown the possibility of\nlearning tactile-guided policy from teleoperated demonstration data. However,\nto provide the demonstration, human users often rely on visual feedback to\ncontrol the robot. This creates a gap between the sensing modality used for\ncontrolling the robot (visual) and the modality of interest (tactile). To\nbridge this gap, we introduce \"MimicTouch\", a novel framework for learning\npolicies directly from demonstrations provided by human users with their hands.\nThe key innovations are i) a human tactile data collection system which\ncollects multi-modal tactile dataset for learning human's tactile-guided\ncontrol strategy, ii) an imitation learning-based framework for learning\nhuman's tactile-guided control strategy through such data, and iii) an online\nresidual RL framework to bridge the embodiment gap between the human hand and\nthe robot gripper. Through comprehensive experiments, we highlight the efficacy\nof utilizing human's tactile-guided control strategy to resolve contact-rich\nmanipulation tasks. The project website is at\nhttps://sites.google.com/view/MimicTouch.\n","authors":["Kelin Yu","Yunhai Han","Qixian Wang","Vaibhav Saxena","Danfei Xu","Ye Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.16917v3.pdf","comment":"Accepted by CoRL 2024, Best Paper Award at NeurIPS 2023 Touch\n Processing Workshop"},{"id":"http://arxiv.org/abs/2408.16686v2","updated":"2024-09-05T15:11:40Z","published":"2024-08-29T16:32:24Z","title":"CW-CNN & CW-AN: Convolutional Networks and Attention Networks for\n CW-Complexes","summary":" We present a novel framework for learning on CW-complex structured data\npoints. Recent advances have discussed CW-complexes as ideal learning\nrepresentations for problems in cheminformatics. However, there is a lack of\navailable machine learning methods suitable for learning on CW-complexes. In\nthis paper we develop notions of convolution and attention that are well\ndefined for CW-complexes. These notions enable us to create the first Hodge\ninformed neural network that can receive a CW-complex as input. 
We illustrate\nand interpret this framework in the context of supervised prediction.\n","authors":["Rahul Khorana"],"pdf_url":"https://arxiv.org/pdf/2408.16686v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08561v3","updated":"2024-09-05T15:09:54Z","published":"2023-08-14T13:18:40Z","title":"Implementation of The Future of Drug Discovery: QuantumBased Machine\n Learning Simulation (QMLS)","summary":" The Research & Development (R&D) phase of drug development is a lengthy and\ncostly process. To revolutionize this process, we introduce our new concept\nQMLS to shorten the whole R&D phase to three to six months and decrease the\ncost to merely fifty to eighty thousand USD. For Hit Generation, Machine\nLearning Molecule Generation (MLMG) generates possible hits according to the\nmolecular structure of the target protein while the Quantum Simulation (QS)\nfilters molecules from the primary essay based on the reaction and binding\neffectiveness with the target protein. Then, For Lead Optimization, the\nresultant molecules generated and filtered from MLMG and QS are compared, and\nmolecules that appear as a result of both processes will be made into dozens of\nmolecular variations through Machine Learning Molecule Variation (MLMV), while\nothers will only be made into a few variations. Lastly, all optimized molecules\nwould undergo multiple rounds of QS filtering with a high standard for reaction\neffectiveness and safety, creating a few dozen pre-clinical-trail-ready drugs.\nThis paper is based on our first paper, where we pitched the concept of machine\nlearning combined with quantum simulations. In this paper we will go over the\ndetailed design and framework of QMLS, including MLMG, MLMV, and QS.\n","authors":["Yifan Zhou","Yan Shing Liang","Yew Kee Wong","Haichuan Qiu","Yu Xi Wu","Bin He"],"pdf_url":"https://arxiv.org/pdf/2308.08561v3.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2306.01095v4","updated":"2024-09-05T15:01:32Z","published":"2023-06-01T19:10:57Z","title":"Large-Batch, Iteration-Efficient Neural Bayesian Design Optimization","summary":" Bayesian optimization (BO) provides a powerful framework for optimizing\nblack-box, expensive-to-evaluate functions. It is therefore an attractive tool\nfor engineering design problems, typically involving multiple objectives.\nThanks to the rapid advances in fabrication and measurement methods as well as\nparallel computing infrastructure, querying many design problems can be heavily\nparallelized. This class of problems challenges BO with an unprecedented setup\nwhere it has to deal with very large batches, shifting its focus from sample\nefficiency to iteration efficiency. We present a novel Bayesian optimization\nframework specifically tailored to address these limitations. Our key\ncontribution is a highly scalable, sample-based acquisition function that\nperforms a non-dominated sorting of not only the objectives but also their\nassociated uncertainty. We show that our acquisition function in combination\nwith different Bayesian neural network surrogates is effective in\ndata-intensive environments with a minimal number of iterations. We demonstrate\nthe superiority of our method by comparing it with state-of-the-art\nmulti-objective optimizations. We perform our evaluation on two real-world\nproblems -- airfoil design and 3D printing -- showcasing the applicability and\nefficiency of our approach. 
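The acquisition function in the large-batch Bayesian optimisation abstract above ranks sampled candidates by non-dominated sorting over the objectives together with their associated uncertainties. A generic non-dominated sorting routine over minimisation columns is sketched below; it is not the authors' implementation, and treating uncertainty as an extra column is only one plausible reading of the abstract:

```python
import numpy as np

def non_dominated_sort(points):
    """Sort rows of `points` (all columns to be minimised) into Pareto fronts.

    Returns a list of index arrays: front 0 is the non-dominated set, front 1 is
    non-dominated once front 0 is removed, and so on. A plain O(n^2) routine.
    """
    pts = np.asarray(points, dtype=float)
    remaining = np.arange(len(pts))
    fronts = []
    while remaining.size:
        sub = pts[remaining]
        dominated = np.zeros(remaining.size, dtype=bool)
        for i in range(remaining.size):
            others = np.delete(sub, i, axis=0)
            # i is dominated if some other row is <= in every column and < in at least one.
            dominated[i] = np.any(np.all(others <= sub[i], axis=1) &
                                  np.any(others < sub[i], axis=1))
        fronts.append(remaining[~dominated])
        remaining = remaining[dominated]
    return fronts

# Toy usage: two objectives plus a predictive-uncertainty column, all minimised.
cand = np.random.rand(100, 3)
print([len(f) for f in non_dominated_sort(cand)])
```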
Our code is available at:\nhttps://github.com/an-on-ym-ous/lbn_mobo\n","authors":["Navid Ansari","Alireza Javanmardi","Eyke Hüllermeier","Hans-Peter Seidel","Vahid Babaei"],"pdf_url":"https://arxiv.org/pdf/2306.01095v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03598v1","updated":"2024-09-05T14:57:01Z","published":"2024-09-05T14:57:01Z","title":"A practical approach to evaluating the adversarial distance for machine\n learning classifiers","summary":" Robustness is critical for machine learning (ML) classifiers to ensure\nconsistent performance in real-world applications where models may encounter\ncorrupted or adversarial inputs. In particular, assessing the robustness of\nclassifiers to adversarial inputs is essential to protect systems from\nvulnerabilities and thus ensure safety in use. However, methods to accurately\ncompute adversarial robustness have been challenging for complex ML models and\nhigh-dimensional data. Furthermore, evaluations typically measure adversarial\naccuracy on specific attack budgets, limiting the informative value of the\nresulting metrics. This paper investigates the estimation of the more\ninformative adversarial distance using iterative adversarial attacks and a\ncertification approach. Combined, the methods provide a comprehensive\nevaluation of adversarial robustness by computing estimates for the upper and\nlower bounds of the adversarial distance. We present visualisations and\nablation studies that provide insights into how this evaluation method should\nbe applied and parameterised. We find that our adversarial attack approach is\neffective compared to related implementations, while the certification method\nfalls short of expectations. The approach in this paper should encourage a more\ninformative way of evaluating the adversarial robustness of ML classifiers.\n","authors":["Georg Siedel","Ekagra Gupta","Andrey Morozov"],"pdf_url":"https://arxiv.org/pdf/2409.03598v1.pdf","comment":"Accepted manuscript at International Mechanical Engineering Congress\n and Exposition IMECE2024"},{"id":"http://arxiv.org/abs/2409.03588v1","updated":"2024-09-05T14:43:11Z","published":"2024-09-05T14:43:11Z","title":"Costs Estimation in Unit Commitment Problems using Simulation-Based\n Inference","summary":" The Unit Commitment (UC) problem is a key optimization task in power systems\nto forecast the generation schedules of power units over a finite time period\nby minimizing costs while meeting demand and technical constraints. However,\nmany parameters required by the UC problem are unknown, such as the costs. In\nthis work, we estimate these unknown costs using simulation-based inference on\nan illustrative UC problem, which provides an approximated posterior\ndistribution of the parameters given observed generation schedules and demands.\nOur results highlight that the learned posterior distribution effectively\ncaptures the underlying distribution of the data, providing a range of possible\nvalues for the unknown parameters given a past observation. This posterior\nallows for the estimation of past costs using observed past generation\nschedules, enabling operators to better forecast future costs and make more\nrobust generation scheduling forecasts. 
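The cost-estimation abstract above infers unknown Unit Commitment costs from observed schedules and demands via simulation-based inference. The toy sketch below uses rejection ABC, the simplest member of that family, with a hypothetical two-unit merit-order dispatcher standing in for the UC solver; the paper's actual simulator and inference method (likely a neural posterior estimator) are richer, so this only conveys the shape of the problem:

```python
import numpy as np

rng = np.random.default_rng(0)

def dispatch(costs, demand, capacity=(60.0, 80.0)):
    """Hypothetical two-unit merit-order dispatch: the cheaper unit is loaded first.
    Stands in for the UC optimisation, which is far richer (commitment, constraints)."""
    order = np.argsort(costs)
    schedule = np.zeros((len(demand), 2))
    for t, d in enumerate(demand):
        remaining = d
        for u in order:
            schedule[t, u] = min(capacity[u], remaining)
            remaining -= schedule[t, u]
    return schedule

# "Observed" schedules generated with ground-truth costs we pretend not to know.
true_costs = np.array([20.0, 35.0])
demand = rng.uniform(50.0, 120.0, size=24)
observed = dispatch(true_costs, demand)

# Rejection ABC: keep prior draws whose simulated schedules best match the observation.
prior = rng.uniform(5.0, 50.0, size=(20000, 2))
dist = np.array([np.abs(dispatch(c, demand) - observed).mean() for c in prior])
accepted = prior[dist <= np.quantile(dist, 0.01)]
# The toy dispatcher depends only on the cost ordering, so the accepted samples pin down
# the ordering while leaving magnitudes wide, i.e. a genuine range of plausible values.
print(accepted.mean(axis=0), accepted.std(axis=0))
```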
We present avenues for future research\nto address overconfidence in posterior estimation, enhance the scalability of\nthe methodology and apply it to more complex UC problems modeling the network\nconstraints and renewable energy sources.\n","authors":["Matthias Pirlet","Adrien Bolland","Gilles Louppe","Damien Ernst"],"pdf_url":"https://arxiv.org/pdf/2409.03588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07666v4","updated":"2024-09-05T14:37:59Z","published":"2024-08-14T16:58:48Z","title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories,\n Applications and Opportunities","summary":" Model merging is an efficient empowerment technique in the machine learning\ncommunity that does not require the collection of raw training data and does\nnot require expensive computation. As model merging becomes increasingly\nprevalent across various fields, it is crucial to understand the available\nmodel merging techniques comprehensively. However, there is a significant gap\nin the literature regarding a systematic and thorough review of these\ntechniques. This survey provides a comprehensive overview of model merging\nmethods and theories, their applications in various domains and settings, and\nfuture research directions. Specifically, we first propose a new taxonomic\napproach that exhaustively discusses existing model merging methods. Secondly,\nwe discuss the application of model merging techniques in large language\nmodels, multimodal large language models, and 10+ machine learning subfields,\nincluding continual learning, multi-task learning, few-shot learning, etc.\nFinally, we highlight the remaining challenges of model merging and discuss\nfuture research directions. A comprehensive list of papers about model merging\nis available at\n\\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}.\n","authors":["Enneng Yang","Li Shen","Guibing Guo","Xingwei Wang","Xiaochun Cao","Jie Zhang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2408.07666v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03577v1","updated":"2024-09-05T14:31:05Z","published":"2024-09-05T14:31:05Z","title":"CHIRPs: Change-Induced Regret Proxy metrics for Lifelong Reinforcement\n Learning","summary":" Reinforcement learning agents can achieve superhuman performance in static\ntasks but are costly to train and fragile to task changes. This limits their\ndeployment in real-world scenarios where training experience is expensive or\nthe context changes through factors like sensor degradation, environmental\nprocesses or changing mission priorities. Lifelong reinforcement learning aims\nto improve sample efficiency and adaptability by studying how agents perform in\nevolving problems. The difficulty that these changes pose to an agent is rarely\nmeasured directly, however. Agent performances can be compared across a change,\nbut this is often prohibitively expensive. We propose Change-Induced Regret\nProxy (CHIRP) metrics, a class of metrics for approximating a change's\ndifficulty while avoiding the high costs of using trained agents. A\nrelationship between a CHIRP metric and agent performance is identified in two\nenvironments, a simple grid world and MetaWorld's suite of robotic arm tasks.\nWe demonstrate two uses for these metrics: for learning, an agent that clusters\nMDPs based on a CHIRP metric achieves $17\\%$ higher average returns than three\nexisting agents in a sequence of MetaWorld tasks. 
We also show how a CHIRP can\nbe calibrated to compare the difficulty of changes across distinctly different\nenvironments.\n","authors":["John Birkbeck","Adam Sobey","Federico Cerutti","Katherine Heseltine Hurley Flynn","Timothy J. Norman"],"pdf_url":"https://arxiv.org/pdf/2409.03577v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.03563v1","updated":"2024-09-05T14:19:45Z","published":"2024-09-05T14:19:45Z","title":"100 instances is all you need: predicting the success of a new LLM on\n unseen data by testing on a few instances","summary":" Predicting the performance of LLMs on individual task instances is essential\nto ensure their reliability in high-stakes applications. To do so, a\npossibility is to evaluate the considered LLM on a set of task instances and\ntrain an assessor to predict its performance based on features of the\ninstances. However, this approach requires evaluating each new LLM on a\nsufficiently large set of task instances to train an assessor specific to it.\nIn this work, we leverage the evaluation results of previously tested LLMs to\nreduce the number of evaluations required to predict the performance of a new\nLLM. In practice, we propose to test the new LLM on a small set of reference\ninstances and train a generic assessor which predicts the performance of the\nLLM on an instance based on the performance of the former on the reference set\nand features of the instance of interest. We conduct empirical studies on\nHELM-Lite and KindsOfReasoning, a collection of existing reasoning datasets\nthat we introduce, where we evaluate all instruction-fine-tuned OpenAI models\nuntil the January 2024 version of GPT4. When predicting performance on\ninstances with the same distribution as those used to train the generic\nassessor, we find this achieves performance comparable to the LLM-specific\nassessors trained on the full set of instances. Additionally, we find that\nrandomly selecting the reference instances performs as well as some advanced\nselection methods we tested. For out of distribution, however, no clear winner\nemerges and the overall performance is worse, suggesting that the inherent\npredictability of LLMs is low.\n","authors":["Lorenzo Pacchiardi","Lucy G. Cheke","José Hernández-Orallo"],"pdf_url":"https://arxiv.org/pdf/2409.03563v1.pdf","comment":"Presented at the 2024 KDD workshop on Evaluation and Trustworthiness\n of Generative AI Models"},{"id":"http://arxiv.org/abs/2405.07441v3","updated":"2024-09-05T14:17:48Z","published":"2024-05-13T02:59:50Z","title":"Reducing Spatial Discretization Error on Coarse CFD Simulations Using an\n OpenFOAM-Embedded Deep Learning Framework","summary":" We propose a method for reducing the spatial discretization error of coarse\ncomputational fluid dynamics (CFD) problems by enhancing the quality of\nlow-resolution simulations using deep learning. We feed the model with\nfine-grid data after projecting it to the coarse-grid discretization. We\nsubstitute the default differencing scheme for the convection term by a\nfeed-forward neural network that interpolates velocities from cell centers to\nface values to produce velocities that approximate the down-sampled fine-grid\ndata well. The deep learning framework incorporates the open-source CFD code\nOpenFOAM, resulting in an end-to-end differentiable model. We automatically\ndifferentiate the CFD physics using a discrete adjoint code version. 
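The learned convection scheme in the OpenFOAM abstract above replaces the default face interpolation with a feed-forward network mapping cell-centre velocities to face values. A minimal PyTorch sketch of such an interpolator follows; the class name, stencil size and the synthetic training target are assumptions, and the real model is trained through the differentiable OpenFOAM coupling rather than in isolation:

```python
import torch
import torch.nn as nn

class FaceInterpolator(nn.Module):
    """Tiny MLP mapping a stencil of cell-centre velocities to one face velocity.

    Illustrative stand-in for the paper's learned interpolation scheme."""
    def __init__(self, stencil_size: int = 4, hidden: int = 32):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(stencil_size, hidden), nn.Tanh(),
            nn.Linear(hidden, hidden), nn.Tanh(),
            nn.Linear(hidden, 1),
        )

    def forward(self, stencil: torch.Tensor) -> torch.Tensor:
        # stencil: (n_faces, stencil_size) cell-centre velocities around each face.
        return self.net(stencil).squeeze(-1)

# Fit against stand-in "fine-grid" face velocities (random data here).
model = FaceInterpolator()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
stencil = torch.randn(256, 4)
target = stencil.mean(dim=1) + 0.1 * torch.randn(256)   # pretend down-sampled fine-grid truth
for _ in range(200):
    opt.zero_grad()
    loss = nn.functional.mse_loss(model(stencil), target)
    loss.backward()
    opt.step()
print(float(loss))
```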
We present\na fast communication method between TensorFlow (Python) and OpenFOAM (c++) that\naccelerates the training process. We applied the model to the flow past a\nsquare cylinder problem, reducing the error from 120% to 25% in the velocity\nfor simulations inside the training distribution compared to the traditional\nsolver using an x8 coarser mesh. For simulations outside the training\ndistribution, the error reduction in the velocities was about 50%. The training\nis affordable in terms of time and data samples since the architecture exploits\nthe local features of the physics.\n","authors":["Jesus Gonzalez-Sieiro","David Pardo","Vincenzo Nava","Victor M. Calo","Markus Towara"],"pdf_url":"https://arxiv.org/pdf/2405.07441v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03556v1","updated":"2024-09-05T14:17:01Z","published":"2024-09-05T14:17:01Z","title":"MaskVal: Simple but Effective Uncertainty Quantification for 6D Pose\n Estimation","summary":" For the use of 6D pose estimation in robotic applications, reliable poses are\nof utmost importance to ensure a safe, reliable and predictable operational\nperformance. Despite these requirements, state-of-the-art 6D pose estimators\noften do not provide any uncertainty quantification for their pose estimates at\nall, or if they do, it has been shown that the uncertainty provided is only\nweakly correlated with the actual true error. To address this issue, we\ninvestigate a simple but effective uncertainty quantification, that we call\nMaskVal, which compares the pose estimates with their corresponding instance\nsegmentations by rendering and does not require any modification of the pose\nestimator itself. Despite its simplicity, MaskVal significantly outperforms a\nstate-of-the-art ensemble method on both a dataset and a robotic setup. We show\nthat by using MaskVal, the performance of a state-of-the-art 6D pose estimator\nis significantly improved towards a safe and reliable operation. In addition,\nwe propose a new and specific approach to compare and evaluate uncertainty\nquantification methods for 6D pose estimation in the context of robotic\nmanipulation.\n","authors":["Philipp Quentin","Daniel Goehring"],"pdf_url":"https://arxiv.org/pdf/2409.03556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03555v1","updated":"2024-09-05T14:15:54Z","published":"2024-09-05T14:15:54Z","title":"Unified Framework for Neural Network Compression via Decomposition and\n Optimal Rank Selection","summary":" Despite their high accuracy, complex neural networks demand significant\ncomputational resources, posing challenges for deployment on\nresource-constrained devices such as mobile phones and embedded systems.\nCompression algorithms have been developed to address these challenges by\nreducing model size and computational demands while maintaining accuracy. Among\nthese approaches, factorization methods based on tensor decomposition are\ntheoretically sound and effective. However, they face difficulties in selecting\nthe appropriate rank for decomposition. This paper tackles this issue by\npresenting a unified framework that simultaneously applies decomposition and\noptimal rank selection, employing a composite compression loss within defined\nrank constraints. Our approach includes an automatic rank search in a\ncontinuous space, efficiently identifying optimal rank configurations without\nthe use of training data, making it computationally efficient. 
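As background for the decomposition-plus-rank-selection framework described above, the sketch below factorises a single weight matrix with a truncated SVD and picks the rank by a fixed singular-value energy threshold. This is a deliberately simple, data-free stand-in; the paper's contribution is precisely to replace such heuristics with an automatic rank search under a composite compression loss:

```python
import numpy as np

def low_rank_factorise(W, energy=0.95):
    """Factorise W (out x in) as A @ B using the smallest rank that keeps `energy`
    of the squared singular-value mass. Simple heuristic, not optimal rank selection."""
    U, s, Vt = np.linalg.svd(W, full_matrices=False)
    cum = np.cumsum(s**2) / np.sum(s**2)
    r = int(np.searchsorted(cum, energy) + 1)
    A = U[:, :r] * s[:r]          # (out, r)
    B = Vt[:r, :]                 # (r, in)
    return A, B, r

# A matrix with decaying spectrum, as a stand-in for a trained linear layer.
W = np.random.randn(512, 1024) * np.exp(-np.arange(1024) / 50.0)
A, B, r = low_rank_factorise(W, energy=0.95)
print(r, W.size, A.size + B.size, np.linalg.norm(W - A @ B) / np.linalg.norm(W))
```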
Combined with a\nsubsequent fine-tuning step, our approach maintains the performance of highly\ncompressed models on par with their original counterparts. Using various\nbenchmark datasets, we demonstrate the efficacy of our method through a\ncomprehensive analysis.\n","authors":["Ali Aghababaei-Harandi","Massih-Reza Amini"],"pdf_url":"https://arxiv.org/pdf/2409.03555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03550v1","updated":"2024-09-05T14:12:22Z","published":"2024-09-05T14:12:22Z","title":"DKDM: Data-Free Knowledge Distillation for Diffusion Models with Any\n Architecture","summary":" Diffusion models (DMs) have demonstrated exceptional generative capabilities\nacross various areas, while they are hindered by slow inference speeds and high\ncomputational demands during deployment. The most common way to accelerate DMs\ninvolves reducing the number of denoising steps during generation, achieved\nthrough faster sampling solvers or knowledge distillation (KD). In contrast to\nprior approaches, we propose a novel method that transfers the capability of\nlarge pretrained DMs to faster architectures. Specifically, we employ KD in a\ndistinct manner to compress DMs by distilling their generative ability into\nmore rapid variants. Furthermore, considering that the source data is either\nunaccessible or too enormous to store for current generative models, we\nintroduce a new paradigm for their distillation without source data, termed\nData-Free Knowledge Distillation for Diffusion Models (DKDM). Generally, our\nestablished DKDM framework comprises two main components: 1) a DKDM objective\nthat uses synthetic denoising data produced by pretrained DMs to optimize\nfaster DMs without source data, and 2) a dynamic iterative distillation method\nthat flexibly organizes the synthesis of denoising data, preventing it from\nslowing down the optimization process as the generation is slow. To our\nknowledge, this is the first attempt at using KD to distill DMs into any\narchitecture in a data-free manner. Importantly, our DKDM is orthogonal to most\nexisting acceleration methods, such as denoising step reduction, quantization\nand pruning. Experiments show that our DKDM is capable of deriving 2x faster\nDMs with performance remaining on par with the baseline. Notably, our DKDM\nenables pretrained DMs to function as \"datasets\" for training new DMs.\n","authors":["Qianlong Xiang","Miao Zhang","Yuzhang Shang","Jianlong Wu","Yan Yan","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2409.03550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03545v1","updated":"2024-09-05T14:07:10Z","published":"2024-09-05T14:07:10Z","title":"The Power of Second Chance: Personalized Submodular Maximization with\n Two Candidates","summary":" Most of existing studies on submodular maximization focus on selecting a\nsubset of items that maximizes a \\emph{single} submodular function. However, in\nmany real-world scenarios, we might have multiple user-specific functions, each\nof which models the utility of a particular type of user. In these settings,\nour goal would be to choose a set of items that performs well across all the\nuser-specific functions. One way to tackle this problem is to select a single\nsubset that maximizes the sum of all of the user-specific functions. 
Although\nthis aggregate approach is efficient in the sense that it avoids computation of\nsets for individual functions, it really misses the power of personalization -\nfor it does not allow to choose different sets for different functions. In this\npaper, we introduce the problem of personalized submodular maximization with\ntwo candidate solutions. For any two candidate solutions, the utility of each\nuser-specific function is defined as the better of these two candidates. Our\nobjective is, therefore, to select the best set of two candidates that maximize\nthe sum of utilities of all the user-specific functions. We have designed\neffective algorithms for this problem. We also discuss how our approach\ngeneralizes to multiple candidate solutions, increasing flexibility and\npersonalization in our solution.\n","authors":["Jing Yuan","Shaojie Tang"],"pdf_url":"https://arxiv.org/pdf/2409.03545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03542v1","updated":"2024-09-05T14:06:56Z","published":"2024-09-05T14:06:56Z","title":"Risk-based Calibration for Probabilistic Classifiers","summary":" We introduce a general iterative procedure called risk-based calibration (RC)\ndesigned to minimize the empirical risk under the 0-1 loss (empirical error)\nfor probabilistic classifiers. These classifiers are based on modeling\nprobability distributions, including those constructed from the joint\ndistribution (generative) and those based on the class conditional distribution\n(conditional). RC can be particularized to any probabilistic classifier\nprovided a specific learning algorithm that computes the classifier's\nparameters in closed form using data statistics. RC reinforces the statistics\naligned with the true class while penalizing those associated with other\nclasses, guided by the 0-1 loss. The proposed method has been empirically\ntested on 30 datasets using na\\\"ive Bayes, quadratic discriminant analysis, and\nlogistic regression classifiers. RC improves the empirical error of the\noriginal closed-form learning algorithms and, more notably, consistently\noutperforms the gradient descent approach with the three classifiers.\n","authors":["Aritz Pérez","Carlos Echegoyen","Guzmán Santafé"],"pdf_url":"https://arxiv.org/pdf/2409.03542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03543v1","updated":"2024-09-05T14:06:56Z","published":"2024-09-05T14:06:56Z","title":"Prediction Accuracy & Reliability: Classification and Object\n Localization under Distribution Shift","summary":" Natural distribution shift causes a deterioration in the perception\nperformance of convolutional neural networks (CNNs). This comprehensive\nanalysis for real-world traffic data addresses: 1) investigating the effect of\nnatural distribution shift and weather augmentations on both detection quality\nand confidence estimation, 2) evaluating model performance for both\nclassification and object localization, and 3) benchmarking two common\nuncertainty quantification methods - Ensembles and different variants of\nMonte-Carlo (MC) Dropout - under natural and close-to-natural distribution\nshift. For this purpose, a novel dataset has been curated from publicly\navailable autonomous driving datasets. The in-distribution (ID) data is based\non cutouts of a single object, for which both class and bounding box\nannotations are available. The six distribution-shift datasets cover adverse\nweather scenarios, simulated rain and fog, corner cases, and\nout-of-distribution data. 
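Returning to the personalized submodular maximization abstract above: its objective is to pick two candidate sets S1, S2 so that the sum over users of max(f_u(S1), f_u(S2)) is maximised. The sketch below is a plain greedy heuristic for that objective under a per-candidate cardinality budget, written only to make the objective concrete; it is not the authors' algorithm and carries no approximation guarantee:

```python
import itertools

def greedy_two_candidates(items, user_fns, k):
    """Greedily build two candidate sets (each of size <= k) to maximise
    sum_u max(f_u(S1), f_u(S2)). Heuristic illustration only."""
    S = [set(), set()]

    def total():
        return sum(max(f(S[0]), f(S[1])) for f in user_fns)

    for _ in range(2 * k):
        best = None
        for item, c in itertools.product(items, range(2)):
            if item in S[c] or len(S[c]) >= k:
                continue
            S[c].add(item)
            gain = total()
            S[c].remove(item)
            if best is None or gain > best[0]:
                best = (gain, item, c)
        if best is None:
            break
        S[best[2]].add(best[1])
    return S

# Toy user utilities: coverage of user-specific target sets.
targets = [{0, 1, 2}, {2, 3}, {4, 5, 6}, {0, 6}]
user_fns = [lambda S, t=t: len(S & t) for t in targets]
S1, S2 = greedy_two_candidates(items=range(8), user_fns=user_fns, k=2)
print(sorted(S1), sorted(S2))
```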
A granular analysis of CNNs under distribution shift\nallows to quantize the impact of different types of shifts on both, task\nperformance and confidence estimation: ConvNeXt-Tiny is more robust than\nEfficientNet-B0; heavy rain degrades classification stronger than localization,\ncontrary to heavy fog; integrating MC-Dropout into selected layers only has the\npotential to enhance task performance and confidence estimation, whereby the\nidentification of these layers depends on the type of distribution shift and\nthe considered task.\n","authors":["Fabian Diet","Moussa Kassem Sbeyti","Michelle Karg"],"pdf_url":"https://arxiv.org/pdf/2409.03543v1.pdf","comment":"This preprint has not undergone any post-submission improvements or\n corrections"},{"id":"http://arxiv.org/abs/2408.17235v2","updated":"2024-09-05T13:59:21Z","published":"2024-08-30T12:26:23Z","title":"AI-Driven Intrusion Detection Systems (IDS) on the ROAD Dataset: A\n Comparative Analysis for Automotive Controller Area Network (CAN)","summary":" The integration of digital devices in modern vehicles has revolutionized\nautomotive technology, enhancing safety and the overall driving experience. The\nController Area Network (CAN) bus is a central system for managing in-vehicle\ncommunication between the electronic control units (ECUs). However, the CAN\nprotocol poses security challenges due to inherent vulnerabilities, lacking\nencryption and authentication, which, combined with an expanding attack\nsurface, necessitates robust security measures. In response to this challenge,\nnumerous Intrusion Detection Systems (IDS) have been developed and deployed.\nNonetheless, an open, comprehensive, and realistic dataset to test the\neffectiveness of such IDSs remains absent in the existing literature. This\npaper addresses this gap by considering the latest ROAD dataset, containing\nstealthy and sophisticated injections. The methodology involves dataset\nlabelling and the implementation of both state-of-the-art deep learning models\nand traditional machine learning models to show the discrepancy in performance\nbetween the datasets most commonly used in the literature and the ROAD dataset,\na more realistic alternative.\n","authors":["Lorenzo Guerra","Linhan Xu","Paolo Bellavista","Thomas Chapuis","Guillaume Duc","Pavlo Mozharovskyi","Van-Tam Nguyen"],"pdf_url":"https://arxiv.org/pdf/2408.17235v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12334v2","updated":"2024-09-05T13:47:26Z","published":"2024-06-18T06:59:24Z","title":"What Did I Do Wrong? Quantifying LLMs' Sensitivity and Consistency to\n Prompt Engineering","summary":" Large Language Models (LLMs) changed the way we design and interact with\nsoftware systems. Their ability to process and extract information from text\nhas drastically improved productivity in a number of routine tasks. Developers\nthat want to include these models in their software stack, however, face a\ndreadful challenge: debugging LLMs' inconsistent behavior across minor\nvariations of the prompt. We therefore introduce two metrics for classification\ntasks, namely sensitivity and consistency, which are complementary to task\nperformance. First, sensitivity measures changes of predictions across\nrephrasings of the prompt, and does not require access to ground truth labels.\nInstead, consistency measures how predictions vary across rephrasings for\nelements of the same class. 
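One way to operationalise the two prompt-robustness metrics as defined above: sensitivity counts how often predictions change across rephrasings of the same item, and consistency counts how often rephrasings of items from the same class receive the same prediction. The sketch below follows that reading; the paper's exact estimators may be normalised differently:

```python
from itertools import combinations

def sensitivity(preds_per_item):
    """preds_per_item: list of lists, predictions for the rephrasings of each item.
    Fraction of rephrasing pairs (within an item) whose predictions disagree."""
    disagree = total = 0
    for preds in preds_per_item:
        for a, b in combinations(preds, 2):
            total += 1
            disagree += a != b
    return disagree / total if total else 0.0

def consistency(preds_per_item, labels):
    """Average pairwise agreement of predictions across items sharing the same class."""
    by_class = {}
    for preds, y in zip(preds_per_item, labels):
        by_class.setdefault(y, []).extend(preds)
    agree = total = 0
    for preds in by_class.values():
        for a, b in combinations(preds, 2):
            total += 1
            agree += a == b
    return agree / total if total else 0.0

# Toy example: 3 items, 3 rephrasings each, binary predictions.
preds = [["pos", "pos", "neg"], ["neg", "neg", "neg"], ["pos", "pos", "pos"]]
labels = ["pos", "neg", "pos"]
print(sensitivity(preds), consistency(preds, labels))
```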
We perform an empirical comparison of these metrics\non text classification tasks, using them as guideline for understanding failure\nmodes of the LLM. Our hope is that sensitivity and consistency will be helpful\nto guide prompt engineering and obtain LLMs that balance robustness with\nperformance.\n","authors":["Federico Errica","Giuseppe Siracusano","Davide Sanvito","Roberto Bifulco"],"pdf_url":"https://arxiv.org/pdf/2406.12334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11962v4","updated":"2024-09-05T13:42:02Z","published":"2023-02-23T12:18:28Z","title":"Unified Convergence Theory of Stochastic and Variance-Reduced Cubic\n Newton Methods","summary":" We study stochastic Cubic Newton methods for solving general possibly\nnon-convex minimization problems. We propose a new framework, which we call the\nhelper framework, that provides a unified view of the stochastic and\nvariance-reduced second-order algorithms equipped with global complexity\nguarantees. It can also be applied to learning with auxiliary information. Our\nhelper framework offers the algorithm designer high flexibility for\nconstructing and analyzing the stochastic Cubic Newton methods, allowing\narbitrary size batches, and the use of noisy and possibly biased estimates of\nthe gradients and Hessians, incorporating both the variance reduction and the\nlazy Hessian updates. We recover the best-known complexities for the stochastic\nand variance-reduced Cubic Newton, under weak assumptions on the noise. A\ndirect consequence of our theory is the new lazy stochastic second-order\nmethod, which significantly improves the arithmetic complexity for large\ndimension problems. We also establish complexity bounds for the classes of\ngradient-dominated objectives, that include convex and strongly convex\nproblems. For Auxiliary Learning, we show that using a helper (auxiliary\nfunction) can outperform training alone if a given similarity measure is small.\n","authors":["El Mahdi Chayti","Nikita Doikov","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2302.11962v4.pdf","comment":"Published in Transactions on Machine Learning Research"},{"id":"http://arxiv.org/abs/2307.12438v3","updated":"2024-09-05T13:41:37Z","published":"2023-07-23T21:46:55Z","title":"Multifidelity Covariance Estimation via Regression on the Manifold of\n Symmetric Positive Definite Matrices","summary":" We introduce a multifidelity estimator of covariance matrices formulated as\nthe solution to a regression problem on the manifold of symmetric positive\ndefinite matrices. The estimator is positive definite by construction, and the\nMahalanobis distance minimized to obtain it possesses properties enabling\npractical computation. We show that our manifold regression multifidelity\n(MRMF) covariance estimator is a maximum likelihood estimator under a certain\nerror model on manifold tangent space. More broadly, we show that our\nRiemannian regression framework encompasses existing multifidelity covariance\nestimators constructed from control variates. We demonstrate via numerical\nexamples that the MRMF estimator can provide significant decreases, up to one\norder of magnitude, in squared estimation error relative to both\nsingle-fidelity and other multifidelity covariance estimators. 
Furthermore,\npreservation of positive definiteness ensures that our estimator is compatible\nwith downstream tasks, such as data assimilation and metric learning, in which\nthis property is essential.\n","authors":["Aimee Maurais","Terrence Alsup","Benjamin Peherstorfer","Youssef Marzouk"],"pdf_url":"https://arxiv.org/pdf/2307.12438v3.pdf","comment":"To appear in the SIAM Journal on Mathematics of Data Science (SIMODS)"},{"id":"http://arxiv.org/abs/2409.03507v1","updated":"2024-09-05T13:20:10Z","published":"2024-09-05T13:20:10Z","title":"A Physics-Informed Machine Learning Approach for Solving Distributed\n Order Fractional Differential Equations","summary":" This paper introduces a novel methodology for solving distributed-order\nfractional differential equations using a physics-informed machine learning\nframework. The core of this approach involves extending the support vector\nregression (SVR) algorithm to approximate the unknown solutions of the\ngoverning equations during the training phase. By embedding the\ndistributed-order functional equation into the SVR framework, we incorporate\nphysical laws directly into the learning process. To further enhance\ncomputational efficiency, Gegenbauer orthogonal polynomials are employed as the\nkernel function, capitalizing on their fractional differentiation properties to\nstreamline the problem formulation. Finally, the resulting optimization problem\nof SVR is addressed either as a quadratic programming problem or as a positive\ndefinite system in its dual form. The effectiveness of the proposed approach is\nvalidated through a series of numerical experiments on Caputo-based\ndistributed-order fractional differential equations, encompassing both ordinary\nand partial derivatives.\n","authors":["Alireza Afzal Aghaei"],"pdf_url":"https://arxiv.org/pdf/2409.03507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03505v1","updated":"2024-09-05T13:19:08Z","published":"2024-09-05T13:19:08Z","title":"Survey of Data-driven Newsvendor: Unified Analysis and Spectrum of\n Achievable Regrets","summary":" In the Newsvendor problem, the goal is to guess the number that will be drawn\nfrom some distribution, with asymmetric consequences for guessing too high vs.\ntoo low. In the data-driven version, the distribution is unknown, and one must\nwork with samples from the distribution. Data-driven Newsvendor has been\nstudied under many variants: additive vs. multiplicative regret, high\nprobability vs. expectation bounds, and different distribution classes. This\npaper studies all combinations of these variants, filling in many gaps in the\nliterature and simplifying many proofs. In particular, we provide a unified\nanalysis based on the notion of clustered distributions, which in conjunction\nwith our new lower bounds, shows that the entire spectrum of regrets between\n$1/\\sqrt{n}$ and $1/n$ can be possible.\n","authors":["Zhuoxin Chen","Will Ma"],"pdf_url":"https://arxiv.org/pdf/2409.03505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10332v2","updated":"2024-09-05T13:16:07Z","published":"2024-07-14T21:11:44Z","title":"Ontology-driven Reinforcement Learning for Personalized Student Support","summary":" In the search for more effective education, there is a widespread effort to\ndevelop better approaches to personalize student education. Unassisted,\neducators often do not have time or resources to personally support every\nstudent in a given classroom. 
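For the data-driven Newsvendor abstract above, the standard sample-based decision is the empirical quantile of the demand samples at the critical fractile b/(b+h). The sketch below shows that classical estimator purely to fix the setup; the paper's contribution is the unified regret analysis around it, and the cost values here are arbitrary:

```python
import numpy as np

def saa_newsvendor_order(demand_samples, underage_cost, overage_cost):
    """Sample-based order quantity: the empirical quantile of demand at the critical
    fractile b / (b + h), which minimises the empirical newsvendor cost.
    (Any consistent empirical-quantile convention works for this sketch.)"""
    q = underage_cost / (underage_cost + overage_cost)
    return np.quantile(demand_samples, q)

rng = np.random.default_rng(1)
demand = rng.exponential(scale=100.0, size=500)    # samples from an unknown distribution
order = saa_newsvendor_order(demand, underage_cost=4.0, overage_cost=1.0)
print(order)    # critical fractile here is 4 / (4 + 1) = 0.8
```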
Motivated by this issue, and by recent\nadvancements in artificial intelligence, this paper presents a general-purpose\nframework for personalized student support, applicable to any virtual\neducational system such as a serious game or an intelligent tutoring system. To\nfit any educational situation, we apply ontologies for their semantic\norganization, combining them with data collection considerations and\nmulti-agent reinforcement learning. The result is a modular system that can be\nadapted to any virtual educational software to provide useful personalized\nassistance to students.\n","authors":["Ryan Hare","Ying Tang"],"pdf_url":"https://arxiv.org/pdf/2407.10332v2.pdf","comment":"6 pages, 3 figures, in press for IEEE Systems, Man, and Cybernetics\n 2024 Conference"},{"id":"http://arxiv.org/abs/2409.01931v2","updated":"2024-09-05T13:10:22Z","published":"2024-09-03T14:21:46Z","title":"On the design space between molecular mechanics and machine learning\n force fields","summary":" A force field as accurate as quantum mechanics (QM) and as fast as molecular\nmechanics (MM), with which one can simulate a biomolecular system efficiently\nenough and meaningfully enough to get quantitative insights, is among the most\nardent dreams of biophysicists -- a dream, nevertheless, not to be fulfilled\nany time soon. Machine learning force fields (MLFFs) represent a meaningful\nendeavor towards this direction, where differentiable neural functions are\nparametrized to fit ab initio energies, and furthermore forces through\nautomatic differentiation. We argue that, as of now, the utility of the MLFF\nmodels is no longer bottlenecked by accuracy but primarily by their speed (as\nwell as stability and generalizability), as many recent variants, on limited\nchemical spaces, have long surpassed the chemical accuracy of $1$ kcal/mol --\nthe empirical threshold beyond which realistic chemical predictions are\npossible -- though still magnitudes slower than MM. Hoping to kindle\nexplorations and designs of faster, albeit perhaps slightly less accurate\nMLFFs, in this review, we focus our attention on the design space (the\nspeed-accuracy tradeoff) between MM and ML force fields. After a brief review\nof the building blocks of force fields of either kind, we discuss the desired\nproperties and challenges now faced by the force field development community,\nsurvey the efforts to make MM force fields more accurate and ML force fields\nfaster, envision what the next generation of MLFF might look like.\n","authors":["Yuanqing Wang","Kenichiro Takaba","Michael S. Chen","Marcus Wieder","Yuzhi Xu","Tong Zhu","John Z. H. Zhang","Arnav Nagle","Kuang Yu","Xinyan Wang","Daniel J. Cole","Joshua A. Rackers","Kyunghyun Cho","Joe G. Greener","Peter Eastman","Stefano Martiniani","Mark E. Tuckerman"],"pdf_url":"https://arxiv.org/pdf/2409.01931v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00515v3","updated":"2024-09-05T13:09:23Z","published":"2024-02-01T11:31:26Z","title":"Developing A Multi-Agent and Self-Adaptive Framework with Deep\n Reinforcement Learning for Dynamic Portfolio Risk Management","summary":" Deep or reinforcement learning (RL) approaches have been adapted as reactive\nagents to quickly learn and respond with new investment strategies for\nportfolio management under the highly turbulent financial market environments\nin recent years. 
In many cases, due to the very complex correlations among\nvarious financial sectors, and the fluctuating trends in different financial\nmarkets, a deep or reinforcement learning based agent can be biased in\nmaximising the total returns of the newly formulated investment portfolio while\nneglecting its potential risks under the turmoil of various market conditions\nin the global or regional sectors. Accordingly, a multi-agent and self-adaptive\nframework namely the MASA is proposed in which a sophisticated multi-agent\nreinforcement learning (RL) approach is adopted through two cooperating and\nreactive agents to carefully and dynamically balance the trade-off between the\noverall portfolio returns and their potential risks. Besides, a very flexible\nand proactive agent as the market observer is integrated into the MASA\nframework to provide some additional information on the estimated market trends\nas valuable feedbacks for multi-agent RL approach to quickly adapt to the\never-changing market conditions. The obtained empirical results clearly reveal\nthe potential strengths of our proposed MASA framework based on the multi-agent\nRL approach against many well-known RL-based approaches on the challenging data\nsets of the CSI 300, Dow Jones Industrial Average and S&P 500 indexes over the\npast 10 years. More importantly, our proposed MASA framework shed lights on\nmany possible directions for future investigation.\n","authors":["Zhenglong Li","Vincent Tam","Kwan L. Yeung"],"pdf_url":"https://arxiv.org/pdf/2402.00515v3.pdf","comment":"In Proceedings of the 23rd International Conference on Autonomous\n Agents and Multiagent Systems"},{"id":"http://arxiv.org/abs/2409.03495v1","updated":"2024-09-05T13:07:31Z","published":"2024-09-05T13:07:31Z","title":"Maximum likelihood inference for high-dimensional problems with\n multiaffine variable relations","summary":" Maximum Likelihood Estimation of continuous variable models can be very\nchallenging in high dimensions, due to potentially complex probability\ndistributions. The existence of multiple interdependencies among variables can\nmake it very difficult to establish convergence guarantees. This leads to a\nwide use of brute-force methods, such as grid searching and Monte-Carlo\nsampling and, when applicable, complex and problem-specific algorithms. In this\npaper, we consider inference problems where the variables are related by\nmultiaffine expressions. We propose a novel Alternating and\nIteratively-Reweighted Least Squares (AIRLS) algorithm, and prove its\nconvergence for problems with Generalized Normal Distributions. We also provide\nan efficient method to compute the variance of the estimates obtained using\nAIRLS. Finally, we show how the method can be applied to graphical statistical\nmodels. 
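The AIRLS algorithm above builds on iteratively-reweighted least squares. As background only, the sketch below is the standard IRLS loop for an l_p regression objective with a single linear parameter block; AIRLS alternates such reweighted solves over blocks of multiaffinely related variables, which is not shown here:

```python
import numpy as np

def irls(X, y, p=1.0, iters=50, eps=1e-6):
    """Iteratively-Reweighted Least Squares for min ||X b - y||_p^p (p <= 2).

    Each iteration solves a weighted least-squares problem with weights
    |residual|^(p - 2); p = 1 approximates least-absolute-deviations regression."""
    b = np.linalg.lstsq(X, y, rcond=None)[0]
    for _ in range(iters):
        r = y - X @ b
        w = np.clip(np.abs(r), eps, None) ** (p - 2.0)
        WX = X * w[:, None]
        b = np.linalg.solve(X.T @ WX, WX.T @ y)   # normal equations X^T W X b = X^T W y
    return b

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.standard_t(df=2, size=200)  # heavy-tailed noise
print(irls(X, y, p=1.0))
```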
We perform numerical experiments on several inference problems, showing\nsignificantly better performance than state-of-the-art approaches in terms of\nscalability, robustness to noise, and convergence speed due to an empirically\nobserved super-linear convergence rate.\n","authors":["Jean-Sébastien Brouillon","Florian Dörfler","Giancarlo Ferrari-Trecate"],"pdf_url":"https://arxiv.org/pdf/2409.03495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14847v3","updated":"2024-09-05T12:59:56Z","published":"2023-12-22T17:19:50Z","title":"Large Scale Training of Graph Neural Networks for Optimal Markov-Chain\n Partitioning Using the Kemeny Constant","summary":" Traditional clustering algorithms often struggle to capture the complex\nrelationships within graphs and generalise to arbitrary clustering criteria.\nThe emergence of graph neural networks (GNNs) as a powerful framework for\nlearning representations of graph data provides new approaches to solving the\nproblem. Previous work has shown GNNs to be capable of proposing partitionings\nusing a variety of criteria, however, these approaches have not yet been\nextended to work on Markov chains or kinetic networks. These arise frequently\nin the study of molecular systems and are of particular interest to the\nbiochemical modelling community. In this work, we propose several GNN-based\narchitectures to tackle the graph partitioning problem for Markov Chains\ndescribed as kinetic networks. This approach aims to minimize how much a\nproposed partitioning changes the Kemeny constant. We propose using an\nencoder-decoder architecture and show how simple GraphSAGE-based GNNs with\nlinear layers can outperform much larger and more expressive attention-based\nmodels in this context. As a proof of concept, we first demonstrate the\nmethod's ability to cluster randomly connected graphs. We also use a linear\nchain architecture corresponding to a 1D free energy profile as our kinetic\nnetwork. Subsequently, we demonstrate the effectiveness of our method through\nexperiments on a data set derived from molecular dynamics. We compare the\nperformance of our method to other partitioning techniques such as PCCA+. We\nexplore the importance of feature and hyperparameter selection and propose a\ngeneral strategy for large-scale parallel training of GNNs for discovering\noptimal graph partitionings.\n","authors":["Sam Alexander Martino","João Morado","Chenghao Li","Zhenghao Lu","Edina Rosta"],"pdf_url":"https://arxiv.org/pdf/2312.14847v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03492v1","updated":"2024-09-05T12:59:38Z","published":"2024-09-05T12:59:38Z","title":"Distributionally Robust Optimisation with Bayesian Ambiguity Sets","summary":" Decision making under uncertainty is challenging since the data-generating\nprocess (DGP) is often unknown. Bayesian inference proceeds by estimating the\nDGP through posterior beliefs about the model's parameters. However, minimising\nthe expected risk under these posterior beliefs can lead to sub-optimal\ndecisions due to model uncertainty or limited, noisy observations. To address\nthis, we introduce Distributionally Robust Optimisation with Bayesian Ambiguity\nSets (DRO-BAS) which hedges against uncertainty in the model by optimising the\nworst-case risk over a posterior-informed ambiguity set. 
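To convey the min-max shape of the DRO-BAS objective described above, the sketch below approximates the ambiguity set by a finite collection of posterior-style parameter draws and takes the worst case over them, on a toy newsvendor loss with exponential demand. The bootstrap stand-in for a posterior, the cost values and the grid search are all assumptions; the paper works with a proper posterior-informed ambiguity set and a closed-form dual instead:

```python
import numpy as np

rng = np.random.default_rng(2)

# Few, noisy demand observations; bootstrap means act as a crude stand-in for a
# posterior over the exponential scale parameter.
data = rng.exponential(scale=80.0, size=15)
scales = np.array([rng.choice(data, size=len(data)).mean() for _ in range(200)])

def expected_cost(order, scale, b=4.0, h=1.0):
    # Closed form for exponential demand: E[(D-o)^+] = scale * exp(-o/scale),
    # E[(o-D)^+] = o - scale * (1 - exp(-o/scale)).
    shortfall = scale * np.exp(-order / scale)
    excess = order - scale * (1.0 - np.exp(-order / scale))
    return b * shortfall + h * excess

orders = np.linspace(0.0, 400.0, 401)
costs = np.array([[expected_cost(o, s) for s in scales] for o in orders])
worst_case = costs.max(axis=1)     # DRO-style objective over the sampled set
bayes_avg = costs.mean(axis=1)     # plain Bayes risk, for comparison
print("min-max order:", orders[worst_case.argmin()],
      "Bayes-risk order:", orders[bayes_avg.argmin()])
```

The min-max order is typically more conservative than the Bayes-risk order, which is the qualitative effect the DRO formulation is after.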
We show that our\nmethod admits a closed-form dual representation for many exponential family\nmembers and showcase its improved out-of-sample robustness against existing\nBayesian DRO methodology in the Newsvendor problem.\n","authors":["Charita Dellaporta","Patrick O'Hara","Theodoros Damoulas"],"pdf_url":"https://arxiv.org/pdf/2409.03492v1.pdf","comment":"13 pages, 3 figures. Under review"},{"id":"http://arxiv.org/abs/2409.03489v1","updated":"2024-09-05T12:56:39Z","published":"2024-09-05T12:56:39Z","title":"Sparsifying Parametric Models with L0 Regularization","summary":" This document contains an educational introduction to the problem of\nsparsifying parametric models with L0 regularization. We utilize this approach\ntogether with dictionary learning to learn sparse polynomial policies for deep\nreinforcement learning to control parametric partial differential equations.\nThe code and a tutorial are provided here:\nhttps://github.com/nicob15/Sparsifying-Parametric-Models-with-L0.\n","authors":["Nicolò Botteghi","Urban Fasel"],"pdf_url":"https://arxiv.org/pdf/2409.03489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03478v1","updated":"2024-09-05T12:38:13Z","published":"2024-09-05T12:38:13Z","title":"LLM-based event abstraction and integration for IoT-sourced logs","summary":" The continuous flow of data collected by Internet of Things (IoT) devices,\nhas revolutionised our ability to understand and interact with the world across\nvarious applications. However, this data must be prepared and transformed into\nevent data before analysis can begin. In this paper, we shed light on the\npotential of leveraging Large Language Models (LLMs) in event abstraction and\nintegration. Our approach aims to create event records from raw sensor readings\nand merge the logs from multiple IoT sources into a single event log suitable\nfor further Process Mining applications. We demonstrate the capabilities of\nLLMs in event abstraction considering a case study for IoT application in\nelderly care and longitudinal health monitoring. The results, showing on\naverage an accuracy of 90% in detecting high-level activities. These results\nhighlight LLMs' promising potential in addressing event abstraction and\nintegration challenges, effectively bridging the existing gap.\n","authors":["Mohsen Shirali","Mohammadreza Fani Sani","Zahra Ahmadi","Estefania Serral"],"pdf_url":"https://arxiv.org/pdf/2409.03478v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2409.03470v1","updated":"2024-09-05T12:31:51Z","published":"2024-09-05T12:31:51Z","title":"Improving Uncertainty-Error Correspondence in Deep Bayesian Medical\n Image Segmentation","summary":" Increased usage of automated tools like deep learning in medical image\nsegmentation has alleviated the bottleneck of manual contouring. This has\nshifted manual labour to quality assessment (QA) of automated contours which\ninvolves detecting errors and correcting them. A potential solution to\nsemi-automated QA is to use deep Bayesian uncertainty to recommend potentially\nerroneous regions, thus reducing time spent on error detection. Previous work\nhas investigated the correspondence between uncertainty and error, however, no\nwork has been done on improving the \"utility\" of Bayesian uncertainty maps such\nthat it is only present in inaccurate regions and not in the accurate ones. Our\nwork trains the FlipOut model with the Accuracy-vs-Uncertainty (AvU) loss which\npromotes uncertainty to be present only in inaccurate regions. 
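The AvU loss referenced above is built around the accuracy-versus-uncertainty contingency table, whose hard-count score is AvU = (n_AC + n_IU) / (n_AC + n_AU + n_IC + n_IU). A small sketch of that score follows; the training loss is a differentiable relaxation of this count, and the uncertainty threshold here is an arbitrary choice:

```python
import numpy as np

def avu_score(correct, uncertainty, u_threshold):
    """Accuracy-vs-Uncertainty score: fraction of predictions that are either
    accurate-and-certain or inaccurate-and-uncertain.

    `correct`: boolean array (prediction == label); `uncertainty`: e.g. predictive
    entropy per voxel or sample; `u_threshold`: split between certain and uncertain."""
    correct = np.asarray(correct, dtype=bool)
    uncertain = np.asarray(uncertainty) > u_threshold
    n_ac = np.sum(correct & ~uncertain)     # accurate and certain
    n_au = np.sum(correct & uncertain)      # accurate but uncertain
    n_ic = np.sum(~correct & ~uncertain)    # inaccurate yet certain (the bad case)
    n_iu = np.sum(~correct & uncertain)     # inaccurate and uncertain
    return (n_ac + n_iu) / max(n_ac + n_au + n_ic + n_iu, 1)

# Toy check: well-behaved uncertainty gives AvU close to 1.
correct = np.array([1, 1, 1, 0, 0], dtype=bool)
entropy = np.array([0.1, 0.2, 0.15, 0.9, 0.8])
print(avu_score(correct, entropy, u_threshold=0.5))   # -> 1.0
```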
We apply this\nmethod on datasets of two radiotherapy body sites, c.f. head-and-neck CT and\nprostate MR scans. Uncertainty heatmaps (i.e. predictive entropy) are evaluated\nagainst voxel inaccuracies using Receiver Operating Characteristic (ROC) and\nPrecision-Recall (PR) curves. Numerical results show that when compared to the\nBayesian baseline the proposed method successfully suppresses uncertainty for\naccurate voxels, with similar presence of uncertainty for inaccurate voxels.\nCode to reproduce experiments is available at\nhttps://github.com/prerakmody/bayesuncertainty-error-correspondence\n","authors":["Prerak Mody","Nicolas F. Chaves-de-Plaza","Chinmay Rao","Eleftheria Astrenidou","Mischa de Ridder","Nienke Hoekstra","Klaus Hildebrandt","Marius Staring"],"pdf_url":"https://arxiv.org/pdf/2409.03470v1.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2024:018"},{"id":"http://arxiv.org/abs/2404.18519v3","updated":"2024-09-05T12:24:52Z","published":"2024-04-29T09:05:01Z","title":"On the Impact of Data Heterogeneity in Federated Learning Environments\n with Application to Healthcare Networks","summary":" Federated Learning (FL) allows multiple privacy-sensitive applications to\nleverage their dataset for a global model construction without any disclosure\nof the information. One of those domains is healthcare, where groups of silos\ncollaborate in order to generate a global predictor with improved accuracy and\ngeneralization. However, the inherent challenge lies in the high heterogeneity\nof medical data, necessitating sophisticated techniques for assessment and\ncompensation. This paper presents a comprehensive exploration of the\nmathematical formalization and taxonomy of heterogeneity within FL\nenvironments, focusing on the intricacies of medical data. In particular, we\naddress the evaluation and comparison of the most popular FL algorithms with\nrespect to their ability to cope with quantity-based, feature and label\ndistribution-based heterogeneity. The goal is to provide a quantitative\nevaluation of the impact of data heterogeneity in FL systems for healthcare\nnetworks as well as a guideline on FL algorithm selection. Our research extends\nbeyond existing studies by benchmarking seven of the most common FL algorithms\nagainst the unique challenges posed by medical data use cases. The paper\ntargets the prediction of the risk of stroke recurrence through a set of\ntabular clinical reports collected by different federated hospital silos: data\nheterogeneity frequently encountered in this scenario and its impact on FL\nperformance are discussed.\n","authors":["Usevalad Milasheuski","Luca Barbieri","Bernardo Camajori Tedeschini","Monica Nicoli","Stefano Savazzi"],"pdf_url":"https://arxiv.org/pdf/2404.18519v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06366v3","updated":"2024-09-05T12:24:25Z","published":"2024-03-11T01:36:37Z","title":"Finite-Time Error Analysis of Soft Q-Learning: Switching System Approach","summary":" Soft Q-learning is a variation of Q-learning designed to solve entropy\nregularized Markov decision problems where an agent aims to maximize the\nentropy regularized value function. Despite its empirical success, there have\nbeen limited theoretical studies of soft Q-learning to date. This paper aims to\noffer a novel and unified finite-time, control-theoretic analysis of soft\nQ-learning algorithms. 
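As a reference point for the soft Q-learning iterates analysed above, the sketch below performs the tabular update with the log-sum-exp (soft) backup, target = r + gamma * tau * log sum_a' exp(Q(s', a') / tau). This is a generic textbook form on a toy chain MDP, not the specific algorithms or step-size schedules studied in the paper:

```python
import numpy as np

def soft_q_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99, tau=0.5):
    """One tabular soft Q-learning step with the log-sum-exp operator.
    As tau -> 0 this recovers the standard max backup; the Boltzmann-operator
    variant would instead use sum_a' softmax(Q[s']/tau) * Q[s'] as the value."""
    soft_value = tau * np.log(np.sum(np.exp(Q[s_next] / tau)))
    Q[s, a] += alpha * (r + gamma * soft_value - Q[s, a])
    return Q

# Tiny chain MDP: move left/right, reward 1 for reaching the right end.
n_states, n_actions = 5, 2
Q = np.zeros((n_states, n_actions))
rng = np.random.default_rng(0)
s = 0
for _ in range(1000):
    a = int(rng.integers(n_actions))                 # behaviour policy: uniform
    s_next = min(max(s + (1 if a == 1 else -1), 0), n_states - 1)
    r = 1.0 if s_next == n_states - 1 else 0.0
    Q = soft_q_update(Q, s, a, r, s_next)
    s = 0 if s_next == n_states - 1 else s_next      # restart after the goal
print(Q)
```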
We focus on two types of soft Q-learning algorithms: one\nutilizing the log-sum-exp operator and the other employing the Boltzmann\noperator. By using dynamical switching system models, we derive novel\nfinite-time error bounds for both soft Q-learning algorithms. We hope that our\nanalysis will deepen the current understanding of soft Q-learning by\nestablishing connections with switching system models and may even pave the way\nfor new frameworks in the finite-time analysis of other reinforcement learning\nalgorithms.\n","authors":["Narim Jeong","Donghwan Lee"],"pdf_url":"https://arxiv.org/pdf/2403.06366v3.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2409.03466v1","updated":"2024-09-05T12:21:51Z","published":"2024-09-05T12:21:51Z","title":"Panopticon: a novel deep learning model to detect single transit events\n with no prior data filtering in PLATO light curves","summary":" To prepare for the analyses of the future PLATO light curves, we develop a\ndeep learning model, Panopticon, to detect transits in high precision\nphotometric light curves. Since PLATO's main objective is the detection of\ntemperate Earth-size planets around solar-type stars, the code is designed to\ndetect individual transit events. The filtering step, required by conventional\ndetection methods, can affect the transit, which could be an issue for long and\nshallow transits. To protect transit shape and depth, the code is also designed\nto work on unfiltered light curves. We trained the model on a set of simulated\nPLATO light curves in which we injected, at pixel level, either planetary,\neclipsing binary, or background eclipsing binary signals. We also include a\nvariety of noises in our data, such as granulation, stellar spots or cosmic\nrays. The approach is able to recover 90% of our test population, including\nmore than 25% of the Earth-analogs, even in the unfiltered light curves. The\nmodel also recovers the transits irrespective of the orbital period, and is\nable to retrieve transits on a unique event basis. These figures are obtained\nwhen accepting a false alarm rate of 1%. When keeping the false alarm rate low\n(<0.01%), it is still able to recover more than 85% of the transit signals. Any\ntransit deeper than 180ppm is essentially guaranteed to be recovered. This\nmethod is able to recover transits on a unique event basis, and does so with a\nlow false alarm rate. Thanks to light curves being one-dimensional, model\ntraining is fast, on the order of a few hours per model. This speed in training\nand inference, coupled to the recovery effectiveness and precision of the model\nmake it an ideal tool to complement, or be used ahead of, classical approaches.\n","authors":["H. G. Vivien","M. Deleuil","N. Jannsen","J. De Ridder","D. Seynaeve","M. -A. Carpine","Y. Zerah"],"pdf_url":"https://arxiv.org/pdf/2409.03466v1.pdf","comment":"Submitted to A&A"},{"id":"http://arxiv.org/abs/2409.03463v1","updated":"2024-09-05T12:19:07Z","published":"2024-09-05T12:19:07Z","title":"Characterizing Massive Activations of Attention Mechanism in Graph\n Neural Networks","summary":" Graph Neural Networks (GNNs) have become increasingly popular for effectively\nmodeling data with graph structures. Recently, attention mechanisms have been\nintegrated into GNNs to improve their ability to capture complex patterns. This\npaper presents the first comprehensive study revealing a critical, unexplored\nconsequence of this integration: the emergence of Massive Activations (MAs)\nwithin attention layers. 
We introduce a novel method for detecting and\nanalyzing MAs, focusing on edge features in different graph transformer\narchitectures. Our study assesses various GNN models using benchmark datasets,\nincluding ZINC, TOX21, and PROTEINS. Key contributions include (1) establishing\nthe direct link between attention mechanisms and MAs generation in GNNs, (2)\ndeveloping a robust definition and detection method for MAs based on activation\nratio distributions, (3) introducing the Explicit Bias Term (EBT) as a\npotential countermeasure and exploring it as an adversarial framework to assess\nmodels robustness based on the presence or absence of MAs. Our findings\nhighlight the prevalence and impact of attention-induced MAs across different\narchitectures, such as GraphTransformer, GraphiT, and SAN. The study reveals\nthe complex interplay between attention mechanisms, model architecture, dataset\ncharacteristics, and MAs emergence, providing crucial insights for developing\nmore robust and reliable graph models.\n","authors":["Lorenzo Bini","Marco Sorbi","Stephane Marchand-Maillet"],"pdf_url":"https://arxiv.org/pdf/2409.03463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12193v2","updated":"2024-09-05T12:10:06Z","published":"2023-01-28T13:28:34Z","title":"CyclicFL: A Cyclic Model Pre-Training Approach to Efficient Federated\n Learning","summary":" Federated learning (FL) has been proposed to enable distributed learning on\nArtificial Intelligence Internet of Things (AIoT) devices with guarantees of\nhigh-level data privacy. Since random initial models in FL can easily result in\nunregulated Stochastic Gradient Descent (SGD) processes, existing FL methods\ngreatly suffer from both slow convergence and poor accuracy, especially in\nnon-IID scenarios. To address this problem, we propose a novel method named\nCyclicFL, which can quickly derive effective initial models to guide the SGD\nprocesses, thus improving the overall FL training performance. We formally\nanalyze the significance of data consistency between the pre-training and\ntraining stages of CyclicFL, showing the limited Lipschitzness of loss for the\npre-trained models by CyclicFL. Moreover, we systematically prove that our\nmethod can achieve faster convergence speed under various convexity\nassumptions. Unlike traditional centralized pre-training methods that require\npublic proxy data, CyclicFL pre-trains initial models on selected AIoT devices\ncyclically without exposing their local data. Therefore, they can be easily\nintegrated into any security-critical FL methods. Comprehensive experimental\nresults show that CyclicFL can not only improve the maximum classification\naccuracy by up to $14.11\\%$ but also significantly accelerate the overall FL\ntraining process.\n","authors":["Pengyu Zhang","Yingbo Zhou","Ming Hu","Xian Wei","Mingsong Chen"],"pdf_url":"https://arxiv.org/pdf/2301.12193v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08762v3","updated":"2024-09-05T11:35:35Z","published":"2024-07-09T19:31:49Z","title":"Commute-Time-Optimised Graphs for GNNs","summary":" We explore graph rewiring methods that optimise commute time. Recent graph\nrewiring approaches facilitate long-range interactions in sparse graphs, making\nsuch rewirings commute-time-optimal on average. However, when an expert prior\nexists on which node pairs should or should not interact, a superior rewiring\nwould favour short commute times between these privileged node pairs. 
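Commute times such as those discussed above can be computed from the pseudoinverse of the graph Laplacian via the standard identity C(u, v) = vol(G) * (L+_uu + L+_vv - 2 * L+_uv). The sketch below evaluates this on a small cycle graph; how a rewiring method then trades off commute times between privileged node pairs is the paper's own design and is not reproduced here:

```python
import numpy as np

def commute_times(adj):
    """All-pairs commute times of an undirected graph from the Laplacian
    pseudoinverse: C(u, v) = vol(G) * (L+_uu + L+_vv - 2 * L+_uv)."""
    A = np.asarray(adj, dtype=float)
    L = np.diag(A.sum(axis=1)) - A
    Lp = np.linalg.pinv(L)
    vol = A.sum()                          # sum of degrees (2 * #edges if unweighted)
    d = np.diag(Lp)
    return vol * (d[:, None] + d[None, :] - 2.0 * Lp)

# 4-cycle: opposite nodes have the largest commute time.
cycle = np.array([[0, 1, 0, 1],
                  [1, 0, 1, 0],
                  [0, 1, 0, 1],
                  [1, 0, 1, 0]], dtype=float)
print(commute_times(cycle).round(2))
```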
We\nconstruct two synthetic datasets with known priors reflecting realistic\nsettings, and use these to motivate two bespoke rewiring methods that\nincorporate the known prior. We investigate the regimes where our rewiring\nimproves test performance on the synthetic datasets. Finally, we perform a case\nstudy on a real-world citation graph to investigate the practical implications\nof our work.\n","authors":["Igor Sterner","Shiye Su","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2407.08762v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01100v2","updated":"2024-09-05T11:30:25Z","published":"2024-04-01T13:13:25Z","title":"Finite Sample Frequency Domain Identification","summary":" We study non-parametric frequency-domain system identification from a\nfinite-sample perspective. We assume an open loop scenario where the excitation\ninput is periodic and consider the Empirical Transfer Function Estimate (ETFE),\nwhere the goal is to estimate the frequency response at certain desired\n(evenly-spaced) frequencies, given input-output samples. We show that under\nsub-Gaussian colored noise (in time-domain) and stability assumptions, the ETFE\nestimates are concentrated around the true values. The error rate is of the\norder of\n$\\mathcal{O}((d_{\\mathrm{u}}+\\sqrt{d_{\\mathrm{u}}d_{\\mathrm{y}}})\\sqrt{M/N_{\\mathrm{tot}}})$,\nwhere $N_{\\mathrm{tot}}$ is the total number of samples, $M$ is the number of\ndesired frequencies, and $d_{\\mathrm{u}},\\,d_{\\mathrm{y}}$ are the dimensions\nof the input and output signals respectively. This rate remains valid for\ngeneral irrational transfer functions and does not require a finite order\nstate-space representation. By tuning $M$, we obtain a\n$N_{\\mathrm{tot}}^{-1/3}$ finite-sample rate for learning the frequency\nresponse over all frequencies in the $ \\mathcal{H}_{\\infty}$ norm. Our result\ndraws upon an extension of the Hanson-Wright inequality to semi-infinite\nmatrices. We study the finite-sample behavior of ETFE in simulations.\n","authors":["Anastasios Tsiamis","Mohamed Abdalmoaty","Roy S. Smith","John Lygeros"],"pdf_url":"https://arxiv.org/pdf/2404.01100v2.pdf","comment":"Version 2 changes: several typos were fixed and some proof steps were\n expanded"},{"id":"http://arxiv.org/abs/2209.06388v4","updated":"2024-09-05T11:19:28Z","published":"2022-09-14T03:02:22Z","title":"TSFool: Crafting Highly-Imperceptible Adversarial Time Series through\n Multi-Objective Attack","summary":" Recent years have witnessed the success of recurrent neural network (RNN)\nmodels in time series classification (TSC). However, neural networks (NNs) are\nvulnerable to adversarial samples, which cause real-life adversarial attacks\nthat undermine the robustness of AI models. To date, most existing attacks\ntarget at feed-forward NNs and image recognition tasks, but they cannot perform\nwell on RNN-based TSC. This is due to the cyclical computation of RNN, which\nprevents direct model differentiation. In addition, the high visual sensitivity\nof time series to perturbations also poses challenges to local objective\noptimization of adversarial samples. In this paper, we propose an efficient\nmethod called TSFool to craft highly-imperceptible adversarial time series for\nRNN-based TSC. The core idea is a new global optimization objective known as\n\"Camouflage Coefficient\" that captures the imperceptibility of adversarial\nsamples from the class distribution. 
Based on this, we reduce the adversarial\nattack problem to a multi-objective optimization problem that enhances the\nperturbation quality. Furthermore, to speed up the optimization process, we\npropose to use a representation model for RNN to capture deeply embedded\nvulnerable samples whose features deviate from the latent manifold. Experiments\non 11 UCR and UEA datasets showcase that TSFool significantly outperforms six\nwhite-box and three black-box benchmark attacks in terms of effectiveness,\nefficiency and imperceptibility from various perspectives including standard\nmeasure, human study and real-world defense.\n","authors":["Yanyun Wang","Dehui Du","Haibo Hu","Zi Liang","Yuanhao Liu"],"pdf_url":"https://arxiv.org/pdf/2209.06388v4.pdf","comment":"27th European Conference on Artificial Intelligence (ECAI'24)"},{"id":"http://arxiv.org/abs/2405.16581v2","updated":"2024-09-05T11:15:48Z","published":"2024-05-26T14:18:38Z","title":"On Bits and Bandits: Quantifying the Regret-Information Trade-off","summary":" In interactive decision-making tasks, information can be acquired by direct\ninteractions, through receiving indirect feedback, and from external\nknowledgeable sources. We examine the trade-off between the information an\nagent accumulates and the regret it suffers. We show that information from\nexternal sources, measured in bits, can be traded off for regret, measured in\nreward. We invoke information-theoretic methods for obtaining regret lower\nbounds, that also allow us to easily re-derive several known lower bounds. We\nthen generalize a variety of interactive decision-making tasks with external\ninformation to a new setting. Using this setting, we introduce the first\nBayesian regret lower bounds that depend on the information an agent\naccumulates. These lower bounds also prove the near-optimality of Thompson\nsampling for Bayesian problems. Finally, we demonstrate the utility of these\nbounds in improving the performance of a question-answering task with large\nlanguage models, allowing us to obtain valuable insights.\n","authors":["Itai Shufaro","Nadav Merlis","Nir Weinberger","Shie Mannor"],"pdf_url":"https://arxiv.org/pdf/2405.16581v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02139v2","updated":"2024-09-05T11:09:38Z","published":"2024-09-02T19:12:54Z","title":"The Role of Transformer Models in Advancing Blockchain Technology: A\n Systematic Survey","summary":" As blockchain technology rapidly evolves, the demand for enhanced efficiency,\nsecurity, and scalability grows.Transformer models, as powerful deep learning\narchitectures,have shown unprecedented potential in addressing various\nblockchain challenges. However, a systematic review of Transformer applications\nin blockchain is lacking. This paper aims to fill this research gap by\nsurveying over 200 relevant papers, comprehensively reviewing practical cases\nand research progress of Transformers in blockchain applications. Our survey\ncovers key areas including anomaly detection, smart contract security analysis,\ncryptocurrency prediction and trend analysis, and code summary generation. To\nclearly articulate the advancements of Transformers across various blockchain\ndomains, we adopt a domain-oriented classification system, organizing and\nintroducing representative methods based on major challenges in current\nblockchain research. 
For each research domain,we first introduce its background\nand objectives, then review previous representative methods and analyze their\nlimitations,and finally introduce the advancements brought by Transformer\nmodels. Furthermore, we explore the challenges of utilizing Transformer, such\nas data privacy, model complexity, and real-time processing requirements.\nFinally, this article proposes future research directions, emphasizing the\nimportance of exploring the Transformer architecture in depth to adapt it to\nspecific blockchain applications, and discusses its potential role in promoting\nthe development of blockchain technology. This review aims to provide new\nperspectives and a research foundation for the integrated development of\nblockchain technology and machine learning, supporting further innovation and\napplication expansion of blockchain technology.\n","authors":["Tianxu Liu","Yanbin Wang","Jianguo Sun","Ye Tian","Yanyu Huang","Tao Xue","Peiyue Li","Yiwei Liu"],"pdf_url":"https://arxiv.org/pdf/2409.02139v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03377v1","updated":"2024-09-05T09:28:56Z","published":"2024-09-05T09:28:56Z","title":"Raw Speech Enhancement with Deep State Space Modeling","summary":" We present aTENNuate, a simple deep state-space autoencoder configured for\nefficient online raw speech enhancement in an end-to-end fashion. The network's\nperformance is primarily evaluated on raw speech denoising, with additional\nassessments on tasks such as super-resolution and de-quantization. We benchmark\naTENNuate on the VoiceBank + DEMAND and the Microsoft DNS1 synthetic test sets.\nThe network outperforms previous real-time denoising models in terms of PESQ\nscore, parameter count, MACs, and latency. Even as a raw waveform processing\nmodel, the model maintains high fidelity to the clean signal with minimal\naudible artifacts. In addition, the model remains performant even when the\nnoisy input is compressed down to 4000Hz and 4 bits, suggesting general speech\nenhancement capabilities in low-resource environments.\n","authors":["Yan Ru Pei","Ritik Shrivastava","FNU Sidharth"],"pdf_url":"https://arxiv.org/pdf/2409.03377v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.03375v1","updated":"2024-09-05T09:27:05Z","published":"2024-09-05T09:27:05Z","title":"Leveraging Large Language Models through Natural Language Processing to\n provide interpretable Machine Learning predictions of mental deterioration in\n real time","summary":" Based on official estimates, 50 million people worldwide are affected by\ndementia, and this number increases by 10 million new patients every year.\nWithout a cure, clinical prognostication and early intervention represent the\nmost effective ways to delay its progression. To this end, Artificial\nIntelligence and computational linguistics can be exploited for natural\nlanguage analysis, personalized assessment, monitoring, and treatment. However,\ntraditional approaches need more semantic knowledge management and\nexplicability capabilities. Moreover, using Large Language Models (LLMs) for\ncognitive decline diagnosis is still scarce, even though these models represent\nthe most advanced way for clinical-patient communication using intelligent\nsystems. 
Consequently, we leverage an LLM using the latest Natural Language\nProcessing (NLP) techniques in a chatbot solution to provide interpretable\nMachine Learning prediction of cognitive decline in real-time.\nLinguistic-conceptual features are exploited for appropriate natural language\nanalysis. Through explainability, we aim to fight potential biases of the\nmodels and improve their potential to help clinical workers in their diagnosis\ndecisions. More in detail, the proposed pipeline is composed of (i) data\nextraction employing NLP-based prompt engineering; (ii) stream-based data\nprocessing including feature engineering, analysis, and selection; (iii)\nreal-time classification; and (iv) the explainability dashboard to provide\nvisual and natural language descriptions of the prediction outcome.\nClassification results exceed 80 % in all evaluation metrics, with a recall\nvalue for the mental deterioration class about 85 %. To sum up, we contribute\nwith an affordable, flexible, non-invasive, personalized diagnostic system to\nthis work.\n","authors":["Francisco de Arriba-Pérez","Silvia García-Méndez"],"pdf_url":"https://arxiv.org/pdf/2409.03375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03365v1","updated":"2024-09-05T09:10:40Z","published":"2024-09-05T09:10:40Z","title":"Efficient Multi-Task Large Model Training via Data Heterogeneity-aware\n Model Management","summary":" Recent foundation models are capable of handling multiple machine learning\n(ML) tasks and multiple data modalities with the unified base model structure\nand several specialized model components. However, the development of such\nmulti-task (MT) multi-modal (MM) models poses significant model management\nchallenges to existing training systems. Due to the sophisticated model\narchitecture and the heterogeneous workloads of different ML tasks and data\nmodalities, training these models usually requires massive GPU resources and\nsuffers from sub-optimal system efficiency.\n In this paper, we investigate how to achieve high-performance training of\nlarge-scale MT MM models through data heterogeneity-aware model management\noptimization. The key idea is to decompose the model execution into stages and\naddress the joint optimization problem sequentially, including both\nheterogeneity-aware workload parallelization and dependency-driven execution\nscheduling. Based on this, we build a prototype system and evaluate it on\nvarious large MT MM models. Experiments demonstrate the superior performance\nand efficiency of our system, with speedup ratio up to 71% compared to\nstate-of-the-art training systems.\n","authors":["Yujie Wang","Shenhan Zhu","Fangcheng Fu","Xupeng Miao","Jie Zhang","Juan Zhu","Fan Hong","Yong Li","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2409.03365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15332v3","updated":"2024-09-05T09:09:55Z","published":"2024-05-24T08:13:16Z","title":"Cross-Validated Off-Policy Evaluation","summary":" In this paper, we study the problem of estimator selection and\nhyper-parameter tuning in off-policy evaluation. Although cross-validation is\nthe most popular method for model selection in supervised learning, off-policy\nevaluation relies mostly on theory-based approaches, which provide only limited\nguidance to practitioners. We show how to use cross-validation for off-policy\nevaluation. This challenges a popular belief that cross-validation in\noff-policy evaluation is not feasible. 
We evaluate our method empirically and\nshow that it addresses a variety of use cases.\n","authors":["Matej Cief","Branislav Kveton","Michal Kompan"],"pdf_url":"https://arxiv.org/pdf/2405.15332v3.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.01444v2","updated":"2024-09-05T09:05:54Z","published":"2024-09-02T20:00:45Z","title":"A causal viewpoint on prediction model performance under changes in\n case-mix: discrimination and calibration respond differently for prognosis\n and diagnosis predictions","summary":" Prediction models inform important clinical decisions, aiding in diagnosis,\nprognosis, and treatment planning. The predictive performance of these models\nis typically assessed through discrimination and calibration. However, changes\nin the distribution of the data impact model performance. In health-care, a\ntypical change is a shift in case-mix: for example, for cardiovascular risk\nmanagement, a general practitioner sees a different mix of patients than a\nspecialist in a tertiary hospital.\n This work introduces a novel framework that differentiates the effects of\ncase-mix shifts on discrimination and calibration based on the causal direction\nof the prediction task. When prediction is in the causal direction (often the\ncase for prognosis predictions), calibration remains stable under case-mix\nshifts, while discrimination does not. Conversely, when predicting in the\nanti-causal direction (often with diagnosis predictions), discrimination\nremains stable, but calibration does not.\n A simulation study and empirical validation using cardiovascular disease\nprediction models demonstrate the implications of this framework. This\nframework provides critical insights for evaluating and deploying prediction\nmodels across different clinical settings, emphasizing the importance of\nunderstanding the causal structure of the prediction task.\n","authors":["Wouter A. C. van Amsterdam"],"pdf_url":"https://arxiv.org/pdf/2409.01444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03358v1","updated":"2024-09-05T09:01:11Z","published":"2024-09-05T09:01:11Z","title":"MouseSIS: A Frames-and-Events Dataset for Space-Time Instance\n Segmentation of Mice","summary":" Enabled by large annotated datasets, tracking and segmentation of objects in\nvideos has made remarkable progress in recent years. Despite these\nadvancements, algorithms still struggle under degraded conditions and during\nfast movements. Event cameras are novel sensors with high temporal resolution\nand high dynamic range that offer promising advantages to address these\nchallenges. However, annotated data for developing learning-based mask-level\ntracking algorithms with events is not available. To this end, we introduce:\n($i$) a new task termed space-time instance segmentation, similar to\nvideo instance segmentation, whose goal is to segment instances throughout the\nentire duration of the sensor input (here, the input are quasi-continuous\nevents and optionally aligned frames); and ($ii$) MouseSIS, a dataset for\nthe new task, containing aligned grayscale frames and events. It includes\nannotated ground-truth labels (pixel-level instance segmentation masks) of a\ngroup of up to seven freely moving and interacting mice. We also provide two\nreference methods, which show that leveraging event data can consistently\nimprove tracking performance, especially when used in combination with\nconventional cameras. 
The results highlight the potential of event-aided\ntracking in difficult scenarios. We hope our dataset opens the field of\nevent-based video instance segmentation and enables the development of robust\ntracking algorithms for challenging\nconditions. https://github.com/tub-rip/MouseSIS\n","authors":["Friedhelm Hamann","Hanxiong Li","Paul Mieske","Lars Lewejohann","Guillermo Gallego"],"pdf_url":"https://arxiv.org/pdf/2409.03358v1.pdf","comment":"18 pages, 5 figures, ECCV Workshops"},{"id":"http://arxiv.org/abs/2408.14762v2","updated":"2024-09-05T08:30:01Z","published":"2024-08-27T03:30:01Z","title":"Explainable Hierarchical Urban Representation Learning for Commuting\n Flow Prediction","summary":" Commuting flow prediction is an essential task for municipal operations in\nthe real world. Previous studies have revealed that it is feasible to estimate\nthe commuting origin-destination (OD) demand within a city using multiple\nauxiliary data. However, most existing methods are not suitable to deal with a\nsimilar task at a large scale, namely within a prefecture or the whole nation,\nowing to the increased number of geographical units that need to be maintained.\nIn addition, region representation learning is a universal approach for gaining\nurban knowledge for diverse metropolitan downstream tasks. Although many\nresearchers have developed comprehensive frameworks to describe urban units\nfrom multi-source data, they have not clarified the relationship between the\nselected geographical elements. Furthermore, metropolitan areas naturally\npreserve ranked structures, like cities and their inclusive districts, which\nmakes elucidating relations between cross-level urban units necessary.\nTherefore, we develop a heterogeneous graph-based model to generate meaningful\nregion embeddings at multiple spatial resolutions for predicting different\ntypes of inter-level OD flows. To demonstrate the effectiveness of the proposed\nmethod, extensive experiments were conducted using real-world aggregated mobile\nphone datasets collected from Shizuoka Prefecture, Japan. The results indicate\nthat our proposed model outperforms existing models in terms of a uniform urban\nstructure. We extend the understanding of predicted results using reasonable\nexplanations to enhance the credibility of the model.\n","authors":["Mingfei Cai","Yanbo Pang","Yoshihide Sekimoto"],"pdf_url":"https://arxiv.org/pdf/2408.14762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03335v1","updated":"2024-09-05T08:21:05Z","published":"2024-09-05T08:21:05Z","title":"Semi-Supervised Sparse Gaussian Classification: Provable Benefits of\n Unlabeled Data","summary":" The premise of semi-supervised learning (SSL) is that combining labeled and\nunlabeled data yields significantly more accurate models. Despite empirical\nsuccesses, the theoretical understanding of SSL is still far from complete. In\nthis work, we study SSL for high dimensional sparse Gaussian classification. To\nconstruct an accurate classifier a key task is feature selection, detecting the\nfew variables that separate the two classes. For this SSL setting, we analyze\ninformation theoretic lower bounds for accurate feature selection as well as\ncomputational lower bounds, assuming the low-degree likelihood hardness\nconjecture. 
Our key contribution is the identification of a regime in the\nproblem parameters (dimension, sparsity, number of labeled and unlabeled\nsamples) where SSL is guaranteed to be advantageous for classification.\nSpecifically, there is a regime where it is possible to construct in polynomial\ntime an accurate SSL classifier. However, any computationally efficient\nsupervised or unsupervised learning schemes, that separately use only the\nlabeled or unlabeled data would fail. Our work highlights the provable benefits\nof combining labeled and unlabeled data for classification and feature\nselection in high dimensions. We present simulations that complement our\ntheoretical analysis.\n","authors":["Eyar Azar","Boaz Nadler"],"pdf_url":"https://arxiv.org/pdf/2409.03335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00858v2","updated":"2024-09-05T08:07:27Z","published":"2024-09-01T22:20:32Z","title":"Trustworthy Human-AI Collaboration: Reinforcement Learning with Human\n Feedback and Physics Knowledge for Safe Autonomous Driving","summary":" In the field of autonomous driving, developing safe and trustworthy\nautonomous driving policies remains a significant challenge. Recently,\nReinforcement Learning with Human Feedback (RLHF) has attracted substantial\nattention due to its potential to enhance training safety and sampling\nefficiency. Nevertheless, existing RLHF-enabled methods often falter when faced\nwith imperfect human demonstrations, potentially leading to training\noscillations or even worse performance than rule-based approaches. Inspired by\nthe human learning process, we propose Physics-enhanced Reinforcement Learning\nwith Human Feedback (PE-RLHF). This novel framework synergistically integrates\nhuman feedback (e.g., human intervention and demonstration) and physics\nknowledge (e.g., traffic flow model) into the training loop of reinforcement\nlearning. The key advantage of PE-RLHF is its guarantee that the learned policy\nwill perform at least as well as the given physics-based policy, even when\nhuman feedback quality deteriorates, thus ensuring trustworthy safety\nimprovements. PE-RLHF introduces a Physics-enhanced Human-AI (PE-HAI)\ncollaborative paradigm for dynamic action selection between human and\nphysics-based actions, employs a reward-free approach with a proxy value\nfunction to capture human preferences, and incorporates a minimal intervention\nmechanism to reduce the cognitive load on human mentors. Extensive experiments\nacross diverse driving scenarios demonstrate that PE-RLHF significantly\noutperforms traditional methods, achieving state-of-the-art (SOTA) performance\nin safety, efficiency, and generalizability, even with varying quality of human\nfeedback. The philosophy behind PE-RLHF not only advances autonomous driving\ntechnology but can also offer valuable insights for other safety-critical\ndomains. Demo video and code are available at:\nhttps://zilin-huang.github.io/PE-RLHF-website/\n","authors":["Zilin Huang","Zihao Sheng","Sikai Chen"],"pdf_url":"https://arxiv.org/pdf/2409.00858v2.pdf","comment":"33 pages, 20 figures"},{"id":"http://arxiv.org/abs/2311.15327v4","updated":"2024-09-05T07:27:57Z","published":"2023-11-26T15:11:17Z","title":"FRAC-Q-Learning: A Reinforcement Learning with Boredom Avoidance\n Processes for Social Robots","summary":" The reinforcement learning algorithms have often been applied to social\nrobots. 
However, most reinforcement learning algorithms were not optimized for\nthe use of social robots, and consequently they may bore users. We proposed a\nnew reinforcement learning method specialized for the social robot, the\nFRAC-Q-learning, that can avoid user boredom. The proposed algorithm consists\nof a forgetting process in addition to randomizing and categorizing processes.\nThis study evaluated interest and boredom hardness scores of the\nFRAC-Q-learning by a comparison with the traditional Q-learning. The\nFRAC-Q-learning showed significantly higher trend of interest score, and\nindicated significantly harder to bore users compared to the traditional\nQ-learning. Therefore, the FRAC-Q-learning can contribute to develop a social\nrobot that will not bore users. The proposed algorithm has a potential to apply\nfor Web-based communication and educational systems. This paper presents the\nentire process, detailed implementation and a detailed evaluation method of the\nFRAC-Q-learning for the first time.\n","authors":["Akinari Onishi"],"pdf_url":"https://arxiv.org/pdf/2311.15327v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03306v1","updated":"2024-09-05T07:22:19Z","published":"2024-09-05T07:22:19Z","title":"Towards training digitally-tied analog blocks via hybrid gradient\n computation","summary":" Power efficiency is plateauing in the standard digital electronics realm such\nthat novel hardware, models, and algorithms are needed to reduce the costs of\nAI training. The combination of energy-based analog circuits and the\nEquilibrium Propagation (EP) algorithm constitutes one compelling alternative\ncompute paradigm for gradient-based optimization of neural nets. Existing\nanalog hardware accelerators, however, typically incorporate digital circuitry\nto sustain auxiliary non-weight-stationary operations, mitigate analog device\nimperfections, and leverage existing digital accelerators. This heterogeneous\nhardware approach calls for a new theoretical model building block. In this\nwork, we introduce Feedforward-tied Energy-based Models (ff-EBMs), a hybrid\nmodel comprising feedforward and energy-based blocks accounting for digital and\nanalog circuits. We derive a novel algorithm to compute gradients end-to-end in\nff-EBMs by backpropagating and \"eq-propagating\" through feedforward and\nenergy-based parts respectively, enabling EP to be applied to much more\nflexible and realistic architectures. We experimentally demonstrate the\neffectiveness of the proposed approach on ff-EBMs where Deep Hopfield Networks\n(DHNs) are used as energy-based blocks. We first show that a standard DHN can\nbe arbitrarily split into any uniform size while maintaining performance. We\nthen train ff-EBMs on ImageNet32 where we establish new SOTA performance in the\nEP literature (46 top-1 %). Our approach offers a principled, scalable, and\nincremental roadmap to gradually integrate self-trainable analog computational\nprimitives into existing digital accelerators.\n","authors":["Timothy Nest","Maxence Ernoult"],"pdf_url":"https://arxiv.org/pdf/2409.03306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03303v1","updated":"2024-09-05T07:19:03Z","published":"2024-09-05T07:19:03Z","title":"Improving Robustness to Multiple Spurious Correlations by\n Multi-Objective Optimization","summary":" We study the problem of training an unbiased and accurate model given a\ndataset with multiple biases. 
This problem is challenging since the multiple\nbiases cause multiple undesirable shortcuts during training, and even worse,\nmitigating one may exacerbate the other. We propose a novel training method to\ntackle this challenge. Our method first groups training data so that different\ngroups induce different shortcuts, and then optimizes a linear combination of\ngroup-wise losses while adjusting their weights dynamically to alleviate\nconflicts between the groups in performance; this approach, rooted in the\nmulti-objective optimization theory, encourages to achieve the minimax Pareto\nsolution. We also present a new benchmark with multiple biases, dubbed\nMultiCelebA, for evaluating debiased training methods under realistic and\nchallenging scenarios. Our method achieved the best on three datasets with\nmultiple biases, and also showed superior performance on conventional\nsingle-bias datasets.\n","authors":["Nayeong Kim","Juwon Kang","Sungsoo Ahn","Jungseul Ok","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2409.03303v1.pdf","comment":"International Conference on Machine Learning 2024"},{"id":"http://arxiv.org/abs/2409.03302v1","updated":"2024-09-05T07:18:09Z","published":"2024-09-05T07:18:09Z","title":"Fourier Neural Operators for Learning Dynamics in Quantum Spin Systems","summary":" Fourier Neural Operators (FNOs) excel on tasks using functional data, such as\nthose originating from partial differential equations. Such characteristics\nrender them an effective approach for simulating the time evolution of quantum\nwavefunctions, which is a computationally challenging, yet coveted task for\nunderstanding quantum systems. In this manuscript, we use FNOs to model the\nevolution of random quantum spin systems, so chosen due to their representative\nquantum dynamics and minimal symmetry. We explore two distinct FNO\narchitectures and examine their performance for learning and predicting time\nevolution using both random and low-energy input states. Additionally, we apply\nFNOs to a compact set of Hamiltonian observables ($\\sim\\text{poly}(n)$) instead\nof the entire $2^n$ quantum wavefunction, which greatly reduces the size of our\ninputs and outputs and, consequently, the requisite dimensions of the resulting\nFNOs. Moreover, this Hamiltonian observable-based method demonstrates that FNOs\ncan effectively distill information from high-dimensional spaces into\nlower-dimensional spaces. The extrapolation of Hamiltonian observables to times\nlater than those used in training is of particular interest, as this stands to\nfundamentally increase the simulatability of quantum systems past both the\ncoherence times of contemporary quantum architectures and the circuit-depths of\ntractable tensor networks.\n","authors":["Freya Shah","Taylor L. Patti","Julius Berner","Bahareh Tolooshams","Jean Kossaifi","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2409.03302v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.03301v1","updated":"2024-09-05T07:14:03Z","published":"2024-09-05T07:14:03Z","title":"ELO-Rated Sequence Rewards: Advancing Reinforcement Learning Models","summary":" Reinforcement Learning (RL) is highly dependent on the meticulous design of\nthe reward function. However, accurately assigning rewards to each state-action\npair in Long-Term RL (LTRL) challenges is formidable. Consequently, RL agents\nare predominantly trained with expert guidance. 
Drawing on the principles of\nordinal utility theory from economics, we propose a novel reward estimation\nalgorithm: ELO-Rating based RL (ERRL). This approach is distinguished by two\nmain features. Firstly, it leverages expert preferences over trajectories\ninstead of cardinal rewards (utilities) to compute the ELO rating of each\ntrajectory as its reward. Secondly, a new reward redistribution algorithm is\nintroduced to mitigate training volatility in the absence of a fixed anchor\nreward. Our method demonstrates superior performance over several leading\nbaselines in long-term scenarios (extending up to 5000 steps), where\nconventional RL algorithms falter. Furthermore, we conduct a thorough analysis\nof how expert preferences affect the outcomes.\n","authors":["Qi Ju","Falin Hei","Zhemei Fang","Yunfeng Luo"],"pdf_url":"https://arxiv.org/pdf/2409.03301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03299v1","updated":"2024-09-05T07:09:14Z","published":"2024-09-05T07:09:14Z","title":"Bringing the RT-1-X Foundation Model to a SCARA robot","summary":" Traditional robotic systems require specific training data for each task,\nenvironment, and robot form. While recent advancements in machine learning have\nenabled models to generalize across new tasks and environments, the challenge\nof adapting these models to entirely new settings remains largely unexplored.\nThis study addresses this by investigating the generalization capabilities of\nthe RT-1-X robotic foundation model to a type of robot unseen during its\ntraining: a SCARA robot from UMI-RTX.\n Initial experiments reveal that RT-1-X does not generalize zero-shot to the\nunseen type of robot. However, fine-tuning of the RT-1-X model by demonstration\nallows the robot to learn a pickup task which was part of the foundation model\n(but learned for another type of robot). When the robot is presented with an\nobject that is included in the foundation model but not in the fine-tuning\ndataset, it demonstrates that only the skill, but not the object-specific\nknowledge, has been transferred.\n","authors":["Jonathan Salzer","Arnoud Visser"],"pdf_url":"https://arxiv.org/pdf/2409.03299v1.pdf","comment":"14 pages, submitted to the joint Artificial Intelligence & Machine\n Learning conference for Belgium, Netherlands & Luxembourg (BNAIC/BeNeLearn)"},{"id":"http://arxiv.org/abs/2205.15409v2","updated":"2024-09-05T07:00:35Z","published":"2022-05-27T07:32:33Z","title":"Painful intelligence: What AI can tell us about human suffering","summary":" This book uses the modern theory of artificial intelligence (AI) to\nunderstand human suffering or mental pain. Both humans and sophisticated AI\nagents process information about the world in order to achieve goals and obtain\nrewards, which is why AI can be used as a model of the human brain and mind.\nThis book intends to make the theory accessible to a relatively general\naudience, requiring only some relevant scientific background.\n The book starts with the assumption that suffering is mainly caused by\nfrustration. Frustration means the failure of an agent (whether AI or human) to\nachieve a goal or a reward it wanted or expected. Frustration is inevitable\nbecause of the overwhelming complexity of the world, limited computational\nresources, and scarcity of good data. 
In particular, such limitations imply\nthat an agent acting in the real world must cope with uncontrollability,\nunpredictability, and uncertainty, which all lead to frustration.\n Fundamental in such modelling is the idea of learning, or adaptation to the\nenvironment. While AI uses machine learning, humans and animals adapt by a\ncombination of evolutionary mechanisms and ordinary learning. Even frustration\nis fundamentally an error signal that the system uses for learning. This book\nexplores various aspects and limitations of learning algorithms and their\nimplications regarding suffering.\n At the end of the book, the computational theory is used to derive various\ninterventions or training methods that will reduce suffering in humans. The\namount of frustration is expressed by a simple equation which indicates how it\ncan be reduced. The ensuing interventions are very similar to those proposed by\nBuddhist and Stoic philosophy, and include mindfulness meditation. Therefore,\nthis book can be interpreted as an exposition of a computational theory\njustifying why such philosophies and meditation reduce human suffering.\n","authors":["Aapo Hyvärinen"],"pdf_url":"https://arxiv.org/pdf/2205.15409v2.pdf","comment":"Second Edition of this book with 258 pages"},{"id":"http://arxiv.org/abs/2409.03291v1","updated":"2024-09-05T06:55:13Z","published":"2024-09-05T06:55:13Z","title":"LLM Detectors Still Fall Short of Real World: Case of LLM-Generated\n Short News-Like Posts","summary":" With the emergence of widely available powerful LLMs, disinformation\ngenerated by large Language Models (LLMs) has become a major concern.\nHistorically, LLM detectors have been touted as a solution, but their\neffectiveness in the real world is still to be proven. In this paper, we focus\non an important setting in information operations -- short news-like posts\ngenerated by moderately sophisticated attackers.\n We demonstrate that existing LLM detectors, whether zero-shot or\npurpose-trained, are not ready for real-world use in that setting. All tested\nzero-shot detectors perform inconsistently with prior benchmarks and are highly\nvulnerable to sampling temperature increase, a trivial attack absent from\nrecent benchmarks. A purpose-trained detector generalizing across LLMs and\nunseen attacks can be developed, but it fails to generalize to new\nhuman-written texts.\n We argue that the former indicates domain-specific benchmarking is needed,\nwhile the latter suggests a trade-off between the adversarial evasion\nresilience and overfitting to the reference human text, with both needing\nevaluation in benchmarks and currently absent. We believe this suggests a\nre-consideration of current LLM detector benchmarking approaches and provides a\ndynamically extensible benchmark to allow it\n(https://github.com/Reliable-Information-Lab-HEVS/dynamic_llm_detector_benchmark).\n","authors":["Henrique Da Silva Gameiro","Andrei Kucharavy","Ljiljana Dolamic"],"pdf_url":"https://arxiv.org/pdf/2409.03291v1.pdf","comment":"20 pages, 7 tables, 13 figures, under consideration for EMNLP"},{"id":"http://arxiv.org/abs/2409.03282v1","updated":"2024-09-05T06:47:01Z","published":"2024-09-05T06:47:01Z","title":"Interpretable mixture of experts for time series prediction under\n recurrent and non-recurrent conditions","summary":" Non-recurrent conditions caused by incidents are different from recurrent\nconditions that follow periodic patterns. 
Existing traffic speed prediction\nstudies are incident-agnostic and use one single model to learn all possible\npatterns from these drastically diverse conditions. This study proposes a novel\nMixture of Experts (MoE) model to improve traffic speed prediction under two\nseparate conditions, recurrent and non-recurrent (i.e., with and without\nincidents). The MoE leverages separate recurrent and non-recurrent expert\nmodels (Temporal Fusion Transformers) to capture the distinct patterns of each\ntraffic condition. Additionally, we propose a training pipeline for\nnon-recurrent models to remedy the limited data issues. To train our model,\nmulti-source datasets, including traffic speed, incident reports, and weather\ndata, are integrated and processed to be informative features. Evaluations on a\nreal road network demonstrate that the MoE achieves lower errors compared to\nother benchmark algorithms. The model predictions are interpreted in terms of\ntemporal dependencies and variable importance in each condition separately to\nshed light on the differences between recurrent and non-recurrent conditions.\n","authors":["Zemian Ke","Haocheng Duan","Sean Qian"],"pdf_url":"https://arxiv.org/pdf/2409.03282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03276v1","updated":"2024-09-05T06:38:27Z","published":"2024-09-05T06:38:27Z","title":"Tensor network square root Kalman filter for online Gaussian process\n regression","summary":" The state-of-the-art tensor network Kalman filter lifts the curse of\ndimensionality for high-dimensional recursive estimation problems. However, the\nrequired rounding operation can cause filter divergence due to the loss of\npositive definiteness of covariance matrices. We solve this issue by\ndeveloping, for the first time, a tensor network square root Kalman filter, and\napply it to high-dimensional online Gaussian process regression. In our\nexperiments, we demonstrate that our method is equivalent to the conventional\nKalman filter when choosing a full-rank tensor network. Furthermore, we apply\nour method to a real-life system identification problem where we estimate\n$4^{14}$ parameters on a standard laptop. The estimated model outperforms the\nstate-of-the-art tensor network Kalman filter in terms of prediction accuracy\nand uncertainty quantification.\n","authors":["Clara Menzen","Manon Kok","Kim Batselier"],"pdf_url":"https://arxiv.org/pdf/2409.03276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03260v1","updated":"2024-09-05T05:51:42Z","published":"2024-09-05T05:51:42Z","title":"In Search of Trees: Decision-Tree Policy Synthesis for Black-Box Systems\n via Search","summary":" Decision trees, owing to their interpretability, are attractive as control\npolicies for (dynamical) systems. Unfortunately, constructing, or synthesising,\nsuch policies is a challenging task. Previous approaches do so by imitating a\nneural-network policy, approximating a tabular policy obtained via formal\nsynthesis, employing reinforcement learning, or modelling the problem as a\nmixed-integer linear program. However, these works may require access to a\nhard-to-obtain accurate policy or a formal model of the environment (within\nreach of formal synthesis), and may not provide guarantees on the quality or\nsize of the final tree policy. 
In contrast, we present an approach to\nsynthesise optimal decision-tree policies given a black-box environment and\nspecification, and a discretisation of the tree predicates, where optimality is\ndefined with respect to the number of steps to achieve the goal. Our approach\nis a specialised search algorithm which systematically explores the\n(exponentially large) space of decision trees under the given discretisation.\nThe key component is a novel pruning mechanism that significantly reduces the\nsearch space. Our approach represents a conceptually novel way of synthesising\nsmall decision-tree policies with optimality guarantees even for black-box\nenvironments with black-box specifications.\n","authors":["Emir Demirović","Christian Schilling","Anna Lukina"],"pdf_url":"https://arxiv.org/pdf/2409.03260v1.pdf","comment":"8 pages main text incl. references, 1 page appendix"},{"id":"http://arxiv.org/abs/2404.12415v2","updated":"2024-09-05T05:38:13Z","published":"2024-04-17T17:57:20Z","title":"Prediction of soil fertility parameters using USB-microscope imagery and\n portable X-ray fluorescence spectrometry","summary":" This study investigated the use of portable X-ray fluorescence (PXRF)\nspectrometry and soil image analysis for rapid soil fertility assessment, with\na focus on key indicators such as available boron (B), organic carbon (OC),\navailable manganese (Mn), available sulfur (S), and the sulfur availability\nindex (SAI). A total of 1,133 soil samples from diverse agro-climatic zones in\nEastern India were analyzed. The research integrated color and texture features\nfrom microscopic soil images, PXRF data, and auxiliary soil variables (AVs)\nusing a Random Forest model. Results showed that combining image features (IFs)\nwith AVs significantly improved prediction accuracy for available B (R2 = 0.80)\nand OC (R2 = 0.88). A data fusion approach, incorporating IFs, AVs, and PXRF\ndata, further enhanced predictions for available Mn and SAI, with R2 values of\n0.72 and 0.70, respectively. The study highlights the potential of integrating\nthese technologies to offer rapid, cost-effective soil testing methods, paving\nthe way for more advanced predictive models and a deeper understanding of soil\nfertility. Future work should explore the application of deep learning models\non a larger dataset, incorporating soils from a wider range of agro-climatic\nzones under field conditions.\n","authors":["Shubhadip Dasgupta","Satwik Pate","Divya Rathore","L. G. Divyanth","Ayan Das","Anshuman Nayak","Subhadip Dey","Asim Biswas","David C. Weindorf","Bin Li","Sergio Henrique Godinho Silva","Bruno Teixeira Ribeiro","Sanjay Srivastava","Somsubhra Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2404.12415v2.pdf","comment":"Published in 'Soil Advances'"},{"id":"http://arxiv.org/abs/2409.03253v1","updated":"2024-09-05T05:13:28Z","published":"2024-09-05T05:13:28Z","title":"SpinMultiNet: Neural Network Potential Incorporating Spin Degrees of\n Freedom with Multi-Task Learning","summary":" Neural Network Potentials (NNPs) have attracted significant attention as a\nmethod for accelerating density functional theory (DFT) calculations. However,\nconventional NNP models typically do not incorporate spin degrees of freedom,\nlimiting their applicability to systems where spin states critically influence\nmaterial properties, such as transition metal oxides. This study introduces\nSpinMultiNet, a novel NNP model that integrates spin degrees of freedom through\nmulti-task learning. 
SpinMultiNet achieves accurate predictions without relying\non correct spin values obtained from DFT calculations. Instead, it utilizes\ninitial spin estimates as input and leverages multi-task learning to optimize\nthe spin latent representation while maintaining both $E(3)$ and time-reversal\nequivariance. Validation on a dataset of transition metal oxides demonstrates\nthe high predictive accuracy of SpinMultiNet. The model successfully reproduces\nthe energy ordering of stable spin configurations originating from\nsuperexchange interactions and accurately captures the rhombohedral distortion\nof the rocksalt structure. These results pave the way for new possibilities in\nmaterials simulations that consider spin degrees of freedom, promising future\napplications in large-scale simulations of various material systems, including\nmagnetic materials.\n","authors":["Koki Ueno","Satoru Ohuchi","Kazuhide Ichikawa","Kei Amii","Kensuke Wakasugi"],"pdf_url":"https://arxiv.org/pdf/2409.03253v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.03251v1","updated":"2024-09-05T05:08:43Z","published":"2024-09-05T05:08:43Z","title":"Dual-TSST: A Dual-Branch Temporal-Spectral-Spatial Transformer Model for\n EEG Decoding","summary":" The decoding of electroencephalography (EEG) signals allows access to user\nintentions conveniently, which plays an important role in the fields of\nhuman-machine interaction. To effectively extract sufficient characteristics of\nthe multichannel EEG, a novel decoding architecture network with a dual-branch\ntemporal-spectral-spatial transformer (Dual-TSST) is proposed in this study.\nSpecifically, by utilizing convolutional neural networks (CNNs) on different\nbranches, the proposed processing network first extracts the temporal-spatial\nfeatures of the original EEG and the temporal-spectral-spatial features of\ntime-frequency domain data converted by wavelet transformation, respectively.\nThese perceived features are then integrated by a feature fusion block, serving\nas the input of the transformer to capture the global long-range dependencies\nentailed in the non-stationary EEG, and being classified via the global average\npooling and multi-layer perceptron blocks. To evaluate the efficacy of the\nproposed approach, the competitive experiments are conducted on three publicly\navailable datasets of BCI IV 2a, BCI IV 2b, and SEED, with the head-to-head\ncomparison of more than ten other state-of-the-art methods. As a result, our\nproposed Dual-TSST performs superiorly in various tasks, which achieves the\npromising EEG classification performance of average accuracy of 80.67% in BCI\nIV 2a, 88.64% in BCI IV 2b, and 96.65% in SEED, respectively. Extensive\nablation experiments conducted between the Dual-TSST and comparative baseline\nmodel also reveal the enhanced decoding performance with each module of our\nproposed method. This study provides a new approach to high-performance EEG\ndecoding, and has great potential for future CNN-Transformer based\napplications.\n","authors":["Hongqi Li","Haodong Zhang","Yitong Chen"],"pdf_url":"https://arxiv.org/pdf/2409.03251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11940v3","updated":"2024-09-05T05:06:57Z","published":"2024-02-19T08:27:23Z","title":"AICAttack: Adversarial Image Captioning Attack with Attention-Based\n Optimization","summary":" Recent advances in deep learning research have shown remarkable achievements\nacross many tasks in computer vision (CV) and natural language processing\n(NLP). 
At the intersection of CV and NLP is the problem of image captioning,\nwhere the related models' robustness against adversarial attacks has not been\nwell studied. This paper presents a novel adversarial attack strategy,\nAICAttack (Attention-based Image Captioning Attack), designed to attack image\ncaptioning models through subtle perturbations on images. Operating within a\nblack-box attack scenario, our algorithm requires no access to the target\nmodel's architecture, parameters, or gradient information. We introduce an\nattention-based candidate selection mechanism that identifies the optimal\npixels to attack, followed by a customised differential evolution method to\noptimise the perturbations of pixels' RGB values. We demonstrate AICAttack's\neffectiveness through extensive experiments on benchmark datasets against\nmultiple victim models. The experimental results demonstrate that our method\noutperforms current leading-edge techniques by achieving consistently higher\nattack success rates.\n","authors":["Jiyao Li","Mingze Ni","Yifei Dong","Tianqing Zhu","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2402.11940v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02097v2","updated":"2024-09-05T04:53:37Z","published":"2024-09-03T17:54:39Z","title":"LinFusion: 1 GPU, 1 Minute, 16K Image","summary":" Modern diffusion models, particularly those utilizing a Transformer-based\nUNet for denoising, rely heavily on self-attention operations to manage complex\nspatial relationships, thus achieving impressive generation performance.\nHowever, this existing paradigm faces significant challenges in generating\nhigh-resolution visual content due to its quadratic time and memory complexity\nwith respect to the number of spatial tokens. To address this limitation, we\naim at a novel linear attention mechanism as an alternative in this paper.\nSpecifically, we begin our exploration from recently introduced models with\nlinear complexity, e.g., Mamba2, RWKV6, Gated Linear Attention, etc, and\nidentify two key features-attention normalization and non-causal inference-that\nenhance high-resolution visual generation performance. Building on these\ninsights, we introduce a generalized linear attention paradigm, which serves as\na low-rank approximation of a wide spectrum of popular linear token mixers. To\nsave the training cost and better leverage pre-trained models, we initialize\nour models and distill the knowledge from pre-trained StableDiffusion (SD). We\nfind that the distilled model, termed LinFusion, achieves performance on par\nwith or superior to the original SD after only modest training, while\nsignificantly reducing time and memory complexity. Extensive experiments on\nSD-v1.5, SD-v2.1, and SD-XL demonstrate that LinFusion delivers satisfactory\nzero-shot cross-resolution generation performance, generating high-resolution\nimages like 16K resolution. Moreover, it is highly compatible with pre-trained\nSD components, such as ControlNet and IP-Adapter, requiring no adaptation\nefforts. Codes are available at https://github.com/Huage001/LinFusion.\n","authors":["Songhua Liu","Weihao Yu","Zhenxiong Tan","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.02097v2.pdf","comment":"Work in Progress. 
Codes are available at\n https://github.com/Huage001/LinFusion"},{"id":"http://arxiv.org/abs/2409.03239v1","updated":"2024-09-05T04:39:35Z","published":"2024-09-05T04:39:35Z","title":"DiffGrad for Physics-Informed Neural Networks","summary":" Physics-Informed Neural Networks (PINNs) are regarded as state-of-the-art\ntools for addressing highly nonlinear problems based on partial differential\nequations. Despite their broad range of applications, PINNs encounter several\nperformance challenges, including issues related to efficiency, minimization of\ncomputational cost, and enhancement of accuracy. Burgers' equation, a\nfundamental equation in fluid dynamics that is extensively used in PINNs,\nprovides flexible results with the Adam optimizer that does not account for\npast gradients. This paper introduces a novel strategy for solving Burgers'\nequation by incorporating DiffGrad with PINNs, a method that leverages the\ndifference between current and immediately preceding gradients to enhance\nperformance. A comprehensive computational analysis is conducted using\noptimizers such as Adam, Adamax, RMSprop, and DiffGrad to evaluate and compare\ntheir effectiveness. Our approach includes visualizing the solutions over space\nat various time intervals to demonstrate the accuracy of the network. The\nresults show that DiffGrad not only improves the accuracy of the solution but\nalso reduces training time compared to the other optimizers.\n","authors":["Jamshaid Ul Rahman"," Nimra"],"pdf_url":"https://arxiv.org/pdf/2409.03239v1.pdf","comment":"20 pages, 14 figures"},{"id":"http://arxiv.org/abs/2409.03238v1","updated":"2024-09-05T04:38:49Z","published":"2024-09-05T04:38:49Z","title":"Preserving Empirical Probabilities in BERT for Small-sample Clinical\n Entity Recognition","summary":" Named Entity Recognition (NER) encounters the challenge of unbalanced labels,\nwhere certain entity types are overrepresented while others are\nunderrepresented in real-world datasets. This imbalance can lead to biased\nmodels that perform poorly on minority entity classes, impeding accurate and\nequitable entity recognition. This paper explores the effects of unbalanced\nentity labels of the BERT-based pre-trained model. We analyze the different\nmechanisms of loss calculation and loss propagation for the task of token\nclassification on randomized datasets. Then we propose ways to improve the\ntoken classification for the highly imbalanced task of clinical entity\nrecognition.\n","authors":["Abdul Rehman","Jian Jun Zhang","Xiaosong Yang"],"pdf_url":"https://arxiv.org/pdf/2409.03238v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.03237v1","updated":"2024-09-05T04:37:02Z","published":"2024-09-05T04:37:02Z","title":"Robust Q-Learning under Corrupted Rewards","summary":" Recently, there has been a surge of interest in analyzing the non-asymptotic\nbehavior of model-free reinforcement learning algorithms. However, the\nperformance of such algorithms in non-ideal environments, such as in the\npresence of corrupted rewards, is poorly understood. Motivated by this gap, we\ninvestigate the robustness of the celebrated Q-learning algorithm to a\nstrong-contamination attack model, where an adversary can arbitrarily perturb a\nsmall fraction of the observed rewards. 
We start by proving that such an attack\ncan cause the vanilla Q-learning algorithm to incur arbitrarily large errors.\nWe then develop a novel robust synchronous Q-learning algorithm that uses\nhistorical reward data to construct robust empirical Bellman operators at each\ntime step. Finally, we prove a finite-time convergence rate for our algorithm\nthat matches known state-of-the-art bounds (in the absence of attacks) up to a\nsmall inevitable $O(\\varepsilon)$ error term that scales with the adversarial\ncorruption fraction $\\varepsilon$. Notably, our results continue to hold even\nwhen the true reward distributions have infinite support, provided they admit\nbounded second moments.\n","authors":["Sreejeet Maity","Aritra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.03237v1.pdf","comment":"Accepted to the Decision and Control Conference (CDC) 2024"},{"id":"http://arxiv.org/abs/2409.03231v1","updated":"2024-09-05T03:57:28Z","published":"2024-09-05T03:57:28Z","title":"State-space models are accurate and efficient neural operators for\n dynamical systems","summary":" Physics-informed machine learning (PIML) has emerged as a promising\nalternative to classical methods for predicting dynamical systems, offering\nfaster and more generalizable solutions. However, existing models, including\nrecurrent neural networks (RNNs), transformers, and neural operators, face\nchallenges such as long-time integration, long-range dependencies, chaotic\ndynamics, and extrapolation, to name a few. To this end, this paper introduces\nstate-space models implemented in Mamba for accurate and efficient dynamical\nsystem operator learning. Mamba addresses the limitations of existing\narchitectures by dynamically capturing long-range dependencies and enhancing\ncomputational efficiency through reparameterization techniques. To extensively\ntest Mamba and compare against another 11 baselines, we introduce several\nstrict extrapolation testbeds that go beyond the standard interpolation\nbenchmarks. We demonstrate Mamba's superior performance in both interpolation\nand challenging extrapolation tasks. Mamba consistently ranks among the top\nmodels while maintaining the lowest computational cost and exceptional\nextrapolation capabilities. Moreover, we demonstrate the good performance of\nMamba for a real-world application in quantitative systems pharmacology for\nassessing the efficacy of drugs in tumor growth under limited data scenarios.\nTaken together, our findings highlight Mamba's potential as a powerful tool for\nadvancing scientific machine learning in dynamical systems modeling. (The code\nwill be available at\nhttps://github.com/zheyuanhu01/State_Space_Model_Neural_Operator upon\nacceptance.)\n","authors":["Zheyuan Hu","Nazanin Ahmadi Daryakenari","Qianli Shen","Kenji Kawaguchi","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2409.03231v1.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2402.01105v4","updated":"2024-09-05T03:38:08Z","published":"2024-02-02T02:44:59Z","title":"A Survey for Foundation Models in Autonomous Driving","summary":" The advent of foundation models has revolutionized the fields of natural\nlanguage processing and computer vision, paving the way for their application\nin autonomous driving (AD). This survey presents a comprehensive review of more\nthan 40 research papers, demonstrating the role of foundation models in\nenhancing AD. 
Large language models contribute to planning and simulation in\nAD, particularly through their proficiency in reasoning, code generation and\ntranslation. In parallel, vision foundation models are increasingly adapted for\ncritical tasks such as 3D object detection and tracking, as well as creating\nrealistic driving scenarios for simulation and testing. Multi-modal foundation\nmodels, integrating diverse inputs, exhibit exceptional visual understanding\nand spatial reasoning, crucial for end-to-end AD. This survey not only provides\na structured taxonomy, categorizing foundation models based on their modalities\nand functionalities within the AD domain but also delves into the methods\nemployed in current research. It identifies the gaps between existing\nfoundation models and cutting-edge AD approaches, thereby charting future\nresearch directions and proposing a roadmap for bridging these gaps.\n","authors":["Haoxiang Gao","Zhongruo Wang","Yaqian Li","Kaiwen Long","Ming Yang","Yiqing Shen"],"pdf_url":"https://arxiv.org/pdf/2402.01105v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03220v1","updated":"2024-09-05T03:36:05Z","published":"2024-09-05T03:36:05Z","title":"FairQuant: Certifying and Quantifying Fairness of Deep Neural Networks","summary":" We propose a method for formally certifying and quantifying individual\nfairness of deep neural networks (DNN). Individual fairness guarantees that any\ntwo individuals who are identical except for a legally protected attribute\n(e.g., gender or race) receive the same treatment. While there are existing\ntechniques that provide such a guarantee, they tend to suffer from lack of\nscalability or accuracy as the size and input dimension of the DNN increase.\nOur method overcomes this limitation by applying abstraction to a symbolic\ninterval based analysis of the DNN followed by iterative refinement guided by\nthe fairness property. Furthermore, our method lifts the symbolic interval\nbased analysis from conventional qualitative certification to quantitative\ncertification, by computing the percentage of individuals whose classification\noutputs are provably fair, instead of merely deciding if the DNN is fair. We\nhave implemented our method and evaluated it on deep neural networks trained on\nfour popular fairness research datasets. The experimental results show that our\nmethod is not only more accurate than state-of-the-art techniques but also\nseveral orders-of-magnitude faster.\n","authors":["Brian Hyeongseok Kim","Jingbo Wang","Chao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.03220v1.pdf","comment":"To Appear In Proceedings of the 47th IEEE/ACM International\n Conference on Software Engineering (ICSE 2025)"},{"id":"http://arxiv.org/abs/2409.03219v1","updated":"2024-09-05T03:33:54Z","published":"2024-09-05T03:33:54Z","title":"Content Moderation by LLM: From Accuracy to Legitimacy","summary":" One trending application of LLM (large language model) is to use it for\ncontent moderation in online platforms. Most current studies on this\napplication have focused on the metric of accuracy - the extent to which LLM\nmakes correct decisions about content. This article argues that accuracy is\ninsufficient and misleading, because it fails to grasp the distinction between\neasy cases and hard cases as well as the inevitable trade-offs in achieving\nhigher accuracy. Closer examination reveals that content moderation is a\nconstitutive part of platform governance, the key of which is to gain and\nenhance legitimacy. 
Instead of making moderation decisions correct, the chief\ngoal of LLM is to make them legitimate. In this regard, this article proposes a\nparadigm shift from the single benchmark of accuracy towards a legitimacy-based\nframework of evaluating the performance of LLM moderators. The framework\nsuggests that for easy cases, the key is to ensure accuracy, speed and\ntransparency, while for hard cases, what matters is reasoned justification and\nuser participation. Examined under this framework, LLM's real potential in\nmoderation is not accuracy improvement. Rather, LLM can better contribute in\nfour other aspects: to conduct screening of hard cases from easy cases, to\nprovide quality explanations for moderation decisions, to assist human\nreviewers in getting more contextual information, and to facilitate user\nparticipation in a more interactive way. Using normative theories from law and\nsocial sciences to critically assess the new technological application, this\narticle seeks to redefine LLM's role in content moderation and redirect\nrelevant research in this field.\n","authors":["Tao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.03219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03218v1","updated":"2024-09-05T03:32:39Z","published":"2024-09-05T03:32:39Z","title":"Application Research On Real-Time Perception Of Device Performance\n Status","summary":" In order to accurately identify the performance status of mobile devices and\nfinely adjust the user experience, a real-time performance perception\nevaluation method based on TOPSIS (Technique for Order Preference by Similarity\nto Ideal Solution) combined with entropy weighting method and time series model\nconstruction was studied. After collecting the performance characteristics of\nvarious mobile devices, the device performance profile was fitted by using PCA\n(principal component analysis) dimensionality reduction and feature engineering\nmethods such as descriptive time series analysis. The ability of performance\nfeatures and profiles to describe the real-time performance status of devices\nwas understood and studied by applying the TOPSIS method and multi-level\nweighting processing. A time series model was constructed for the feature set\nunder objective weighting, and multiple sensitivity (real-time, short-term,\nlong-term) performance status perception results were provided to obtain\nreal-time performance evaluation data and long-term stable performance\nprediction data. Finally, by configuring dynamic AB experiments and overlaying\nfine-grained power reduction strategies, the usability of the method was\nverified, and the accuracy of device performance status identification and\nprediction was compared with the performance of the profile features including\ndimensionality reduction time series modeling, TOPSIS method and entropy\nweighting method, subjective weighting, HMA method. 
The results show that\naccurate real-time performance perception results can greatly enhance business\nvalue, and this research has application effectiveness and certain\nforward-looking significance.\n","authors":["Zhe Wang","Zhen Wang","Jianwen Wu","Wangzhong Xiao","Yidong Chen","Zihua Feng","Dian Yang","Hongchen Liu","Bo Liang","Jiaojiao Fu"],"pdf_url":"https://arxiv.org/pdf/2409.03218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03215v1","updated":"2024-09-05T03:22:22Z","published":"2024-09-05T03:22:22Z","title":"xLAM: A Family of Large Action Models to Empower AI Agent Systems","summary":" Autonomous agents powered by large language models (LLMs) have attracted\nsignificant research interest. However, the open-source community faces many\nchallenges in developing specialized models for agent tasks, driven by the\nscarcity of high-quality agent datasets and the absence of standard protocols\nin this area. We introduce and publicly release xLAM, a series of large action\nmodels designed for AI agent tasks. The xLAM series includes five models with\nboth dense and mixture-of-expert architectures, ranging from 1B to 8x22B\nparameters, trained using a scalable, flexible pipeline that unifies, augments,\nand synthesizes diverse datasets to enhance AI agents' generalizability and\nperformance across varied environments. Our experimental results demonstrate\nthat xLAM consistently delivers exceptional performance across multiple agent\nability benchmarks, notably securing the 1st position on the Berkeley\nFunction-Calling Leaderboard, outperforming GPT-4, Claude-3, and many other\nmodels in terms of tool use. By releasing the xLAM series, we aim to advance\nthe performance of open-source LLMs for autonomous AI agents, potentially\naccelerating progress and democratizing access to high-performance models for\nagent tasks. Models are available at\nhttps://huggingface.co/collections/Salesforce/xlam-models-65f00e2a0a63bbcd1c2dade4\n","authors":["Jianguo Zhang","Tian Lan","Ming Zhu","Zuxin Liu","Thai Hoang","Shirley Kokane","Weiran Yao","Juntao Tan","Akshara Prabhakar","Haolin Chen","Zhiwei Liu","Yihao Feng","Tulika Awalgaonkar","Rithesh Murthy","Eric Hu","Zeyuan Chen","Ran Xu","Juan Carlos Niebles","Shelby Heinecke","Huan Wang","Silvio Savarese","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2409.03215v1.pdf","comment":"Technical report for the Salesforce xLAM model series"},{"id":"http://arxiv.org/abs/2409.03212v1","updated":"2024-09-05T03:16:41Z","published":"2024-09-05T03:16:41Z","title":"Bi-capacity Choquet Integral for Sensor Fusion with Label Uncertainty","summary":" Sensor fusion combines data from multiple sensor sources to improve\nreliability, robustness, and accuracy of data interpretation. The Fuzzy\nIntegral (FI), in particular, the Choquet integral (ChI), is often used as a\npowerful nonlinear aggregator for fusion across multiple sensors. However,\nexisting supervised ChI learning algorithms typically require precise training\nlabels for each input data point, which can be difficult or impossible to\nobtain. Additionally, prior work on ChI fusion is often based only on the\nnormalized fuzzy measures, which bounds the fuzzy measure values between [0,\n1]. This can be limiting in cases where the underlying scales of input data\nsources are bipolar (i.e., between [-1, 1]). 
To address these challenges, this\npaper proposes a novel Choquet integral-based fusion framework, named Bi-MIChI\n(pronounced \"bi-mi-kee\"), which uses bi-capacities to represent the\ninteractions between pairs of subsets of the input sensor sources on a bi-polar\nscale. This allows for extended non-linear interactions between the sensor\nsources and can lead to interesting fusion results. Bi-MIChI also addresses\nlabel uncertainty through Multiple Instance Learning, where training labels are\napplied to \"bags\" (sets) of data instead of per-instance. Our proposed Bi-MIChI\nframework shows effective classification and detection performance on both\nsynthetic and real-world experiments for sensor fusion with label uncertainty.\nWe also provide detailed analyses on the behavior of the fuzzy measures to\ndemonstrate our fusion process.\n","authors":["Hersh Vakharia","Xiaoxiao Du"],"pdf_url":"https://arxiv.org/pdf/2409.03212v1.pdf","comment":"10 pages, 7 figures, 7 tables; Accepted to 2024 FUZZ-IEEE and\n presented at 2024 IEEE WCCI; Code available at\n https://github.com/hvak/Bi-MIChI"},{"id":"http://arxiv.org/abs/2409.03204v1","updated":"2024-09-05T02:52:11Z","published":"2024-09-05T02:52:11Z","title":"Pricing American Options using Machine Learning Algorithms","summary":" This study investigates the application of machine learning algorithms,\nparticularly in the context of pricing American options using Monte Carlo\nsimulations. Traditional models, such as the Black-Scholes-Merton framework,\noften fail to adequately address the complexities of American options, which\ninclude the ability for early exercise and non-linear payoff structures. By\nleveraging Monte Carlo methods in conjunction with the Least Squares Method\n(LSM), machine learning was used. This research aims to improve the accuracy and efficiency of\noption pricing. The study evaluates several machine learning models, including\nneural networks and decision trees, highlighting their potential to outperform\ntraditional approaches. The results from applying machine learning algorithms in\nLSM indicate that integrating machine learning with Monte Carlo simulations can\nenhance pricing accuracy and provide more robust predictions, offering\nsignificant insights into quantitative finance by merging classical financial\ntheories with modern computational techniques. The dataset was split into\nfeatures and the target variable representing bid prices, with an 80-20\ntrain-validation split. LSTM and GRU models were constructed using TensorFlow's\nKeras API, each with four hidden layers of 200 neurons and an output layer for\nbid price prediction, optimized with the Adam optimizer and MSE loss function.\nThe GRU model outperformed the LSTM model across all evaluated metrics,\ndemonstrating lower mean absolute error, mean squared error, and root mean\nsquared error, along with greater stability and efficiency in training.\n","authors":["Prudence Djagba","Callixte Ndizihiwe"],"pdf_url":"https://arxiv.org/pdf/2409.03204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02430v2","updated":"2024-09-05T02:32:35Z","published":"2024-09-04T04:17:57Z","title":"Transfer-based Adversarial Poisoning Attacks for Online (MIMO-)Deep\n Receivers","summary":" Recently, the design of wireless receivers using deep neural networks (DNNs),\nknown as deep receivers, has attracted extensive attention for ensuring\nreliable communication in complex channel environments. 
To adapt quickly to\ndynamic channels, online learning has been adopted to update the weights of\ndeep receivers with over-the-air data (e.g., pilots). However, the fragility of\nneural models and the openness of wireless channels expose these systems to\nmalicious attacks. To this end, understanding these attack methods is essential\nfor robust receiver design. In this paper, we propose a transfer-based\nadversarial poisoning attack method for online receivers. Without knowledge of\nthe attack target, adversarial perturbations are injected into the pilots,\npoisoning the online deep receiver and impairing its ability to adapt to\ndynamic channels and nonlinear effects. In particular, our attack method\ntargets Deep Soft Interference Cancellation (DeepSIC)[1] using online\nmeta-learning. As a classical model-driven deep receiver, DeepSIC incorporates\nwireless domain knowledge into its architecture. This integration allows it to\nadapt efficiently to time-varying channels with only a small number of pilots,\nachieving optimal performance in a multi-input and multi-output (MIMO)\nscenario. The deep receiver in this scenario has a number of applications in the\nfield of wireless communication, which motivates our study of the attack\nmethods targeting it. Specifically, we demonstrate the effectiveness of our\nattack in simulations on synthetic linear, synthetic nonlinear, static, and\nCOST 2100 channels. Simulation results indicate that the proposed poisoning\nattack significantly reduces the performance of online receivers in rapidly\nchanging scenarios.\n","authors":["Kunze Wu","Weiheng Jiang","Dusit Niyato","Yinghuan Li","Chuang Luo"],"pdf_url":"https://arxiv.org/pdf/2409.02430v2.pdf","comment":"15 pages, 14 figures"},{"id":"http://arxiv.org/abs/2409.03187v1","updated":"2024-09-05T02:25:04Z","published":"2024-09-05T02:25:04Z","title":"How noise affects memory in linear recurrent networks","summary":" The effects of noise on memory in a linear recurrent network are\ntheoretically investigated. Memory is characterized by its ability to store\nprevious inputs in the instantaneous state of the network, which receives\ncorrelated or uncorrelated noise. Two major properties are revealed: First, the\nmemory reduced by noise is uniquely determined by the noise's power spectral\ndensity (PSD). Second, the memory will not decrease regardless of noise\nintensity if the PSD is in a certain class of distribution (including power\nlaw). The results are verified using human brain signals, showing good\nagreement.\n","authors":["JingChuan Guan","Tomoyuki Kubota","Yasuo Kuniyoshi","Kohei Nakajima"],"pdf_url":"https://arxiv.org/pdf/2409.03187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01447v2","updated":"2024-09-05T02:16:17Z","published":"2024-09-02T20:07:25Z","title":"Last-Iterate Convergence of Payoff-Based Independent Learning in\n Zero-Sum Stochastic Games","summary":" In this paper, we consider two-player zero-sum matrix and stochastic games\nand develop learning dynamics that are payoff-based, convergent, rational, and\nsymmetric between the two players. Specifically, the learning dynamics for\nmatrix games are based on the smoothed best-response dynamics, while the\nlearning dynamics for stochastic games build upon those for matrix games, with\nadditional incorporation of the minimax value iteration. To our knowledge, our\ntheoretical results present the first finite-sample analysis of such learning\ndynamics with last-iterate guarantees. 
In the matrix game setting, the results\nimply a sample complexity of $O(\\epsilon^{-1})$ to find the Nash distribution\nand a sample complexity of $O(\\epsilon^{-8})$ to find a Nash equilibrium. In\nthe stochastic game setting, the results also imply a sample complexity of\n$O(\\epsilon^{-8})$ to find a Nash equilibrium. To establish these results, the\nmain challenge is to handle stochastic approximation algorithms with multiple\nsets of coupled and stochastic iterates that evolve on (possibly) different\ntime scales. To overcome this challenge, we developed a coupled Lyapunov-based\napproach, which may be of independent interest to the broader community\nstudying the convergence behavior of stochastic approximation algorithms.\n","authors":["Zaiwei Chen","Kaiqing Zhang","Eric Mazumdar","Asuman Ozdaglar","Adam Wierman"],"pdf_url":"https://arxiv.org/pdf/2409.01447v2.pdf","comment":"A preliminary version [arXiv:2303.03100] of this paper, with a subset\n of the results that are presented here, was presented at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2409.03180v1","updated":"2024-09-05T02:14:31Z","published":"2024-09-05T02:14:31Z","title":"Machine learning-based algorithms for at-home respiratory disease\n monitoring and respiratory assessment","summary":" Respiratory diseases impose a significant burden on global health, with\ncurrent diagnostic and management practices primarily reliant on specialist\nclinical testing. This work aims to develop machine learning-based algorithms\nto facilitate at-home respiratory disease monitoring and assessment for\npatients undergoing continuous positive airway pressure (CPAP) therapy. Data\nwere collected from 30 healthy adults, encompassing respiratory pressure, flow,\nand dynamic thoraco-abdominal circumferential measurements under three\nbreathing conditions: normal, panting, and deep breathing. Various machine\nlearning models, including the random forest classifier, logistic regression,\nand support vector machine (SVM), were trained to predict breathing types. The\nrandom forest classifier demonstrated the highest accuracy, particularly when\nincorporating breathing rate as a feature. These findings support the potential\nof AI-driven respiratory monitoring systems to transition respiratory\nassessments from clinical settings to home environments, enhancing\naccessibility and patient autonomy. Future work involves validating these\nmodels with larger, more diverse populations and exploring additional machine\nlearning techniques.\n","authors":["Negar Orangi-Fard","Alexandru Bogdan","Hersh Sagreiya"],"pdf_url":"https://arxiv.org/pdf/2409.03180v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.13958v2","updated":"2024-09-05T02:07:11Z","published":"2024-08-25T23:41:39Z","title":"Prediction of COPD Using Machine Learning, Clinical Summary Notes, and\n Vital Signs","summary":" Chronic obstructive pulmonary disease (COPD) is a chronic inflammatory lung\ndisease that causes obstructed airflow from the lungs. In the United States,\nmore than 15.7 million Americans have been diagnosed with COPD, with 96% of\nindividuals living with at least one other chronic health condition. It is the\n4th leading cause of death in the country. Over 2.2 million patients are\nadmitted to hospitals annually due to COPD exacerbations. Monitoring and\npredicting patient exacerbations on-time could save their life. 
This paper\npresents two different predictive models to predict COPD exacerbation using AI\nand natural language processing (NLP) approaches. These models use respiration\nsummary notes, symptoms, and vital signs. To train and test these models, data\nrecords containing physiologic signals and vital signs time series were used.\nThese records were captured from patient monitors and comprehensive clinical\ndata obtained from hospital medical information systems for tens of thousands\nof Intensive Care Unit (ICU) patients. We achieved an area under the Receiver\noperating characteristic (ROC) curve of 0.82 in detection and prediction of\nCOPD exacerbation.\n","authors":["Negar Orangi-Fard"],"pdf_url":"https://arxiv.org/pdf/2408.13958v2.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.01354v2","updated":"2024-09-05T02:00:12Z","published":"2024-09-02T16:15:26Z","title":"Explanation Space: A New Perspective into Time Series Interpretability","summary":" Human understandable explanation of deep learning models is necessary for\nmany critical and sensitive applications. Unlike image or tabular data where\nthe importance of each input feature (for the classifier's decision) can be\ndirectly projected into the input, time series distinguishable features (e.g.\ndominant frequency) are often hard to manifest in time domain for a user to\neasily understand. Moreover, most explanation methods require a baseline value\nas an indication of the absence of any feature. However, the notion of lack of\nfeature, which is often defined as black pixels for vision tasks or zero/mean\nvalues for tabular data, is not well-defined in time series. Despite the\nadoption of explainable AI methods (XAI) from tabular and vision domain into\ntime series domain, these differences limit the application of these XAI\nmethods in practice. In this paper, we propose a simple yet effective method\nthat allows a model originally trained on time domain to be interpreted in\nother explanation spaces using existing methods. We suggest four explanation\nspaces that each can potentially alleviate these issues in certain types of\ntime series. Our method can be readily adopted in existing platforms without\nany change to trained models or XAI methods. The code is available at\nhttps://github.com/shrezaei/TS-X-spaces.\n","authors":["Shahbaz Rezaei","Xin Liu"],"pdf_url":"https://arxiv.org/pdf/2409.01354v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03167v1","updated":"2024-09-05T01:54:29Z","published":"2024-09-05T01:54:29Z","title":"InfraLib: Enabling Reinforcement Learning and Decision Making for Large\n Scale Infrastructure Management","summary":" Efficient management of infrastructure systems is crucial for economic\nstability, sustainability, and public safety. However, infrastructure\nmanagement is challenging due to the vast scale of systems, stochastic\ndeterioration of components, partial observability, and resource constraints.\nWhile data-driven approaches like reinforcement learning (RL) offer a promising\navenue for optimizing management policies, their application to infrastructure\nhas been limited by the lack of suitable simulation environments. We introduce\nInfraLib, a comprehensive framework for modeling and analyzing infrastructure\nmanagement problems. InfraLib employs a hierarchical, stochastic approach to\nrealistically model infrastructure systems and their deterioration. 
It supports\npractical functionality such as modeling component unavailability, cyclical\nbudgets, and catastrophic failures. To facilitate research, InfraLib provides\ntools for expert data collection, simulation-driven analysis, and\nvisualization. We demonstrate InfraLib's capabilities through case studies on a\nreal-world road network and a synthetic benchmark with 100,000 components.\n","authors":["Pranay Thangeda","Trevor S. Betz","Michael N. Grussing","Melkior Ornik"],"pdf_url":"https://arxiv.org/pdf/2409.03167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03164v1","updated":"2024-09-05T01:48:11Z","published":"2024-09-05T01:48:11Z","title":"A Scalable Matrix Visualization for Understanding Tree Ensemble\n Classifiers","summary":" The high performance of tree ensemble classifiers benefits from a large set\nof rules, which, in turn, makes the models hard to understand. To improve\ninterpretability, existing methods extract a subset of rules for approximation\nusing model reduction techniques. However, by focusing on the reduced rule set,\nthese methods often lose fidelity and ignore anomalous rules that, despite\ntheir infrequency, play crucial roles in real-world applications. This paper\nintroduces a scalable visual analysis method to explain tree ensemble\nclassifiers that contain tens of thousands of rules. The key idea is to address\nthe issue of losing fidelity by adaptively organizing the rules as a hierarchy\nrather than reducing them. To ensure the inclusion of anomalous rules, we\ndevelop an anomaly-biased model reduction method to prioritize these rules at\neach hierarchical level. Synergized with this hierarchical organization of\nrules, we develop a matrix-based hierarchical visualization to support\nexploration at different levels of detail. Our quantitative experiments and\ncase studies demonstrate how our method fosters a deeper understanding of both\ncommon and anomalous rules, thereby enhancing interpretability without\nsacrificing comprehensiveness.\n","authors":["Zhen Li","Weikai Yang","Jun Yuan","Jing Wu","Changjian Chen","Yao Ming","Fan Yang","Hui Zhang","Shixia Liu"],"pdf_url":"https://arxiv.org/pdf/2409.03164v1.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2405.10443v3","updated":"2024-09-05T01:06:40Z","published":"2024-05-16T21:07:42Z","title":"Simultaneous Masking, Not Prompting Optimization: A Paradigm Shift in\n Fine-tuning LLMs for Simultaneous Translation","summary":" Large language models (LLMs) have achieved state-of-the-art performance in\nvarious language processing tasks, motivating their adoption in simultaneous\ntranslation. Current fine-tuning methods to adapt LLMs for simultaneous\ntranslation focus on prompting optimization strategies using either data\naugmentation or prompt structure modifications. However, these methods suffer\nfrom several issues, such as unnecessarily expanded training sets,\ncomputational inefficiency from dumping the key and value cache, increased\nprompt sizes, or restriction to a single decision policy. To eliminate these\nissues, in this work, we propose SimulMask, a new paradigm for fine-tuning LLMs\nfor simultaneous translation. It utilizes a novel attention mask approach that\nmodels simultaneous translation during fine-tuning by masking attention for a\ndesired decision policy. 
Applying the proposed SimulMask on a Falcon LLM for\nthe IWSLT 2017 dataset, we have observed a significant translation quality\nimprovement compared to state-of-the-art prompting optimization strategies on\nfive language pairs while reducing the computational cost.\n","authors":["Matthew Raffel","Victor Agostinelli","Lizhong Chen"],"pdf_url":"https://arxiv.org/pdf/2405.10443v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01731v3","updated":"2024-09-05T01:01:20Z","published":"2024-09-03T09:14:21Z","title":"Stacked ensemble-based mutagenicity prediction model using multiple\n modalities with graph attention network","summary":" Mutagenicity is a concern due to its association with genetic mutations which\ncan result in a variety of negative consequences, including the development of\ncancer. Earlier identification of mutagenic compounds in the drug development\nprocess is therefore crucial for preventing the progression of unsafe\ncandidates and reducing development costs. While computational techniques,\nespecially machine learning models, have become increasingly prevalent for this\nendpoint, they rely on a single modality. In this work, we introduce a novel\nstacked ensemble-based mutagenicity prediction model which incorporates multiple\nmodalities such as the simplified molecular input line entry system (SMILES) and\nthe molecular graph. These modalities capture diverse information about molecules\nsuch as substructural, physicochemical, geometrical and topological properties. To derive\nsubstructural, geometrical and physicochemical information, we use SMILES,\nwhile topological information is extracted through a graph attention network\n(GAT) via the molecular graph. Our model uses a stacked ensemble of machine\nlearning classifiers to make predictions using these multiple features. We\nemploy the explainable artificial intelligence (XAI) technique SHAP (Shapley\nAdditive Explanations) to determine the significance of each classifier and the\nmost relevant features in the prediction. We demonstrate that our method\nsurpasses SOTA methods on two standard datasets across various metrics.\nNotably, we achieve an area under the curve of 95.21\\% on the Hansen benchmark\ndataset, affirming the efficacy of our method in predicting mutagenicity. We\nbelieve that this research will captivate the interest of both clinicians and\ncomputational biologists engaged in translational research.\n","authors":["Tanya Liyaqat","Tanvir Ahmad","Mohammad Kashif","Chandni Saxena"],"pdf_url":"https://arxiv.org/pdf/2409.01731v3.pdf","comment":"Submitted to a journal"},{"id":"http://arxiv.org/abs/2409.03151v1","updated":"2024-09-05T00:58:07Z","published":"2024-09-05T00:58:07Z","title":"Standing on the shoulders of giants","summary":" Although fundamental to the advancement of Machine Learning, the classic\nevaluation metrics extracted from the confusion matrix, such as precision and\nF1, are limited. Such metrics only offer a quantitative view of the models'\nperformance, without considering the complexity of the data or the quality of\nthe hit. To overcome these limitations, recent research has introduced the use\nof psychometric metrics such as Item Response Theory (IRT), which allows an\nassessment at the level of latent characteristics of instances. This work\ninvestigates how IRT concepts can enrich a confusion matrix in order to\nidentify which model is the most appropriate among options with similar\nperformance. 
In the study carried out, IRT does not replace, but complements\nclassical metrics by offering a new layer of evaluation and observation of the\nfine behavior of models in specific instances. It was also observed that there\nis 97% confidence that the score from the IRT has different contributions from\n66% of the classical metrics analyzed.\n","authors":["Lucas Felipe Ferraro Cardoso","José de Sousa Ribeiro Filho","Vitor Cirilo Araujo Santos","Regiane Silva Kawasaki Frances","Ronnie Cley de Oliveira Alves"],"pdf_url":"https://arxiv.org/pdf/2409.03151v1.pdf","comment":"15 pages, 8 figures, 3 tables, submitted for the BRACIS'24 conference"},{"id":"http://arxiv.org/abs/2409.03149v1","updated":"2024-09-05T00:56:25Z","published":"2024-09-05T00:56:25Z","title":"Non-stationary and Sparsely-correlated Multi-output Gaussian Process\n with Spike-and-Slab Prior","summary":" Multi-output Gaussian process (MGP) is commonly used as a transfer learning\nmethod to leverage information among multiple outputs. A key advantage of MGP\nis providing uncertainty quantification for prediction, which is highly\nimportant for subsequent decision-making tasks. However, traditional MGP may\nnot be sufficiently flexible to handle multivariate data with dynamic\ncharacteristics, particularly when dealing with complex temporal correlations.\nAdditionally, since some outputs may lack correlation, transferring information\namong them may lead to negative transfer. To address these issues, this study\nproposes a non-stationary MGP model that can capture both the dynamic and\nsparse correlation among outputs. Specifically, the covariance functions of MGP\nare constructed using convolutions of time-varying kernel functions. Then a\ndynamic spike-and-slab prior is placed on correlation parameters to\nautomatically decide which sources are informative to the target output in the\ntraining process. An expectation-maximization (EM) algorithm is proposed for\nefficient model fitting. Both numerical studies and a real case demonstrate its\nefficacy in capturing dynamic and sparse correlation structure and mitigating\nnegative transfer for high-dimensional time-series data. Finally, a\nmountain-car reinforcement learning case highlights its potential application\nin decision making problems.\n","authors":["Wang Xinming","Li Yongxiang","Yue Xiaowei","Wu Jianguo"],"pdf_url":"https://arxiv.org/pdf/2409.03149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03148v1","updated":"2024-09-05T00:54:48Z","published":"2024-09-05T00:54:48Z","title":"Discovering Cyclists' Street Visual Preferences Through Multi-Source Big\n Data Using Deep Inverse Reinforcement Learning","summary":" Cycling has gained global popularity for its health benefits and positive\nurban impacts. To effectively promote cycling, early studies have extensively\ninvestigated the relationship between cycling behaviors and environmental\nfactors, especially cyclists' preferences when making route decisions. However,\nthese studies often struggle to comprehensively describe detailed cycling\nprocedures at a large scale due to data limitations, and they tend to overlook\nthe complex nature of cyclists' preferences. To address these issues, we\npropose a novel framework aimed to quantify and interpret cyclists' complicated\nstreet visual preferences from cycling records by leveraging maximum entropy\ndeep inverse reinforcement learning (MEDIRL) and explainable artificial\nintelligence (XAI). 
Implemented in Bantian Sub-district, Shenzhen, we adapt\nMEDIRL model for efficient estimation of cycling reward function by integrating\ndockless-bike-sharing (DBS) trajectory and street view images (SVIs), which\nserves as a representation of cyclists' preferences for street visual\nenvironments during routing. In addition, we demonstrate the feasibility and\nreliability of MEDIRL in discovering cyclists' street visual preferences.\nFurther analysis reveals the nonlinear and interactive effects of street visual\nelements on cyclists' preferences, offering a holistic perspective on\nstreetscape design. Our proposed framework advances the understanding of\nindividual cycling behaviors and provides actionable insights for urban\nplanners to design bicycle-friendly streetscapes that prioritize cyclists'\npreferences.\n","authors":["Ren Kezhou","Gong Yongxi"],"pdf_url":"https://arxiv.org/pdf/2409.03148v1.pdf","comment":"38 pages, 16 figures"},{"id":"http://arxiv.org/abs/2409.03147v1","updated":"2024-09-05T00:52:59Z","published":"2024-09-05T00:52:59Z","title":"Addressing the Gaps in Early Dementia Detection: A Path Towards Enhanced\n Diagnostic Models through Machine Learning","summary":" The rapid global aging trend has led to an increase in dementia cases,\nincluding Alzheimer's disease, underscoring the urgent need for early and\naccurate diagnostic methods. Traditional diagnostic techniques, such as\ncognitive tests, neuroimaging, and biomarker analysis, face significant\nlimitations in sensitivity, accessibility, and cost, particularly in the early\nstages. This study explores the potential of machine learning (ML) as a\ntransformative approach to enhance early dementia detection by leveraging ML\nmodels to analyze and integrate complex multimodal datasets, including\ncognitive assessments, neuroimaging, and genetic information. A comprehensive\nreview of existing literature was conducted to evaluate various ML models,\nincluding supervised learning, deep learning, and advanced techniques such as\nensemble learning and transformer models, assessing their accuracy,\ninterpretability, and potential for clinical integration. The findings indicate\nthat while ML models show significant promise in improving diagnostic precision\nand enabling earlier interventions, challenges remain in their\ngeneralizability, interpretability, and ethical deployment. This research\nconcludes by outlining future directions aimed at enhancing the clinical\nutility of ML models in dementia detection, emphasizing interdisciplinary\ncollaboration and ethically sound frameworks to improve early detection and\nintervention strategies for Alzheimer's disease and other forms of dementia.\n","authors":["Juan A. Berrios Moya"],"pdf_url":"https://arxiv.org/pdf/2409.03147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03142v1","updated":"2024-09-05T00:38:27Z","published":"2024-09-05T00:38:27Z","title":"Causal Temporal Representation Learning with Nonstationary Sparse\n Transition","summary":" Causal Temporal Representation Learning (Ctrl) methods aim to identify the\ntemporal causal dynamics of complex nonstationary temporal sequences. Despite\nthe success of existing Ctrl methods, they require either directly observing\nthe domain variables or assuming a Markov prior on them. Such requirements\nlimit the application of these methods in real-world scenarios when we do not\nhave such prior knowledge of the domain variables. 
To address this problem,\nthis work adopts a sparse transition assumption, aligned with intuitive human\nunderstanding, and presents identifiability results from a theoretical\nperspective. In particular, we explore under what conditions on the\nsignificance of the variability of the transitions we can build a model to\nidentify the distribution shifts. Based on the theoretical result, we introduce\na novel framework, Causal Temporal Representation Learning with Nonstationary\nSparse Transition (CtrlNS), designed to leverage the constraints on transition\nsparsity and conditional independence to reliably identify both distribution\nshifts and latent factors. Our experimental evaluations on synthetic and\nreal-world datasets demonstrate significant improvements over existing\nbaselines, highlighting the effectiveness of our approach.\n","authors":["Xiangchen Song","Zijian Li","Guangyi Chen","Yujia Zheng","Yewen Fan","Xinshuai Dong","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03141v1","updated":"2024-09-05T00:36:23Z","published":"2024-09-05T00:36:23Z","title":"Towards Autonomous Cybersecurity: An Intelligent AutoML Framework for\n Autonomous Intrusion Detection","summary":" The rapid evolution of mobile networks from 5G to 6G has necessitated the\ndevelopment of autonomous network management systems, such as Zero-Touch\nNetworks (ZTNs). However, the increased complexity and automation of these\nnetworks have also escalated cybersecurity risks. Existing Intrusion Detection\nSystems (IDSs) leveraging traditional Machine Learning (ML) techniques have\nshown effectiveness in mitigating these risks, but they often require extensive\nmanual effort and expert knowledge. To address these challenges, this paper\nproposes an Automated Machine Learning (AutoML)-based autonomous IDS framework\ntowards achieving autonomous cybersecurity for next-generation networks. To\nachieve autonomous intrusion detection, the proposed AutoML framework automates\nall critical procedures of the data analytics pipeline, including data\npre-processing, feature engineering, model selection, hyperparameter tuning,\nand model ensemble. Specifically, it utilizes a Tabular Variational\nAuto-Encoder (TVAE) method for automated data balancing, tree-based ML models\nfor automated feature selection and base model learning, Bayesian Optimization\n(BO) for hyperparameter optimization, and a novel Optimized Confidence-based\nStacking Ensemble (OCSE) method for automated model ensemble. The proposed\nAutoML-based IDS was evaluated on two public benchmark network security\ndatasets, CICIDS2017 and 5G-NIDD, and demonstrated improved performance\ncompared to state-of-the-art cybersecurity methods. 
This research marks a\nsignificant step towards fully autonomous cybersecurity in next-generation\nnetworks, potentially revolutionizing network security applications.\n","authors":["Li Yang","Abdallah Shami"],"pdf_url":"https://arxiv.org/pdf/2409.03141v1.pdf","comment":"Accepted to the Workshop on Autonomous Cybersecurity, ACM CCS 2024;\n Code is available at Github link:\n https://github.com/Western-OC2-Lab/AutonomousCyber-AutoML-based-Autonomous-Intrusion-Detection-System"},{"id":"http://arxiv.org/abs/2409.03140v1","updated":"2024-09-05T00:25:37Z","published":"2024-09-05T00:25:37Z","title":"GraphEx: A Graph-based Extraction Method for Advertiser Keyphrase\n Recommendation","summary":" Online sellers and advertisers are recommended keyphrases for their listed\nproducts, which they bid on to enhance their sales. One popular paradigm that\ngenerates such recommendations is Extreme Multi-Label Classification (XMC),\nwhich involves tagging/mapping keyphrases to items. We outline the limitations\nof using traditional item-query based tagging or mapping techniques for\nkeyphrase recommendations on E-Commerce platforms. We introduce GraphEx, an\ninnovative graph-based approach that recommends keyphrases to sellers using\nextraction of token permutations from item titles. Additionally, we demonstrate\nthat relying on traditional metrics such as precision/recall can be misleading\nin practical applications, thereby necessitating a combination of metrics to\nevaluate performance in real-world scenarios. These metrics are designed to\nassess the relevance of keyphrases to items and the potential for buyer\noutreach. GraphEx outperforms production models at eBay, achieving the\nobjectives mentioned above. It supports near real-time inferencing in\nresource-constrained production environments and scales effectively for\nbillions of items.\n","authors":["Ashirbad Mishra","Soumik Dey","Marshall Wu","Jinyu Zhao","He Yu","Kaichen Ni","Binbin Li","Kamesh Madduri"],"pdf_url":"https://arxiv.org/pdf/2409.03140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03137v1","updated":"2024-09-05T00:13:16Z","published":"2024-09-05T00:13:16Z","title":"The AdEMAMix Optimizer: Better, Faster, Older","summary":" Momentum based optimizers are central to a wide range of machine learning\napplications. These typically rely on an Exponential Moving Average (EMA) of\ngradients, which decays exponentially the present contribution of older\ngradients. This accounts for gradients being local linear approximations which\nlose their relevance as the iterate moves along the loss landscape. This work\nquestions the use of a single EMA to accumulate past gradients and empirically\ndemonstrates how this choice can be sub-optimal: a single EMA cannot\nsimultaneously give a high weight to the immediate past, and a non-negligible\nweight to older gradients. Building on this observation, we propose AdEMAMix, a\nsimple modification of the Adam optimizer with a mixture of two EMAs to better\ntake advantage of past gradients. Our experiments on language modeling and\nimage classification show -- quite surprisingly -- that gradients can stay\nrelevant for tens of thousands of steps. They help to converge faster, and\noften to lower minima: e.g., a $1.3$B parameter AdEMAMix LLM trained on $101$B\ntokens performs comparably to an AdamW model trained on $197$B tokens\n($+95\\%$). Moreover, our method significantly slows-down model forgetting\nduring training. 
Our work motivates further exploration of different types of\nfunctions to leverage past gradients, beyond EMAs.\n","authors":["Matteo Pagliardini","Pierre Ablin","David Grangier"],"pdf_url":"https://arxiv.org/pdf/2409.03137v1.pdf","comment":"38 pages, 27 figures"},{"id":"http://arxiv.org/abs/2408.16966v2","updated":"2024-09-05T23:18:00Z","published":"2024-08-30T01:56:57Z","title":"UserSumBench: A Benchmark Framework for Evaluating User Summarization\n Approaches","summary":" Large language models (LLMs) have shown remarkable capabilities in generating\nuser summaries from a long list of raw user activity data. These summaries\ncapture essential user information such as preferences and interests, and\ntherefore are invaluable for LLM-based personalization applications, such as\nexplainable recommender systems. However, the development of new summarization\ntechniques is hindered by the lack of ground-truth labels, the inherent\nsubjectivity of user summaries, and human evaluation which is often costly and\ntime-consuming. To address these challenges, we introduce \\UserSumBench, a\nbenchmark framework designed to facilitate iterative development of LLM-based\nsummarization approaches. This framework offers two key components: (1) A\nreference-free summary quality metric. We show that this metric is effective\nand aligned with human preferences across three diverse datasets (MovieLens,\nYelp and Amazon Review). (2) A novel robust summarization method that leverages\ntime-hierarchical summarizer and self-critique verifier to produce high-quality\nsummaries while eliminating hallucination. This method serves as a strong\nbaseline for further innovation in summarization techniques.\n","authors":["Chao Wang","Neo Wu","Lin Ning","Jiaxing Wu","Luyang Liu","Jun Xie","Shawn O'Banion","Bradley Green"],"pdf_url":"https://arxiv.org/pdf/2408.16966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16990v2","updated":"2024-09-05T22:36:44Z","published":"2024-02-26T19:49:54Z","title":"inGRASS: Incremental Graph Spectral Sparsification via\n Low-Resistance-Diameter Decomposition","summary":" This work presents inGRASS, a novel algorithm designed for incremental\nspectral sparsification of large undirected graphs. The proposed inGRASS\nalgorithm is highly scalable and parallel-friendly, having a nearly-linear time\ncomplexity for the setup phase and the ability to update the spectral\nsparsifier in $O(\\log N)$ time for each incremental change made to the original\ngraph with $N$ nodes. A key component in the setup phase of inGRASS is a\nmultilevel resistance embedding framework introduced for efficiently\nidentifying spectrally-critical edges and effectively detecting redundant ones,\nwhich is achieved by decomposing the initial sparsifier into many node clusters\nwith bounded effective-resistance diameters leveraging a\nlow-resistance-diameter decomposition (LRD) scheme. The update phase of inGRASS\nexploits low-dimensional node embedding vectors for efficiently estimating the\nimportance and uniqueness of each newly added edge. 
As demonstrated through\nextensive experiments, inGRASS achieves up to over $200 \\times$ speedups while\nretaining comparable solution quality in incremental spectral sparsification of\ngraphs obtained from various datasets, such as circuit simulations, finite\nelement analysis, and social networks.\n","authors":["Ali Aghdaei","Zhuo Feng"],"pdf_url":"https://arxiv.org/pdf/2402.16990v2.pdf","comment":"Accepted on DAC 2024"},{"id":"http://arxiv.org/abs/2409.03924v1","updated":"2024-09-05T22:08:28Z","published":"2024-09-05T22:08:28Z","title":"Generating High Dimensional User-Specific Wireless Channels using\n Diffusion Models","summary":" Deep neural network (DNN)-based algorithms are emerging as an important tool\nfor many physical and MAC layer functions in future wireless communication\nsystems, including for large multi-antenna channels. However, training such\nmodels typically requires a large dataset of high-dimensional channel\nmeasurements, which are very difficult and expensive to obtain. This paper\nintroduces a novel method for generating synthetic wireless channel data using\ndiffusion-based models to produce user-specific channels that accurately\nreflect real-world wireless environments. Our approach employs a conditional\ndenoising diffusion implicit model (cDDIM) framework, effectively capturing\nthe relationship between user location and multi-antenna channel\ncharacteristics. We generate synthetic high-fidelity channel samples using user\npositions as conditional inputs, creating larger augmented datasets to overcome\nmeasurement scarcity. The utility of this method is demonstrated through its\nefficacy in training various downstream tasks such as channel compression and\nbeam alignment. Our approach significantly improves over prior methods, such as\nadding noise or using generative adversarial networks (GANs), especially in\nscenarios with limited initial measurements.\n","authors":["Taekyun Lee","Juseong Park","Hyeji Kim","Jeffrey G. Andrews"],"pdf_url":"https://arxiv.org/pdf/2409.03924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03916v1","updated":"2024-09-05T21:24:03Z","published":"2024-09-05T21:24:03Z","title":"A Survey on Signed Graph Embedding: Methods and Applications","summary":" A signed graph (SG) is a graph whose edges carry sign information. The sign\nof an edge can be positive, negative, or neutral. Signed networks are\nubiquitous in real-world settings such as social networks, citation networks,\nand various technical networks. Many network embedding models have been\nproposed and developed for signed networks of both homogeneous and\nheterogeneous types. SG embedding learns low-dimensional vector\nrepresentations for nodes of a network, which helps with many network analysis\ntasks such as link prediction, node classification, and community detection. In\nthis survey, we perform a comprehensive study of SG embedding methods and\napplications. We introduce here the basic theories and methods of SGs and\nsurvey the current state of the art of signed graph embedding methods. In\naddition, we explore the applications of different types of SG embedding\nmethods in real-world scenarios. As an application, we have explored the\ncitation network to analyze authorship networks. We also provide source code\nand datasets to give future direction. 
Lastly, we explore the challenges of SG\nembedding and forecast various future research directions in this field.\n","authors":["Shrabani Ghosh"],"pdf_url":"https://arxiv.org/pdf/2409.03916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03915v1","updated":"2024-09-05T21:23:51Z","published":"2024-09-05T21:23:51Z","title":"Asynchronous Stochastic Approximation and Average-Reward Reinforcement\n Learning","summary":" This paper studies asynchronous stochastic approximation (SA) algorithms and\ntheir application to reinforcement learning in semi-Markov decision processes\n(SMDPs) with an average-reward criterion. We first extend Borkar and Meyn's\nstability proof method to accommodate more general noise conditions, leading to\nbroader convergence guarantees for asynchronous SA algorithms. Leveraging these\nresults, we establish the convergence of an asynchronous SA analogue of\nSchweitzer's classical relative value iteration algorithm, RVI Q-learning, for\nfinite-space, weakly communicating SMDPs. Furthermore, to fully utilize the SA\nresults in this application, we introduce new monotonicity conditions for\nestimating the optimal reward rate in RVI Q-learning. These conditions\nsubstantially expand the previously considered algorithmic framework, and we\naddress them with novel proof arguments in the stability and convergence\nanalysis of RVI Q-learning.\n","authors":["Huizhen Yu","Yi Wan","Richard S. Sutton"],"pdf_url":"https://arxiv.org/pdf/2409.03915v1.pdf","comment":"The materials in this paper extend the authors' results from 2023,\n reported in arXiv:2408.16262 and arXiv:2312.15091. This paper incorporates\n and subsumes the results of arXiv:2312.15091 and serves as Part II of\n arXiv:2408.16262"},{"id":"http://arxiv.org/abs/2407.09050v2","updated":"2024-09-05T21:17:13Z","published":"2024-07-12T07:18:05Z","title":"Refusing Safe Prompts for Multi-modal Large Language Models","summary":" Multimodal large language models (MLLMs) have become the cornerstone of\ntoday's generative AI ecosystem, sparking intense competition among tech giants\nand startups. In particular, an MLLM generates a text response given a prompt\nconsisting of an image and a question. While state-of-the-art MLLMs use safety\nfilters and alignment techniques to refuse unsafe prompts, in this work, we\nintroduce MLLM-Refusal, the first method that induces refusals for safe\nprompts. In particular, our MLLM-Refusal optimizes a nearly-imperceptible\nrefusal perturbation and adds it to an image, causing target MLLMs to likely\nrefuse a safe prompt containing the perturbed image and a safe question.\nSpecifically, we formulate MLLM-Refusal as a constrained optimization problem\nand propose an algorithm to solve it. Our method offers competitive advantages\nfor MLLM model providers by potentially disrupting user experiences of\ncompeting MLLMs, since competing MLLM's users will receive unexpected refusals\nwhen they unwittingly use these perturbed images in their prompts. We evaluate\nMLLM-Refusal on four MLLMs across four datasets, demonstrating its\neffectiveness in causing competing MLLMs to refuse safe prompts while not\naffecting non-competing MLLMs. Furthermore, we explore three potential\ncountermeasures-adding Gaussian noise, DiffPure, and adversarial training. Our\nresults show that though they can mitigate MLLM-Refusal's effectiveness, they\nalso sacrifice the accuracy and/or efficiency of the competing MLLM. 
The code\nis available at https://github.com/Sadcardation/MLLM-Refusal.\n","authors":["Zedian Shao","Hongbin Liu","Yuepeng Hu","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2407.09050v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09237v3","updated":"2024-09-05T21:16:28Z","published":"2024-08-17T16:06:14Z","title":"QEDCartographer: Automating Formal Verification Using Reward-Free\n Reinforcement Learning","summary":" Formal verification is a promising method for producing reliable software,\nbut the difficulty of manually writing verification proofs severely limits its\nutility in practice. Recent methods have automated some proof synthesis by\nguiding a search through the proof space using a theorem prover. Unfortunately,\nthe theorem prover provides only the crudest estimate of progress, resulting in\neffectively undirected search. To address this problem, we create\nQEDCartographer, an automated proof-synthesis tool that combines supervised and\nreinforcement learning to more effectively explore the proof space.\nQEDCartographer incorporates the proofs' branching structure, enabling\nreward-free search and overcoming the sparse reward problem inherent to formal\nverification. We evaluate QEDCartographer using the CoqGym benchmark of 68.5K\ntheorems from 124 open-source Coq projects. QEDCartographer fully automatically\nproves 21.4% of the test-set theorems. Previous search-based proof-synthesis\ntools Tok, Tac, ASTactic, Passport, and Proverbot9001, which rely only on\nsupervised learning, prove 9.6%, 9.8%, 10.9%, 12.5%, and 19.8%, respectively.\nDiva, which combines 62 tools, proves 19.2%. Comparing to the most effective\nprior tool, Proverbot9001, QEDCartographer produces 34% shorter proofs 29%\nfaster, on average over the theorems both tools prove. Together,\nQEDCartographer and non-learning-based CoqHammer prove 30.3% of the theorems,\nwhile CoqHammer alone proves 26.6%. Our work demonstrates that reinforcement\nlearning is a fruitful research direction for improving proof-synthesis tools'\nsearch mechanisms.\n","authors":["Alex Sanchez-Stern","Abhishek Varghese","Zhanna Kaufman","Dylan Zhang","Talia Ringer","Yuriy Brun"],"pdf_url":"https://arxiv.org/pdf/2408.09237v3.pdf","comment":"Published in the International Conference on Software Engineering\n (ICSE) 2025: Alex Sanchez-Stern, Abhishek Varghese, Zhanna Kaufman, Dylan\n Zhang, Talia Ringer, and Yuriy Brun, QEDCartographer: Automating Formal\n Verification Using Reward-Free Reinforcement Learning, in Proceedings of the\n 47th International Conference on Software Engineering (ICSE), 2025"},{"id":"http://arxiv.org/abs/2312.01397v3","updated":"2024-09-05T20:29:23Z","published":"2023-12-03T13:50:24Z","title":"Visual Prompting Upgrades Neural Network Sparsification: A Data-Model\n Perspective","summary":" The rapid development of large-scale deep learning models questions the\naffordability of hardware platforms, which necessitates the pruning to reduce\ntheir computational and memory footprints. Sparse neural networks as the\nproduct, have demonstrated numerous favorable benefits like low complexity,\nundamaged generalization, etc. Most of the prominent pruning strategies are\ninvented from a model-centric perspective, focusing on searching and preserving\ncrucial weights by analyzing network topologies. However, the role of data and\nits interplay with model-centric pruning has remained relatively unexplored. 
In\nthis research, we introduce a novel data-model co-design perspective: to\npromote superior weight sparsity by learning important model topology and\nadequate input data in a synergetic manner. Specifically, customized Visual\nPrompts are mounted to upgrade neural Network sparsification in our proposed\nVPNs framework. As a pioneering effort, this paper conducts systematic\ninvestigations about the impact of different visual prompts on model pruning\nand suggests an effective joint optimization approach. Extensive experiments\nwith 3 network architectures and 8 datasets evidence the substantial\nperformance improvements from VPNs over existing state-of-the-art pruning\nalgorithms. Furthermore, we find that subnetworks discovered by VPNs from\npre-trained models enjoy better transferability across diverse downstream\nscenarios. These insights shed light on new promising possibilities of\ndata-model co-designs for vision model sparsification.\n","authors":["Can Jin","Tianjin Huang","Yihua Zhang","Mykola Pechenizkiy","Sijia Liu","Shiwei Liu","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2312.01397v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03902v1","updated":"2024-09-05T20:22:01Z","published":"2024-09-05T20:22:01Z","title":"WaterMAS: Sharpness-Aware Maximization for Neural Network Watermarking","summary":" Nowadays, deep neural networks are used for solving complex tasks in several\ncritical applications and protecting both their integrity and intellectual\nproperty rights (IPR) has become of utmost importance. To this end, we advance\nWaterMAS, a substitutive, white-box neural network watermarking method that\nimproves the trade-off among robustness, imperceptibility, and computational\ncomplexity, while making provisions for increased data payload and security.\nWaterMAS insertion keeps unchanged the watermarked weights while sharpening\ntheir underlying gradient space. The robustness is thus ensured by limiting the\nattack's strength: even small alterations of the watermarked weights would\nimpact the model's performance. The imperceptibility is ensured by inserting\nthe watermark during the training process. The relationship among the WaterMAS\ndata payload, imperceptibility, and robustness properties is discussed. The\nsecret key is represented by the positions of the weights conveying the\nwatermark, randomly chosen through multiple layers of the model. The security\nis evaluated by investigating the case in which an attacker would intercept the\nkey. The experimental validations consider 5 models and 2 tasks (VGG16,\nResNet18, MobileNetV3, SwinT for CIFAR10 image classification, and DeepLabV3\nfor Cityscapes image segmentation) as well as 4 types of attacks (Gaussian\nnoise addition, pruning, fine-tuning, and quantization). 
The code will be\nreleased open-source upon acceptance of the article.\n","authors":["Carl De Sousa Trias","Mihai Mitrea","Attilio Fiandrotti","Marco Cagnazzo","Sumanta Chaudhuri","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2409.03902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04472v3","updated":"2024-09-05T20:11:40Z","published":"2024-01-09T10:22:23Z","title":"A Survey on Efficient Federated Learning Methods for Foundation Model\n Training","summary":" Federated Learning (FL) has become an established technique to facilitate\nprivacy-preserving collaborative training across a multitude of clients.\nHowever, new approaches to FL often discuss their contributions involving small\ndeep-learning models only and focus on training full models on clients. In the\nwake of Foundation Models (FM), the reality is different for many deep learning\napplications. Typically, FMs have already been pre-trained across a wide\nvariety of tasks and can be fine-tuned to specific downstream tasks over\nsignificantly smaller datasets than required for full model training. However,\naccess to such datasets is often challenging. By its design, FL can help to\nopen data silos. With this survey, we introduce a novel taxonomy focused on\ncomputational and communication efficiency, the vital elements to make use of\nFMs in FL systems. We discuss the benefits and drawbacks of parameter-efficient\nfine-tuning (PEFT) for FL applications, elaborate on the readiness of FL\nframeworks to work with FMs, and provide future research opportunities on how\nto evaluate generative models in FL as well as the interplay of privacy and\nPEFT.\n","authors":["Herbert Woisetschläger","Alexander Isenko","Shiqiang Wang","Ruben Mayer","Hans-Arno Jacobsen"],"pdf_url":"https://arxiv.org/pdf/2401.04472v3.pdf","comment":"Accepted for publication at IJCAI 2024. Please cite the published\n paper via https://doi.org/10.24963/ijcai.2024/919"},{"id":"http://arxiv.org/abs/2403.18582v2","updated":"2024-09-05T20:11:14Z","published":"2024-03-27T14:03:41Z","title":"One flow to correct them all: improving simulations in high-energy\n physics with a single normalising flow and a switch","summary":" Simulated events are key ingredients in almost all high-energy physics\nanalyses. However, imperfections in the simulation can lead to sizeable\ndifferences between the observed data and simulated events. The effects of such\nmismodelling on relevant observables must be corrected either effectively via\nscale factors, with weights or by modifying the distributions of the\nobservables and their correlations. We introduce a correction method that\ntransforms one multidimensional distribution (simulation) into another one\n(data) using a simple architecture based on a single normalising flow with a\nboolean condition. 
We demonstrate the effectiveness of the method on a\nphysics-inspired toy dataset with non-trivial mismodelling of several\nobservables and their correlations.\n","authors":["Caio Cesar Daumann","Mauro Donega","Johannes Erdmann","Massimiliano Galli","Jan Lukas Späh","Davide Valsecchi"],"pdf_url":"https://arxiv.org/pdf/2403.18582v2.pdf","comment":"19 pages, 12 figures, Dataset:\n https://doi.org/10.5281/zenodo.13305706"},{"id":"http://arxiv.org/abs/2409.03897v1","updated":"2024-09-05T20:09:56Z","published":"2024-09-05T20:09:56Z","title":"On the Convergence Rates of Federated Q-Learning across Heterogeneous\n Environments","summary":" Large-scale multi-agent systems are often deployed across wide geographic\nareas, where agents interact with heterogeneous environments. There is an\nemerging interest in understanding the role of heterogeneity in the performance\nof the federated versions of classic reinforcement learning algorithms. In this\npaper, we study synchronous federated Q-learning, which aims to learn an\noptimal Q-function by having $K$ agents average their local Q-estimates per $E$\niterations. We observe an interesting phenomenon on the convergence speeds in\nterms of $K$ and $E$. Similar to the homogeneous environment settings, there is\na linear speed-up concerning $K$ in reducing the errors that arise from\nsampling randomness. Yet, in sharp contrast to the homogeneous settings, $E>1$\nleads to significant performance degradation. Specifically, we provide a\nfine-grained characterization of the error evolution in the presence of\nenvironmental heterogeneity, which decay to zero as the number of iterations\n$T$ increases. The slow convergence of having $E>1$ turns out to be fundamental\nrather than an artifact of our analysis. We prove that, for a wide range of\nstepsizes, the $\\ell_{\\infty}$ norm of the error cannot decay faster than\n$\\Theta (E/T)$. In addition, our experiments demonstrate that the convergence\nexhibits an interesting two-phase phenomenon. For any given stepsize, there is\na sharp phase-transition of the convergence: the error decays rapidly in the\nbeginning yet later bounces up and stabilizes. Provided that the\nphase-transition time can be estimated, choosing different stepsizes for the\ntwo phases leads to faster overall convergence.\n","authors":["Muxing Wang","Pengkun Yang","Lili Su"],"pdf_url":"https://arxiv.org/pdf/2409.03897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03893v1","updated":"2024-09-05T19:59:42Z","published":"2024-09-05T19:59:42Z","title":"Understanding Fairness Metrics in Recommender Systems: A Healthcare\n Perspective","summary":" Fairness in AI-driven decision-making systems has become a critical concern,\nespecially when these systems directly affect human lives. This paper explores\nthe public's comprehension of fairness in healthcare recommendations. We\nconducted a survey where participants selected from four fairness metrics --\nDemographic Parity, Equal Accuracy, Equalized Odds, and Positive Predictive\nValue -- across different healthcare scenarios to assess their understanding of\nthese concepts. Our findings reveal that fairness is a complex and often\nmisunderstood concept, with a generally low level of public understanding\nregarding fairness metrics in recommender systems. This study highlights the\nneed for enhanced information and education on algorithmic fairness to support\ninformed decision-making in using these systems. 
Furthermore, the results\nsuggest that a one-size-fits-all approach to fairness may be insufficient,\npointing to the importance of context-sensitive designs in developing equitable\nAI systems.\n","authors":["Veronica Kecki","Alan Said"],"pdf_url":"https://arxiv.org/pdf/2409.03893v1.pdf","comment":"Accepted to the 18th ACM Conference on Recommender Systems"},{"id":"http://arxiv.org/abs/2409.03892v1","updated":"2024-09-05T19:59:14Z","published":"2024-09-05T19:59:14Z","title":"Active Sampling of Interpolation Points to Identify Dominant Subspaces\n for Model Reduction","summary":" Model reduction is an active research field to construct low-dimensional\nsurrogate models of high fidelity to accelerate engineering design cycles. In\nthis work, we investigate model reduction for linear structured systems using\ndominant reachable and observable subspaces. When the training set $-$\ncontaining all possible interpolation points $-$ is large, then these subspaces\ncan be determined by solving many large-scale linear systems. However, for\nhigh-fidelity models, this easily becomes computationally intractable. To\ncircumvent this issue, in this work, we propose an active sampling strategy to\nsample only a few points from the given training set, which can allow us to\nestimate those subspaces accurately. To this end, we formulate the\nidentification of the subspaces as the solution of the generalized Sylvester\nequations, guiding us to select the most relevant samples from the training set\nto achieve our goals. Consequently, we construct solutions of the matrix\nequations in low-rank forms, which encode subspace information. We extensively\ndiscuss computational aspects and efficient usage of the low-rank factors in\nthe process of obtaining reduced-order models. We illustrate the proposed\nactive sampling scheme to obtain reduced-order models via dominant reachable\nand observable subspaces and present its comparison with the method where all\nthe points from the training set are taken into account. It is shown that the\nactive sample strategy can provide us $17$x speed-up without sacrificing any\nnoticeable accuracy.\n","authors":["Celine Reddig","Pawan Goyal","Igor Pontes Duff","Peter Benner"],"pdf_url":"https://arxiv.org/pdf/2409.03892v1.pdf","comment":"20 pages, 9 figures"},{"id":"http://arxiv.org/abs/2409.03891v1","updated":"2024-09-05T19:58:58Z","published":"2024-09-05T19:58:58Z","title":"Overfitting Behaviour of Gaussian Kernel Ridgeless Regression: Varying\n Bandwidth or Dimensionality","summary":" We consider the overfitting behavior of minimum norm interpolating solutions\nof Gaussian kernel ridge regression (i.e. kernel ridgeless regression), when\nthe bandwidth or input dimension varies with the sample size. For fixed\ndimensions, we show that even with varying or tuned bandwidth, the ridgeless\nsolution is never consistent and, at least with large enough noise, always\nworse than the null predictor. For increasing dimension, we give a generic\ncharacterization of the overfitting behavior for any scaling of the dimension\nwith sample size. 
We use this to provide the first example of benign\noverfitting using the Gaussian kernel with sub-polynomial scaling dimension.\nAll our results are under the Gaussian universality ansatz and the\n(non-rigorous) risk predictions in terms of the kernel eigenstructure.\n","authors":["Marko Medvedev","Gal Vardi","Nathan Srebro"],"pdf_url":"https://arxiv.org/pdf/2409.03891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09164v3","updated":"2024-09-05T19:56:09Z","published":"2024-02-14T13:30:02Z","title":"Less is More: Fewer Interpretable Region via Submodular Subset Selection","summary":" Image attribution algorithms aim to identify important regions that are\nhighly relevant to model decisions. Although existing attribution solutions can\neffectively assign importance to target elements, they still face the following\nchallenges: 1) existing attribution methods generate inaccurate small regions\nthus misleading the direction of correct attribution, and 2) the model cannot\nproduce good attribution results for samples with wrong predictions. To address\nthe above challenges, this paper re-models the above image attribution problem\nas a submodular subset selection problem, aiming to enhance model\ninterpretability using fewer regions. To address the lack of attention to local\nregions, we construct a novel submodular function to discover more accurate\nsmall interpretation regions. To enhance the attribution effect for all\nsamples, we also impose four different constraints on the selection of\nsub-regions, i.e., confidence, effectiveness, consistency, and collaboration\nscores, to assess the importance of various subsets. Moreover, our theoretical\nanalysis substantiates that the proposed function is in fact submodular.\nExtensive experiments show that the proposed method outperforms SOTA methods on\ntwo face datasets (Celeb-A and VGG-Face2) and one fine-grained dataset\n(CUB-200-2011). For correctly predicted samples, the proposed method improves\nthe Deletion and Insertion scores with an average of 4.9% and 2.5% gain\nrelative to HSIC-Attribution. For incorrectly predicted samples, our method\nachieves gains of 81.0% and 18.4% compared to the HSIC-Attribution algorithm in\nthe average highest confidence and Insertion score respectively. The code is\nreleased at https://github.com/RuoyuChen10/SMDL-Attribution.\n","authors":["Ruoyu Chen","Hua Zhang","Siyuan Liang","Jingzhi Li","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2402.09164v3.pdf","comment":"Accepted to ICLR 2024 (Oral)"},{"id":"http://arxiv.org/abs/2409.03887v1","updated":"2024-09-05T19:50:26Z","published":"2024-09-05T19:50:26Z","title":"The Influence of Faulty Labels in Data Sets on Human Pose Estimation","summary":" In this study we provide empirical evidence demonstrating that the quality of\ntraining data impacts model performance in Human Pose Estimation (HPE).\nInaccurate labels in widely used data sets, ranging from minor errors to severe\nmislabeling, can negatively influence learning and distort performance metrics.\nWe perform an in-depth analysis of popular HPE data sets to show the extent and\nnature of label inaccuracies. Our findings suggest that accounting for the\nimpact of faulty labels will facilitate the development of more robust and\naccurate HPE models for a variety of real-world applications. 
We show improved\nperformance with cleansed data.\n","authors":["Arnold Schwarz","Levente Hernadi","Felix Bießmann","Kristian Hildebrand"],"pdf_url":"https://arxiv.org/pdf/2409.03887v1.pdf","comment":"15 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2406.09246v3","updated":"2024-09-05T19:46:34Z","published":"2024-06-13T15:46:55Z","title":"OpenVLA: An Open-Source Vision-Language-Action Model","summary":" Large policies pretrained on a combination of Internet-scale vision-language\ndata and diverse robot demonstrations have the potential to change how we teach\nrobots new skills: rather than training new behaviors from scratch, we can\nfine-tune such vision-language-action (VLA) models to obtain robust,\ngeneralizable policies for visuomotor control. Yet, widespread adoption of VLAs\nfor robotics has been challenging as 1) existing VLAs are largely closed and\ninaccessible to the public, and 2) prior work fails to explore methods for\nefficiently fine-tuning VLAs for new tasks, a key component for adoption.\nAddressing these challenges, we introduce OpenVLA, a 7B-parameter open-source\nVLA trained on a diverse collection of 970k real-world robot demonstrations.\nOpenVLA builds on a Llama 2 language model combined with a visual encoder that\nfuses pretrained features from DINOv2 and SigLIP. As a product of the added\ndata diversity and new model components, OpenVLA demonstrates strong results\nfor generalist manipulation, outperforming closed models such as RT-2-X (55B)\nby 16.5% in absolute task success rate across 29 tasks and multiple robot\nembodiments, with 7x fewer parameters. We further show that we can effectively\nfine-tune OpenVLA for new settings, with especially strong generalization\nresults in multi-task environments involving multiple objects and strong\nlanguage grounding abilities, and outperform expressive from-scratch imitation\nlearning methods such as Diffusion Policy by 20.4%. We also explore compute\nefficiency; as a separate contribution, we show that OpenVLA can be fine-tuned\non consumer GPUs via modern low-rank adaptation methods and served efficiently\nvia quantization without a hit to downstream success rate. Finally, we release\nmodel checkpoints, fine-tuning notebooks, and our PyTorch codebase with\nbuilt-in support for training VLAs at scale on Open X-Embodiment datasets.\n","authors":["Moo Jin Kim","Karl Pertsch","Siddharth Karamcheti","Ted Xiao","Ashwin Balakrishna","Suraj Nair","Rafael Rafailov","Ethan Foster","Grace Lam","Pannag Sanketi","Quan Vuong","Thomas Kollar","Benjamin Burchfiel","Russ Tedrake","Dorsa Sadigh","Sergey Levine","Percy Liang","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2406.09246v3.pdf","comment":"Website: https://openvla.github.io/"},{"id":"http://arxiv.org/abs/2409.03874v1","updated":"2024-09-05T19:22:33Z","published":"2024-09-05T19:22:33Z","title":"Cost-Control in Display Advertising: Theory vs Practice","summary":" In display advertising, advertisers want to achieve a marketing objective\nwith constraints on budget and cost-per-outcome. This is usually formulated as\nan optimization problem that maximizes the total utility under constraints. The\noptimization is carried out in an online fashion in the dual space - for an\nincoming Ad auction, a bid is placed using an optimal bidding formula, assuming\noptimal values for the dual variables; based on the outcome of the previous\nauctions, the dual variables are updated in an online fashion. 
While this\napproach is theoretically sound, in practice, the dual variables are not\noptimal from the beginning, but rather converge over time. Specifically, for\nthe cost-constraint, the convergence is asymptotic. As a result, we find that\ncost-control is ineffective. In this work, we analyse the shortcomings of the\noptimal bidding formula and propose a modification that deviates from the\ntheoretical derivation. We simulate various practical scenarios and study the\ncost-control behaviors of the two algorithms. Through a large-scale evaluation\non the real-world data, we show that the proposed modification reduces the cost\nviolations by 50%, thereby achieving a better cost-control than the theoretical\nbidding formula.\n","authors":["Anoop R Katti","Rui C. Gonçalves","Rinchin Iakovlev"],"pdf_url":"https://arxiv.org/pdf/2409.03874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11867v2","updated":"2024-09-05T19:19:59Z","published":"2024-07-16T15:52:36Z","title":"Unlearning Targeted Information via Single Layer Unlearning Gradient","summary":" Unauthorized privacy-related and copyrighted content generation using\ngenerative-AI is becoming a significant concern for human society, raising\nethical, legal, and privacy issues that demand urgent attention. The EU's\nGeneral Data Protection Regulation (GDPR) includes a \"right to be forgotten,\"\nwhich allows individuals to request the deletion of their personal data.\nHowever, this primarily applies to data stored in traditional databases, not AI\nmodels. Recently, machine unlearning techniques have arisen that attempt to\neliminate the influence of sensitive content used during AI model training, but\nthey often require extensive updates to the deployed systems and incur\nsubstantial computational costs. In this work, we propose a novel and efficient\nmethod called Single Layer Unlearning Gradient (SLUG), which can unlearn\ntargeted information by updating targeted layers of a model using a one-time\ngradient computation. Our method is highly modular and enables the selective\nremoval of multiple sensitive concepts, such as celebrity names and copyrighted\ncontent, from the generated outputs of widely used foundation models (e.g.,\nCLIP) and generative models (e.g., Stable Diffusion). Broadly, our method\nensures AI-generated content complies with privacy regulations and intellectual\nproperty laws, fostering responsible use of generative models, mitigating legal\nrisks and promoting a trustworthy, socially responsible AI ecosystem.\n","authors":["Zikui Cai","Yaoteng Tan","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2407.11867v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03863v1","updated":"2024-09-05T19:00:18Z","published":"2024-09-05T19:00:18Z","title":"Can We Theoretically Quantify the Impacts of Local Updates on the\n Generalization Performance of Federated Learning?","summary":" Federated Learning (FL) has gained significant popularity due to its\neffectiveness in training machine learning models across diverse sites without\nrequiring direct data sharing. While various algorithms along with their\noptimization analyses have shown that FL with local updates is a\ncommunication-efficient distributed learning framework, the generalization\nperformance of FL with local updates has received comparatively less attention.\nThis lack of investigation can be attributed to the complex interplay between\ndata heterogeneity and infrequent communication due to the local updates within\nthe FL framework. 
This motivates us to investigate a fundamental question in\nFL: Can we quantify the impact of data heterogeneity and local updates on the\ngeneralization performance for FL as the learning process evolves? To this end,\nwe conduct a comprehensive theoretical study of FL's generalization performance\nusing a linear model as the first step, where the data heterogeneity is\nconsidered for both the stationary and online/non-stationary cases. By\nproviding closed-form expressions of the model error, we rigorously quantify\nthe impact of the number of the local updates (denoted as $K$) under three\nsettings ($K=1$, $K<\\infty$, and $K=\\infty$) and show how the generalization\nperformance evolves with the number of rounds $t$. Our investigation also\nprovides a comprehensive understanding of how different configurations\n(including the number of model parameters $p$ and the number of training\nsamples $n$) contribute to the overall generalization performance, thus\nshedding new insights (such as benign overfitting) for implementing FL over\nnetworks.\n","authors":["Peizhong Ju","Haibo Yang","Jia Liu","Yingbin Liang","Ness Shroff"],"pdf_url":"https://arxiv.org/pdf/2409.03863v1.pdf","comment":"Published in MobiHoc 2024"},{"id":"http://arxiv.org/abs/2404.08839v4","updated":"2024-09-05T18:36:34Z","published":"2024-04-12T22:57:01Z","title":"Multiply-Robust Causal Change Attribution","summary":" Comparing two samples of data, we observe a change in the distribution of an\noutcome variable. In the presence of multiple explanatory variables, how much\nof the change can be explained by each possible cause? We develop a new\nestimation strategy that, given a causal model, combines regression and\nre-weighting methods to quantify the contribution of each causal mechanism. Our\nproposed methodology is multiply robust, meaning that it still recovers the\ntarget parameter under partial misspecification. We prove that our estimator is\nconsistent and asymptotically normal. Moreover, it can be incorporated into\nexisting frameworks for causal attribution, such as Shapley values, which will\ninherit the consistency and large-sample distribution properties. Our method\ndemonstrates excellent performance in Monte Carlo simulations, and we show its\nusefulness in an empirical application. Our method is implemented as part of\nthe Python library DoWhy (arXiv:2011.04216, arXiv:2206.06821).\n","authors":["Victor Quintas-Martinez","Mohammad Taha Bahadori","Eduardo Santiago","Jeff Mu","Dominik Janzing","David Heckerman"],"pdf_url":"https://arxiv.org/pdf/2404.08839v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03845v1","updated":"2024-09-05T18:14:22Z","published":"2024-09-05T18:14:22Z","title":"Latent Space Energy-based Neural ODEs","summary":" This paper introduces a novel family of deep dynamical models designed to\nrepresent continuous-time sequence data. This family of models generates each\ndata point in the time series by a neural emission model, which is a non-linear\ntransformation of a latent state vector. The trajectory of the latent states is\nimplicitly described by a neural ordinary differential equation (ODE), with the\ninitial state following an informative prior distribution parameterized by an\nenergy-based model. Furthermore, we can extend this model to disentangle\ndynamic states from underlying static factors of variation, represented as\ntime-invariant variables in the latent space. 
We train the model using maximum\nlikelihood estimation with Markov chain Monte Carlo (MCMC) in an end-to-end\nmanner, without requiring additional assisting components such as an inference\nnetwork. Our experiments on oscillating systems, videos and real-world state\nsequences (MuJoCo) illustrate that ODEs with the learnable energy-based prior\noutperform existing counterparts, and can generalize to new dynamic\nparameterization, enabling long-horizon predictions.\n","authors":["Sheng Cheng","Deqian Kong","Jianwen Xie","Kookjin Lee","Ying Nian Wu","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2409.03845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03817v1","updated":"2024-09-05T18:00:00Z","published":"2024-09-05T18:00:00Z","title":"Neural Entropy","summary":" We examine the connection between deep learning and information theory\nthrough the paradigm of diffusion models. Using well-established principles\nfrom non-equilibrium thermodynamics we can characterize the amount of\ninformation required to reverse a diffusive process. Neural networks store this\ninformation and operate in a manner reminiscent of Maxwell's demon during the\ngenerative stage. We illustrate this cycle using a novel diffusion scheme we\ncall the entropy matching model, wherein the information conveyed to the\nnetwork during training exactly corresponds to the entropy that must be negated\nduring reversal. We demonstrate that this entropy can be used to analyze the\nencoding efficiency and storage capacity of the network. This conceptual\npicture blends elements of stochastic optimal control, thermodynamics,\ninformation theory, and optimal transport, and raises the prospect of applying\ndiffusion models as a test bench to understand neural networks.\n","authors":["Akhil Premkumar"],"pdf_url":"https://arxiv.org/pdf/2409.03817v1.pdf","comment":"37 pages + references, 11 figures"},{"id":"http://arxiv.org/abs/2409.03810v1","updated":"2024-09-05T17:46:30Z","published":"2024-09-05T17:46:30Z","title":"How Do Your Code LLMs Perform? Empowering Code Instruction Tuning with\n High-Quality Data","summary":" Recently, there has been a growing interest in studying how to construct\nbetter code instruction tuning data. However, we observe that code models trained\nwith these datasets exhibit high performance on HumanEval but perform worse on\nother benchmarks such as LiveCodeBench. Upon further investigation, we find\nthat many datasets suffer from severe data leakage. After cleaning up most of\nthe leaked data, some well-known high-quality datasets perform poorly. This\ndiscovery reveals a new challenge: identifying which datasets genuinely qualify\nas high-quality code instruction data. To address this, we propose an efficient\ncode data pruning strategy for selecting good samples. Our approach is based on\nthree dimensions: instruction complexity, response quality, and instruction\ndiversity. Based on our selected data, we present XCoder, a family of models\nfinetuned from LLaMA3. Our experiments show XCoder achieves new\nstate-of-the-art performance using fewer training data, which verifies the\neffectiveness of our data strategy. Moreover, we perform a comprehensive\nanalysis on the data composition and find existing code datasets have different\ncharacteristics according to their construction methods, which provide new\ninsights for future code LLMs. 
Our models and dataset are released in\nhttps://github.com/banksy23/XCoder\n","authors":["Yejie Wang","Keqing He","Dayuan Fu","Zhuoma Gongque","Heyang Xu","Yanxu Chen","Zhexu Wang","Yujia Fu","Guanting Dong","Muxi Diao","Jingang Wang","Mengdi Zhang","Xunliang Cai","Weiran Xu"],"pdf_url":"https://arxiv.org/pdf/2409.03810v1.pdf","comment":"Working in progress"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.03605v1","updated":"2024-09-05T15:11:40Z","published":"2024-09-05T15:11:40Z","title":"SegTalker: Segmentation-based Talking Face Generation with Mask-guided\n Local Editing","summary":" Audio-driven talking face generation aims to synthesize video with lip\nmovements synchronized to input audio. However, current generative techniques\nface challenges in preserving intricate regional textures (skin, teeth). To\naddress the aforementioned challenges, we propose a novel framework called\nSegTalker to decouple lip movements and image textures by introducing\nsegmentation as intermediate representation. Specifically, given the mask of\nimage employed by a parsing network, we first leverage the speech to drive the\nmask and generate talking segmentation. Then we disentangle semantic regions of\nimage into style codes using a mask-guided encoder. Ultimately, we inject the\npreviously generated talking segmentation and style codes into a mask-guided\nStyleGAN to synthesize video frame. In this way, most of textures are fully\npreserved. Moreover, our approach can inherently achieve background separation\nand facilitate mask-guided facial local editing. In particular, by editing the\nmask and swapping the region textures from a given reference image (e.g. hair,\nlip, eyebrows), our approach enables facial editing seamlessly when generating\ntalking face video. Experiments demonstrate that our proposed approach can\neffectively preserve texture details and generate temporally consistent video\nwhile remaining competitive in lip synchronization. Quantitative and\nqualitative results on the HDTF and MEAD datasets illustrate the superior\nperformance of our method over existing methods.\n","authors":["Lingyu Xiong","Xize Cheng","Jintao Tan","Xianjia Wu","Xiandong Li","Lei Zhu","Fei Ma","Minglei Li","Huang Xu","Zhihu Hu"],"pdf_url":"https://arxiv.org/pdf/2409.03605v1.pdf","comment":"10 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2409.03385v1","updated":"2024-09-05T09:44:43Z","published":"2024-09-05T09:44:43Z","title":"Make Graph-based Referring Expression Comprehension Great Again through\n Expression-guided Dynamic Gating and Regression","summary":" One common belief is that with complex models and pre-training on large-scale\ndatasets, transformer-based methods for referring expression comprehension\n(REC) perform much better than existing graph-based methods. We observe that\nsince most graph-based methods adopt an off-the-shelf detector to locate\ncandidate objects (i.e., regions detected by the object detector), they face\ntwo challenges that result in subpar performance: (1) the presence of\nsignificant noise caused by numerous irrelevant objects during reasoning, and\n(2) inaccurate localization outcomes attributed to the provided detector. To\naddress these issues, we introduce a plug-and-adapt module guided by\nsub-expressions, called dynamic gate constraint (DGC), which can adaptively\ndisable irrelevant proposals and their connections in graphs during reasoning.\nWe further introduce an expression-guided regression strategy (EGR) to refine\nlocation prediction. 
Extensive experimental results on the RefCOCO, RefCOCO+,\nRefCOCOg, Flickr30K, RefClef, and Ref-reasoning datasets demonstrate the\neffectiveness of the DGC module and the EGR strategy in consistently boosting\nthe performances of various graph-based REC methods. Without any pretraining,\nthe proposed graph-based method achieves better performance than the\nstate-of-the-art (SOTA) transformer-based methods.\n","authors":["Jingcheng Ke","Dele Wang","Jun-Cheng Chen","I-Hong Jhuo","Chia-Wen Lin","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2409.03385v1.pdf","comment":"12 pages to appear in IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2409.03336v1","updated":"2024-09-05T08:28:36Z","published":"2024-09-05T08:28:36Z","title":"Estimating Indoor Scene Depth Maps from Ultrasonic Echoes","summary":" Measuring 3D geometric structures of indoor scenes requires dedicated depth\nsensors, which are not always available. Echo-based depth estimation has\nrecently been studied as a promising alternative solution. All previous studies\nhave assumed the use of echoes in the audible range. However, one major problem\nis that audible echoes cannot be used in quiet spaces or other situations where\nproducing audible sounds is prohibited. In this paper, we consider echo-based\ndepth estimation using inaudible ultrasonic echoes. While ultrasonic waves\nprovide high measurement accuracy in theory, the actual depth estimation\naccuracy when ultrasonic echoes are used has remained unclear, due to its\ndisadvantage of being sensitive to noise and susceptible to attenuation. We\nfirst investigate the depth estimation accuracy when the frequency of the sound\nsource is restricted to the high-frequency band, and found that the accuracy\ndecreased when the frequency was limited to ultrasonic ranges. Based on this\nobservation, we propose a novel deep learning method to improve the accuracy of\nultrasonic echo-based depth estimation by using audible echoes as auxiliary\ndata only during training. Experimental results with a public dataset\ndemonstrate that our method improves the estimation accuracy.\n","authors":["Junpei Honma","Akisato Kimura","Go Irie"],"pdf_url":"https://arxiv.org/pdf/2409.03336v1.pdf","comment":"ICIP 2024"},{"id":"http://arxiv.org/abs/2404.13993v4","updated":"2024-09-05T02:21:42Z","published":"2024-04-22T08:59:35Z","title":"Zero-Shot Character Identification and Speaker Prediction in Comics via\n Iterative Multimodal Fusion","summary":" Recognizing characters and predicting speakers of dialogue are critical for\ncomic processing tasks, such as voice generation or translation. However,\nbecause characters vary by comic title, supervised learning approaches like\ntraining character classifiers which require specific annotations for each\ncomic title are infeasible. This motivates us to propose a novel zero-shot\napproach, allowing machines to identify characters and predict speaker names\nbased solely on unannotated comic images. In spite of their importance in\nreal-world applications, these tasks have largely remained unexplored due to\nchallenges in story comprehension and multimodal integration. Recent large\nlanguage models (LLMs) have shown great capability for text understanding and\nreasoning, while their application to multimodal content analysis is still an\nopen problem. To address this problem, we propose an iterative multimodal\nframework, the first to employ multimodal information for both character\nidentification and speaker prediction tasks. 
Our experiments demonstrate the\neffectiveness of the proposed framework, establishing a robust baseline for\nthese tasks. Furthermore, since our method requires no training data or\nannotations, it can be used as-is on any comic series.\n","authors":["Yingxuan Li","Ryota Hinami","Kiyoharu Aizawa","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2404.13993v4.pdf","comment":"Accepted to ACM Multimedia 2024. Project page:\n https://liyingxuan1012.github.io/zeroshot-speaker-prediction ; Github repo:\n https://github.com/liyingxuan1012/zeroshot-speaker-prediction"},{"id":"http://arxiv.org/abs/2409.03902v1","updated":"2024-09-05T20:22:01Z","published":"2024-09-05T20:22:01Z","title":"WaterMAS: Sharpness-Aware Maximization for Neural Network Watermarking","summary":" Nowadays, deep neural networks are used for solving complex tasks in several\ncritical applications and protecting both their integrity and intellectual\nproperty rights (IPR) has become of utmost importance. To this end, we advance\nWaterMAS, a substitutive, white-box neural network watermarking method that\nimproves the trade-off among robustness, imperceptibility, and computational\ncomplexity, while making provisions for increased data payload and security.\nWaterMAS insertion keeps unchanged the watermarked weights while sharpening\ntheir underlying gradient space. The robustness is thus ensured by limiting the\nattack's strength: even small alterations of the watermarked weights would\nimpact the model's performance. The imperceptibility is ensured by inserting\nthe watermark during the training process. The relationship among the WaterMAS\ndata payload, imperceptibility, and robustness properties is discussed. The\nsecret key is represented by the positions of the weights conveying the\nwatermark, randomly chosen through multiple layers of the model. The security\nis evaluated by investigating the case in which an attacker would intercept the\nkey. The experimental validations consider 5 models and 2 tasks (VGG16,\nResNet18, MobileNetV3, SwinT for CIFAR10 image classification, and DeepLabV3\nfor Cityscapes image segmentation) as well as 4 types of attacks (Gaussian\nnoise addition, pruning, fine-tuning, and quantization). The code will be\nreleased open-source upon acceptance of the article.\n","authors":["Carl De Sousa Trias","Mihai Mitrea","Attilio Fiandrotti","Marco Cagnazzo","Sumanta Chaudhuri","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2409.03902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03844v1","updated":"2024-09-05T18:12:11Z","published":"2024-09-05T18:12:11Z","title":"MetaBGM: Dynamic Soundtrack Transformation For Continuous Multi-Scene\n Experiences With Ambient Awareness And Personalization","summary":" This paper introduces MetaBGM, a groundbreaking framework for generating\nbackground music that adapts to dynamic scenes and real-time user interactions.\nWe define multi-scene as variations in environmental contexts, such as\ntransitions in game settings or movie scenes. To tackle the challenge of\nconverting backend data into music description texts for audio generation\nmodels, MetaBGM employs a novel two-stage generation approach that transforms\ncontinuous scene and user state data into these texts, which are then fed into\nan audio generation model for real-time soundtrack creation. 
Experimental\nresults demonstrate that MetaBGM effectively generates contextually relevant\nand dynamic background music for interactive applications.\n","authors":["Haoxuan Liu","Zihao Wang","Haorong Hong","Youwei Feng","Jiaxin Yu","Han Diao","Yunfei Xu","Kejun Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03844v1.pdf","comment":null}]},"2024-09-06T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.04421v1","updated":"2024-09-06T17:30:45Z","published":"2024-09-06T17:30:45Z","title":"RLPF: Reinforcement Learning from Prediction Feedback for User\n Summarization with LLMs","summary":" LLM-powered personalization agent systems employ Large Language Models (LLMs)\nto predict users' behavior from their past activities. However, their\neffectiveness often hinges on the ability to effectively leverage extensive,\nlong user historical data due to its inherent noise and length of such data.\nExisting pretrained LLMs may generate summaries that are concise but lack the\nnecessary context for downstream tasks, hindering their utility in\npersonalization systems. To address these challenges, we introduce\nReinforcement Learning from Prediction Feedback (RLPF). RLPF fine-tunes LLMs to\ngenerate concise, human-readable user summaries that are optimized for\ndownstream task performance. By maximizing the usefulness of the generated\nsummaries, RLPF effectively distills extensive user history data while\npreserving essential information for downstream tasks. Our empirical evaluation\ndemonstrates significant improvements in both extrinsic downstream task utility\nand intrinsic summary quality, surpassing baseline methods by up to 22% on\ndownstream task performance and achieving an up to 84.59% win rate on\nFactuality, Abstractiveness, and Readability. RLPF also achieves a remarkable\n74% reduction in context length while improving performance on 16 out of 19\nunseen tasks and/or datasets, showcasing its generalizability. This approach\noffers a promising solution for enhancing LLM personalization by effectively\ntransforming long, noisy user histories into informative and human-readable\nrepresentations.\n","authors":["Jiaxing Wu","Lin Ning","Luyang Liu","Harrison Lee","Neo Wu","Chao Wang","Sushant Prakash","Shawn O'Banion","Bradley Green","Jun Xie"],"pdf_url":"https://arxiv.org/pdf/2409.04421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04384v1","updated":"2024-09-06T16:20:24Z","published":"2024-09-06T16:20:24Z","title":"Empirical Bayesian image restoration by Langevin sampling with a\n denoising diffusion implicit prior","summary":" Score-based diffusion methods provide a powerful strategy to solve image\nrestoration tasks by flexibly combining a pre-trained foundational prior model\nwith a likelihood function specified during test time. Such methods are\npredominantly derived from two stochastic processes: reversing\nOrnstein-Uhlenbeck, which underpins the celebrated denoising diffusion\nprobabilistic models (DDPM) and denoising diffusion implicit models (DDIM), and\nthe Langevin diffusion process. The solutions delivered by DDPM and DDIM are\noften remarkably realistic, but they are not always consistent with\nmeasurements because of likelihood intractability issues and the associated\nrequired approximations. Alternatively, using a Langevin process circumvents\nthe intractable likelihood issue, but usually leads to restoration results of\ninferior quality and longer computing times. 
This paper presents a novel and\nhighly computationally efficient image restoration method that carefully embeds\na foundational DDPM denoiser within an empirical Bayesian Langevin algorithm,\nwhich jointly calibrates key model hyper-parameters as it estimates the model's\nposterior mean. Extensive experimental results on three canonical tasks (image\ndeblurring, super-resolution, and inpainting) demonstrate that the proposed\napproach improves on state-of-the-art strategies both in image estimation\naccuracy and computing time.\n","authors":["Charlesquin Kemajou Mbakam","Jean-Francois Giovannelli","Marcelo Pereyra"],"pdf_url":"https://arxiv.org/pdf/2409.04384v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2409.00077v2","updated":"2024-09-06T16:12:00Z","published":"2024-08-24T09:26:59Z","title":"Are LLM-based methods good enough for detecting unfair terms of service?","summary":" Countless terms of service (ToS) are being signed every day by users all over\nthe world while interacting with all kinds of apps and websites. More often\nthan not, these online contracts spanning double-digit pages are signed blindly\nby users who simply want immediate access to the desired service. What would\nnormally require a consultation with a legal team has now become a mundane\nactivity consisting of a few clicks where users potentially sign away their\nrights, for instance in terms of their data privacy, to countless online\nentities/companies. Large language models (LLMs) are good at parsing long\ntext-based documents, and could potentially be adopted to help users when\ndealing with dubious clauses in ToS and their underlying privacy policies. To\ninvestigate the utility of existing models for this task, we first build a\ndataset consisting of 12 questions applied individually to a set of privacy\npolicies crawled from popular websites. Thereafter, a series of open-source as\nwell as commercial chatbots such as ChatGPT are queried over each question,\nwith the answers being compared to a given ground truth. Our results show that\nsome open-source models are able to provide a higher accuracy compared to some\ncommercial models. However, the best performance is recorded from a commercial\nchatbot (ChatGPT4). Overall, all models perform only slightly better than\nrandom at this task. Consequently, their performance needs to be significantly\nimproved before they can be adopted at large for this purpose.\n","authors":["Mirgita Frasheri","Arian Bakhtiarnia","Lukas Esterle","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2409.00077v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04340v1","updated":"2024-09-06T15:18:12Z","published":"2024-09-06T15:18:12Z","title":"AGR: Age Group fairness Reward for Bias Mitigation in LLMs","summary":" LLMs can exhibit age biases, resulting in unequal treatment of individuals\nacross age groups. While much research has addressed racial and gender biases,\nage bias remains little explored. The scarcity of instruction-tuning and\npreference datasets for age bias hampers its detection and measurement, and\nexisting fine-tuning methods seldom address age-related fairness. In this\npaper, we construct age bias preference datasets and instruction-tuning\ndatasets for RLHF. We introduce AGR, an age fairness reward to reduce\ndifferences in the response quality of LLMs across different age groups.\nExtensive experiments demonstrate that this reward significantly improves\nresponse accuracy and reduces performance disparities across age groups. 
Our\nsource code and datasets are available at the anonymous\n\\href{https://anonymous.4open.science/r/FairRLHF-D445/readme.md}{link}.\n","authors":["Shuirong Cao","Ruoxi Cheng","Zhiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04340v1.pdf","comment":"The first two authors contributed equally to this work. Corresponding\n to Zhiqiang Wang. ACKNOWLEDGMENT: we would like to thank the computing\n resources support from the State Key Laboratory of New Computer Software\n Technologies at Nanjing University"},{"id":"http://arxiv.org/abs/2409.04318v1","updated":"2024-09-06T14:46:37Z","published":"2024-09-06T14:46:37Z","title":"Learning vs Retrieval: The Role of In-Context Examples in Regression\n with LLMs","summary":" Generative Large Language Models (LLMs) are capable of being in-context\nlearners. However, the underlying mechanism of in-context learning (ICL) is\nstill a major research question, and experimental research results about how\nmodels exploit ICL are not always consistent. In this work, we propose a\nframework for evaluating in-context learning mechanisms, which we claim are a\ncombination of retrieving internal knowledge and learning from in-context\nexamples by focusing on regression tasks. First, we show that LLMs can perform\nregression on real-world datasets and then design experiments to measure the\nextent to which the LLM retrieves its internal knowledge versus learning from\nin-context examples. We argue that this process lies on a spectrum between\nthese two extremes. We provide an in-depth analysis of the degrees to which\nthese mechanisms are triggered depending on various factors, such as prior\nknowledge about the tasks and the type and richness of the information provided\nby the in-context examples. We employ three LLMs and utilize multiple datasets\nto corroborate the robustness of our findings. Our results shed light on how to\nengineer prompts to leverage meta-learning from in-context examples and foster\nknowledge retrieval depending on the problem being addressed.\n","authors":["Aliakbar Nafar","Kristen Brent Venable","Parisa Kordjamshidi"],"pdf_url":"https://arxiv.org/pdf/2409.04318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03708v2","updated":"2024-09-06T14:18:20Z","published":"2024-09-05T17:14:23Z","title":"RAG based Question-Answering for Contextual Response Prediction System","summary":" Large Language Models (LLMs) have shown versatility in various Natural\nLanguage Processing (NLP) tasks, including their potential as effective\nquestion-answering systems. However, to provide precise and relevant\ninformation in response to specific customer queries in industry settings, LLMs\nrequire access to a comprehensive knowledge base to avoid hallucinations.\nRetrieval Augmented Generation (RAG) emerges as a promising technique to\naddress this challenge. Yet, developing an accurate question-answering\nframework for real-world applications using RAG entails several challenges: 1)\ndata availability issues, 2) evaluating the quality of generated content, and\n3) the costly nature of human evaluation. In this paper, we introduce an\nend-to-end framework that employs LLMs with RAG capabilities for industry use\ncases. Given a customer query, the proposed system retrieves relevant knowledge\ndocuments and leverages them, along with previous chat history, to generate\nresponse suggestions for customer service agents in the contact centers of a\nmajor retail company. 
Through comprehensive automated and human evaluations, we\nshow that this solution outperforms the current BERT-based algorithms in\naccuracy and relevance. Our findings suggest that RAG-based LLMs can be an\nexcellent support to human customer service representatives by lightening their\nworkload.\n","authors":["Sriram Veturi","Saurabh Vaichal","Reshma Lal Jagadheesh","Nafis Irtiza Tripto","Nian Yan"],"pdf_url":"https://arxiv.org/pdf/2409.03708v2.pdf","comment":"Accepted at the 1st Workshop on GenAI and RAG Systems for Enterprise,\n CIKM'24. 6 pages"},{"id":"http://arxiv.org/abs/2409.04286v1","updated":"2024-09-06T13:53:28Z","published":"2024-09-06T13:53:28Z","title":"Using Large Language Models to Generate Authentic Multi-agent Knowledge\n Work Datasets","summary":" Current publicly available knowledge work data collections lack diversity,\nextensive annotations, and contextual information about the users and their\ndocuments. These issues hinder objective and comparable data-driven evaluations\nand optimizations of knowledge work assistance systems. Due to the considerable\nresources needed to collect such data in real-life settings and the necessity\nof data censorship, collecting such a dataset appears nearly impossible. For\nthis reason, we propose a configurable, multi-agent knowledge work dataset\ngenerator. This system simulates collaborative knowledge work among agents\nproducing Large Language Model-generated documents and accompanying data\ntraces. Additionally, the generator captures all background information, given\nin its configuration or created during the simulation process, in a knowledge\ngraph. Finally, the resulting dataset can be utilized and shared without\nprivacy or confidentiality concerns.\n This paper introduces our approach's design and vision and focuses on\ngenerating authentic knowledge work documents using Large Language Models. Our\nstudy involving human raters who assessed 53% of the generated and 74% of the\nreal documents as realistic demonstrates the potential of our approach.\nFurthermore, we analyze the authenticity criteria mentioned in the\nparticipants' comments and elaborate on potential improvements for identified\ncommon issues.\n","authors":["Desiree Heim","Christian Jilek","Adrian Ulges","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2409.04286v1.pdf","comment":"Accepted and in press (INFORMATIK Festival, Wiesbaden, 2024)"},{"id":"http://arxiv.org/abs/2406.17324v2","updated":"2024-09-06T13:52:57Z","published":"2024-06-25T07:15:10Z","title":"Delving into the Utilisation of ChatGPT in Scientific Publications in\n Astronomy","summary":" Rapid progress in the capabilities of machine learning approaches in natural\nlanguage processing has culminated in the rise of large language models over\nthe last two years. Recent works have shown unprecedented adoption of these for\nacademic writing, especially in some fields, but their pervasiveness in\nastronomy has not been studied sufficiently. To remedy this, we extract words\nthat ChatGPT uses more often than humans when generating academic text and\nsearch a total of 1 million articles for them. This way, we assess the\nfrequency of word occurrence in published works in astronomy tracked by the\nNASA Astrophysics Data System since 2000. We then perform a statistical\nanalysis of the occurrences. We identify a list of words favoured by ChatGPT\nand find a statistically significant increase for these words against a control\ngroup in 2024, which matches the trend in other disciplines. 
These results\nsuggest a widespread adoption of these models in the writing of astronomy\npapers. We encourage organisations, publishers, and researchers to work\ntogether to identify ethical and pragmatic guidelines to maximise the benefits\nof these systems while maintaining scientific rigour.\n","authors":["Simone Astarita","Sandor Kruk","Jan Reerink","Pablo Gómez"],"pdf_url":"https://arxiv.org/pdf/2406.17324v2.pdf","comment":"Accepted at SPAICE 2024"},{"id":"http://arxiv.org/abs/2409.04269v1","updated":"2024-09-06T13:25:18Z","published":"2024-09-06T13:25:18Z","title":"Open Language Data Initiative: Advancing Low-Resource Machine\n Translation for Karakalpak","summary":" This study presents several contributions for the Karakalpak language: a\nFLORES+ devtest dataset translated to Karakalpak, parallel corpora for\nUzbek-Karakalpak, Russian-Karakalpak and English-Karakalpak of 100,000 pairs\neach and open-sourced fine-tuned neural models for translation across these\nlanguages. Our experiments compare different model variants and training\napproaches, demonstrating improvements over existing baselines. This work,\nconducted as part of the Open Language Data Initiative (OLDI) shared task, aims\nto advance machine translation capabilities for Karakalpak and contribute to\nexpanding linguistic diversity in NLP technologies.\n","authors":["Mukhammadsaid Mamasaidov","Abror Shopulatov"],"pdf_url":"https://arxiv.org/pdf/2409.04269v1.pdf","comment":"Submitted to WMT 2024"},{"id":"http://arxiv.org/abs/2409.04267v1","updated":"2024-09-06T13:24:22Z","published":"2024-09-06T13:24:22Z","title":"An overview of domain-specific foundation model: key technologies,\n applications and challenges","summary":" The impressive performance of ChatGPT and other foundation-model-based\nproducts in human language understanding has prompted both academia and\nindustry to explore how these models can be tailored for specific industries\nand application scenarios. This process, known as the customization of\ndomain-specific foundation models, addresses the limitations of general-purpose\nmodels, which may not fully capture the unique patterns and requirements of\ndomain-specific data. Despite its importance, there is a notable lack of\ncomprehensive overview papers on building domain-specific foundation models,\nwhile numerous resources exist for general-purpose models. To bridge this gap,\nthis article provides a timely and thorough overview of the methodology for\ncustomizing domain-specific foundation models. It introduces basic concepts,\noutlines the general architecture, and surveys key methods for constructing\ndomain-specific models. Furthermore, the article discusses various domains that\ncan benefit from these specialized models and highlights the challenges ahead.\nThrough this overview, we aim to offer valuable guidance and reference for\nresearchers and practitioners from diverse fields to develop their own\ncustomized foundation models.\n","authors":["Haolong Chen","Hanzhi Chen","Zijian Zhao","Kaifeng Han","Guangxu Zhu","Yichen Zhao","Ying Du","Wei Xu","Qingjiang Shi"],"pdf_url":"https://arxiv.org/pdf/2409.04267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13492v2","updated":"2024-09-06T12:28:31Z","published":"2024-07-18T13:20:53Z","title":"Enhancing Biomedical Knowledge Discovery for Diseases: An Open-Source\n Framework Applied on Rett Syndrome and Alzheimer's Disease","summary":" The ever-growing volume of biomedical publications creates a critical need\nfor efficient knowledge discovery. 
In this context, we introduce an open-source\nend-to-end framework designed to construct knowledge around specific diseases\ndirectly from raw text. To facilitate research in disease-related knowledge\ndiscovery, we create two annotated datasets focused on Rett syndrome and\nAlzheimer's disease, enabling the identification of semantic relations between\nbiomedical entities. Extensive benchmarking explores various ways to represent\nrelations and entity representations, offering insights into optimal modeling\nstrategies for semantic relation detection and highlighting language models'\ncompetence in knowledge discovery. We also conduct probing experiments using\ndifferent layer representations and attention scores to explore transformers'\nability to capture semantic relations.\n","authors":["Christos Theodoropoulos","Andrei Catalin Coman","James Henderson","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2407.13492v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2409.04206v1","updated":"2024-09-06T11:53:37Z","published":"2024-09-06T11:53:37Z","title":"Fast Forwarding Low-Rank Training","summary":" Parameter efficient finetuning methods like low-rank adaptation (LoRA) aim to\nreduce the computational costs of finetuning pretrained Language Models (LMs).\nEnabled by these low-rank settings, we propose an even more efficient\noptimization strategy: Fast Forward, a simple and effective approach to\naccelerate large segments of training. In a Fast Forward stage, we repeat the\nmost recent optimizer step until the loss stops improving on a tiny validation\nset. By alternating between regular optimization steps and Fast Forward stages,\nFast Forward provides up to an 87\\% reduction in FLOPs and up to an 81\\%\nreduction in train time over standard SGD with Adam. We validate Fast Forward\nby finetuning various models on different tasks and demonstrate that it speeds\nup training without compromising model performance. Additionally, we analyze\nwhen and how to apply Fast Forward.\n","authors":["Adir Rahamim","Naomi Saphra","Sara Kangaslahti","Yonatan Belinkov"],"pdf_url":"https://arxiv.org/pdf/2409.04206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08632v2","updated":"2024-09-06T11:20:13Z","published":"2024-08-16T09:52:02Z","title":"A Survey on Benchmarks of Multimodal Large Language Models","summary":" Multimodal Large Language Models (MLLMs) are gaining increasing popularity in\nboth academia and industry due to their remarkable performance in various\napplications such as visual question answering, visual perception,\nunderstanding, and reasoning. Over the past few years, significant efforts have\nbeen made to examine MLLMs from multiple perspectives. This paper presents a\ncomprehensive review of 200 benchmarks and evaluations for MLLMs, focusing on\n(1)perception and understanding, (2)cognition and reasoning, (3)specific\ndomains, (4)key capabilities, and (5)other modalities. Finally, we discuss the\nlimitations of the current evaluation methods for MLLMs and explore promising\nfuture directions. Our key argument is that evaluation should be regarded as a\ncrucial discipline to support the development of MLLMs better. 
For more\ndetails, please visit our GitHub repository:\nhttps://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.\n","authors":["Jian Li","Weiheng Lu","Hao Fei","Meng Luo","Ming Dai","Min Xia","Yizhang Jin","Zhenye Gan","Ding Qi","Chaoyou Fu","Ying Tai","Wankou Yang","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04185v1","updated":"2024-09-06T11:01:55Z","published":"2024-09-06T11:01:55Z","title":"Residual Stream Analysis with Multi-Layer SAEs","summary":" Sparse autoencoders (SAEs) are a promising approach to interpreting the\ninternal representations of transformer language models. However, standard SAEs\nare trained separately on each transformer layer, making it difficult to use\nthem to study how information flows across layers. To solve this problem, we\nintroduce the multi-layer SAE (MLSAE): a single SAE trained on the residual\nstream activation vectors from every transformer layer simultaneously. The\nresidual stream is usually understood as preserving information across layers,\nso we expected to, and did, find individual SAE features that are active at\nmultiple layers. Interestingly, while a single SAE feature is active at\ndifferent layers for different prompts, for a single prompt, we find that a\nsingle feature is far more likely to be active at a single layer. For larger\nunderlying models, we find that the cosine similarities between adjacent layers\nin the residual stream are higher, so we expect more features to be active at\nmultiple layers. These results show that MLSAEs are a promising method to study\ninformation flow in transformers. We release our code to train and analyze\nMLSAEs at https://github.com/tim-lawson/mlsae.\n","authors":["Tim Lawson","Lucy Farnik","Conor Houghton","Laurence Aitchison"],"pdf_url":"https://arxiv.org/pdf/2409.04185v1.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.04183v1","updated":"2024-09-06T10:57:34Z","published":"2024-09-06T10:57:34Z","title":"GALLa: Graph Aligned Large Language Models for Improved Source Code\n Understanding","summary":" Programming languages possess rich semantic information such as data flow\nthat is represented by graphs and not available from the surface form of source\ncode. Recent code language models have scaled to billions of parameters, but\nmodel source code solely as text tokens while ignoring any other structural\ninformation. Conversely, models that do encode structural information of code\nmake modifications to the Transformer architecture, limiting their scale and\ncompatibility with pretrained LLMs. In this work, we take the best of both\nworlds with GALLa - Graph Aligned Large Language Model. GALLa utilizes graph\nneural networks and cross-modal alignment technologies to inject the structural\ninformation of code into LLMs as an auxiliary task during finetuning. This\nframework is both model-agnostic and task-agnostic, as it can be applied to any\ncode LLM for any code downstream task, and requires the structural graph data\nonly at training time from a corpus unrelated to the finetuning data, while\nincurring no cost at inference time over the baseline LLM. 
Experiments on five\ncode tasks with four different baseline LLMs ranging in size from 350M to 8B\nvalidate the effectiveness of GALLa, demonstrating consistent improvement over\nthe baseline, even for powerful models such as LLaMA3.\n","authors":["Ziyin Zhang","Hang Yu","Shijie Li","Peng Di","Jianguo Li","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04181v1","updated":"2024-09-06T10:49:46Z","published":"2024-09-06T10:49:46Z","title":"Combining LLMs and Knowledge Graphs to Reduce Hallucinations in Question\n Answering","summary":" Advancements in natural language processing have revolutionized the way we\ncan interact with digital information systems, such as databases, making them\nmore accessible. However, challenges persist, especially when accuracy is\ncritical, as in the biomedical domain. A key issue is the hallucination\nproblem, where models generate information unsupported by the underlying data,\npotentially leading to dangerous misinformation. This paper presents a novel\napproach designed to bridge this gap by combining Large Language Models (LLM)\nand Knowledge Graphs (KG) to improve the accuracy and reliability of\nquestion-answering systems, on the example of a biomedical KG. Built on the\nLangChain framework, our method incorporates a query checker that ensures the\nsyntactical and semantic validity of LLM-generated queries, which are then used\nto extract information from a Knowledge Graph, substantially reducing errors\nlike hallucinations. We evaluated the overall performance using a new benchmark\ndataset of 50 biomedical questions, testing several LLMs, including GPT-4 Turbo\nand llama3:70b. Our results indicate that while GPT-4 Turbo outperforms other\nmodels in generating accurate queries, open-source models like llama3:70b show\npromise with appropriate prompt engineering. To make this approach accessible,\na user-friendly web-based interface has been developed, allowing users to input\nnatural language queries, view generated and corrected Cypher queries, and\nverify the resulting paths for accuracy. Overall, this hybrid approach\neffectively addresses common issues such as data gaps and hallucinations,\noffering a reliable and intuitive solution for question answering systems. The\nsource code for generating the results of this paper and for the user-interface\ncan be found in our Git repository: https://git.zib.de/lpusch/cyphergenkg-gui\n","authors":["Larissa Pusch","Tim O. F. Conrad"],"pdf_url":"https://arxiv.org/pdf/2409.04181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02795v2","updated":"2024-09-06T10:30:36Z","published":"2024-09-04T15:11:55Z","title":"Towards a Unified View of Preference Learning for Large Language Models:\n A Survey","summary":" Large Language Models (LLMs) exhibit remarkably powerful capabilities. One of\nthe crucial factors to achieve success is aligning the LLM's output with human\npreferences. This alignment process often requires only a small amount of data\nto efficiently enhance the LLM's performance. While effective, research in this\narea spans multiple domains, and the methods involved are relatively complex to\nunderstand. The relationships between different methods have been\nunder-explored, limiting the development of the preference alignment. 
In light\nof this, we break down the existing popular alignment strategies into different\ncomponents and provide a unified framework to study the current alignment\nstrategies, thereby establishing connections among them. In this survey, we\ndecompose all the strategies in preference learning into four components:\nmodel, data, feedback, and algorithm. This unified view offers an in-depth\nunderstanding of existing alignment algorithms and also opens up possibilities\nto synergize the strengths of different strategies. Furthermore, we present\ndetailed working examples of prevalent existing algorithms to facilitate a\ncomprehensive understanding for the readers. Finally, based on our unified\nperspective, we explore the challenges and future research directions for\naligning large language models with human preferences.\n","authors":["Bofei Gao","Feifan Song","Yibo Miao","Zefan Cai","Zhe Yang","Liang Chen","Helan Hu","Runxin Xu","Qingxiu Dong","Ce Zheng","Wen Xiao","Ge Zhang","Daoguang Zan","Keming Lu","Bowen Yu","Dayiheng Liu","Zeyu Cui","Jian Yang","Lei Sha","Houfeng Wang","Zhifang Sui","Peiyi Wang","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2409.02795v2.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.07747v2","updated":"2024-09-06T10:19:10Z","published":"2024-03-12T15:32:39Z","title":"FineMath: A Fine-Grained Mathematical Evaluation Benchmark for Chinese\n Large Language Models","summary":" To thoroughly assess the mathematical reasoning abilities of Large Language\nModels (LLMs), we need to carefully curate evaluation datasets covering diverse\nmathematical concepts and mathematical problems at different difficulty levels.\nIn pursuit of this objective, we propose FineMath in this paper, a fine-grained\nmathematical evaluation benchmark dataset for assessing Chinese LLMs. FineMath\nis created to cover the major key mathematical concepts taught in elementary\nschool math, which are further divided into 17 categories of math word\nproblems, enabling in-depth analysis of mathematical reasoning abilities of\nLLMs. All the 17 categories of math word problems are manually annotated with\ntheir difficulty levels according to the number of reasoning steps required to\nsolve these problems. We conduct extensive experiments on a wide range of LLMs\non FineMath and find that there is still considerable room for improvements in\nterms of mathematical reasoning capability of Chinese LLMs. We also carry out\nan in-depth analysis on the evaluation process and methods that have been\noverlooked previously. These two factors significantly influence the model\nresults and our understanding of their mathematical reasoning capabilities. The\ndataset will be publicly available soon.\n","authors":["Yan Liu","Renren Jin","Ling Shi","Zheng Yao","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2403.07747v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04168v1","updated":"2024-09-06T10:09:41Z","published":"2024-09-06T10:09:41Z","title":"From Calculation to Adjudication: Examining LLM judges on Mathematical\n Reasoning Tasks","summary":" To reduce the need for human annotations, large language models (LLMs) have\nbeen proposed as judges of the quality of other candidate models. LLM judges\nare typically evaluated by measuring the correlation with human judgments on\ngeneration tasks such as summarization or machine translation. In contrast, we\nstudy LLM judges on mathematical reasoning tasks. 
These tasks require\nmulti-step reasoning, and the correctness of their solutions is verifiable,\nenabling a more objective evaluation. We perform a detailed performance\nanalysis and find that the used judges are mostly unable to improve task\nperformance but are able to pick the better model. Our analysis uncovers a\nstrong correlation between judgment performance and the candidate model task\nperformance. We observe that judges tend to choose the model of higher quality\neven if its answer is incorrect. Further, we show that it is possible to use\nstatistics, such as the task performances of the individual models, to predict\njudgment performance. In an ablation, we either swap or mask the candidate\nanswers and observe that judges often keep the original judgment, providing\nevidence that judges incorporate writing style in their judgments. In summary,\nwe find that regularities in the judgments are quantifiable using statistical\nmeasures and provide various angles on exploiting them.\n","authors":["Andreas Stephan","Dawei Zhu","Matthias Aßenmacher","Xiaoyu Shen","Benjamin Roth"],"pdf_url":"https://arxiv.org/pdf/2409.04168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04164v1","updated":"2024-09-06T10:03:49Z","published":"2024-09-06T10:03:49Z","title":"Can OpenSource beat ChatGPT? -- A Comparative Study of Large Language\n Models for Text-to-Code Generation","summary":" In recent years, large language models (LLMs) have emerged as powerful tools\nwith potential applications in various fields, including software engineering.\nWithin the scope of this research, we evaluate five different state-of-the-art\nLLMs - Bard, BingChat, ChatGPT, Llama2, and Code Llama - concerning their\ncapabilities for text-to-code generation. In an empirical study, we feed\nprompts with textual descriptions of coding problems sourced from the\nprogramming website LeetCode to the models with the task of creating solutions\nin Python. Subsequently, the quality of the generated outputs is assessed using\nthe testing functionalities of LeetCode. The results indicate large differences\nin performance between the investigated models. ChatGPT can handle these\ntypical programming challenges by far the most effectively, surpassing even\ncode-specialized models like Code Llama. To gain further insights, we measure\nthe runtime as well as the memory usage of the generated outputs and compared\nthem to the other code submissions on Leetcode. A detailed error analysis,\nencompassing a comparison of the differences concerning correct indentation and\nform of the generated code as well as an assignment of the incorrectly solved\ntasks to certain error categories allows us to obtain a more nuanced picture of\nthe results and potential for improvement. The results also show a clear\npattern of increasingly incorrect produced code when the models are facing a\nlot of context in the form of longer prompts.\n","authors":["Luis Mayer","Christian Heumann","Matthias Aßenmacher"],"pdf_url":"https://arxiv.org/pdf/2409.04164v1.pdf","comment":"Conference Paper accepted at the 9th SwissText Conference (2024)"},{"id":"http://arxiv.org/abs/2409.03381v2","updated":"2024-09-06T09:37:36Z","published":"2024-09-05T09:33:24Z","title":"CogniDual Framework: Self-Training Large Language Models within a\n Dual-System Theoretical Framework for Improving Cognitive Tasks","summary":" Cognitive psychology investigates perception, attention, memory, language,\nproblem-solving, decision-making, and reasoning. 
Kahneman's dual-system theory\nelucidates the human decision-making process, distinguishing between the rapid,\nintuitive System 1 and the deliberative, rational System 2. Recent advancements\nhave positioned large language Models (LLMs) as formidable tools nearing\nhuman-level proficiency in various cognitive tasks. Nonetheless, the presence\nof a dual-system framework analogous to human cognition in LLMs remains\nunexplored. This study introduces the \\textbf{CogniDual Framework for LLMs}\n(CFLLMs), designed to assess whether LLMs can, through self-training, evolve\nfrom deliberate deduction to intuitive responses, thereby emulating the human\nprocess of acquiring and mastering new information. Our findings reveal the\ncognitive mechanisms behind LLMs' response generation, enhancing our\nunderstanding of their capabilities in cognitive psychology. Practically,\nself-trained models can provide faster responses to certain queries, reducing\ncomputational demands during inference.\n","authors":["Yongxin Deng","Xihe Qiu","Xiaoyu Tan","Chao Qu","Jing Pan","Yuan Cheng","Yinghui Xu","Wei Chu"],"pdf_url":"https://arxiv.org/pdf/2409.03381v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04150v1","updated":"2024-09-06T09:26:45Z","published":"2024-09-06T09:26:45Z","title":"A Coin Has Two Sides: A Novel Detector-Corrector Framework for Chinese\n Spelling Correction","summary":" Chinese Spelling Correction (CSC) stands as a foundational Natural Language\nProcessing (NLP) task, which primarily focuses on the correction of erroneous\ncharacters in Chinese texts. Certain existing methodologies opt to disentangle\nthe error correction process, employing an additional error detector to\npinpoint error positions. However, owing to the inherent performance\nlimitations of error detector, precision and recall are like two sides of the\ncoin which can not be both facing up simultaneously. Furthermore, it is also\nworth investigating how the error position information can be judiciously\napplied to assist the error correction. In this paper, we introduce a novel\napproach based on error detector-corrector framework. Our detector is designed\nto yield two error detection results, each characterized by high precision and\nrecall. Given that the occurrence of errors is context-dependent and detection\noutcomes may be less precise, we incorporate the error detection results into\nthe CSC task using an innovative feature fusion strategy and a selective\nmasking strategy. Empirical experiments conducted on mainstream CSC datasets\nsubstantiate the efficacy of our proposed method.\n","authors":["Xiangke Zeng","Zuchao Li","Lefei Zhang","Ping Wang","Hongqiu Wu","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.04150v1.pdf","comment":"ECAI-2024"},{"id":"http://arxiv.org/abs/2409.04122v1","updated":"2024-09-06T08:43:10Z","published":"2024-09-06T08:43:10Z","title":"Prompt-based Personality Profiling: Reinforcement Learning for Relevance\n Filtering","summary":" Author profiling is the task of inferring characteristics about individuals\nby analyzing content they share. Supervised machine learning still dominates\nautomatic systems that perform this task, despite the popularity of prompting\nlarge language models to address natural language understanding tasks. One\nreason is that the classification instances consist of large amounts of posts,\npotentially a whole user profile, which may exceed the input length of\nTransformers. 
Even if a model can use a large context window, the entirety of\nposts makes the application of API-accessed black box systems costly and slow,\nnext to issues which come with such \"needle-in-the-haystack\" tasks. To mitigate\nthis limitation, we propose a new method for author profiling which aims at\ndistinguishing relevant from irrelevant content first, followed by the actual\nuser profiling only with relevant data. To circumvent the need for\nrelevance-annotated data, we optimize this relevance filter via reinforcement\nlearning with a reward function that utilizes the zero-shot capabilities of\nlarge language models. We evaluate our method for Big Five personality trait\nprediction on two Twitter corpora. On publicly available real-world data with a\nskewed label distribution, our method shows similar efficacy to using all posts\nin a user profile, but with a substantially shorter context. An evaluation on a\nversion of these data balanced with artificial posts shows that the filtering\nto relevant posts leads to a significantly improved accuracy of the\npredictions.\n","authors":["Jan Hofmann","Cornelia Sindermann","Roman Klinger"],"pdf_url":"https://arxiv.org/pdf/2409.04122v1.pdf","comment":"preprint, under review, supplementary material will be made available\n upon acceptance of the paper"},{"id":"http://arxiv.org/abs/2409.04117v1","updated":"2024-09-06T08:35:28Z","published":"2024-09-06T08:35:28Z","title":"Confidence-Aware Document OCR Error Detection","summary":" Optical Character Recognition (OCR) continues to face accuracy challenges\nthat impact subsequent applications. To address these errors, we explore the\nutility of OCR confidence scores for enhancing post-OCR error detection. Our\nstudy involves analyzing the correlation between confidence scores and error\nrates across different OCR systems. We develop ConfBERT, a BERT-based model\nthat incorporates OCR confidence scores into token embeddings and offers an\noptional pre-training phase for noise adjustment. Our experimental results\ndemonstrate that integrating OCR confidence scores can enhance error detection\ncapabilities. This work underscores the importance of OCR confidence scores in\nimproving detection accuracy and reveals substantial disparities in performance\nbetween commercial and open-source OCR technologies.\n","authors":["Arthur Hemmer","Mickaël Coustaty","Nicola Bartolo","Jean-Marc Ogier"],"pdf_url":"https://arxiv.org/pdf/2409.04117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04114v1","updated":"2024-09-06T08:31:18Z","published":"2024-09-06T08:31:18Z","title":"Multi-Programming Language Ensemble for Code Generation in Large\n Language Model","summary":" Large language models (LLMs) have significantly improved code generation,\nparticularly in one-pass code generation. However, most existing approaches\nfocus solely on generating code in a single programming language, overlooking\nthe potential of leveraging the multi-language capabilities of LLMs. LLMs have\nvarying patterns of errors across different languages, suggesting that a more\nrobust approach could be developed by leveraging these multi-language outputs.\nIn this study, we propose Multi-Programming Language Ensemble (MPLE), a novel\nensemble-based method that utilizes code generation across multiple programming\nlanguages to enhance overall performance. 
By treating each language-specific\ncode generation process as an individual \"weak expert\" and effectively\nintegrating their outputs, our method mitigates language-specific errors and\nbiases. This multi-language ensemble strategy leverages the complementary\nstrengths of different programming languages, enabling the model to produce\nmore accurate and robust code. Our approach can be seamlessly integrated with\ncommonly used techniques such as the reflection algorithm and Monte Carlo tree\nsearch to improve code generation quality further. Experimental results show\nthat our framework consistently enhances baseline performance by up to 17.92%\non existing benchmarks (HumanEval and HumanEval-plus), with a standout result\nof 96.25% accuracy on the HumanEval benchmark, achieving new state-of-the-art\nresults across various LLM models. The code will be released at\nhttps://github.com/NinjaTech-AI/MPLE\n","authors":["Tengfei Xue","Xuefeng Li","Tahir Azim","Roman Smirnov","Jianhui Yu","Arash Sadrieh","Babak Pahlavan"],"pdf_url":"https://arxiv.org/pdf/2409.04114v1.pdf","comment":"Code available at https://github.com/NinjaTech-AI/MPLE"},{"id":"http://arxiv.org/abs/2407.03637v4","updated":"2024-09-06T08:28:01Z","published":"2024-07-04T05:13:58Z","title":"QET: Enhancing Quantized LLM Parameters and KV cache Compression through\n Element Substitution and Residual Clustering","summary":" The matrix quantization entails representing matrix elements in a more\nspace-efficient form to reduce storage usage, with dequantization restoring the\noriginal matrix for use. We formulate the Quantization Error Minimization (QEM)\nproblem as minimizing the distance between a matrix before and after\nquantization, under the condition that the quantized matrix occupies the same\nmemory space. Matrix quantization is crucial in various applications, including\nLarge Language Models (LLMs) weight quantization, vector databases, KV cache\nquantization, graph compression, and image compression. Recent advancements in\nLLMs, such as GPT-4 and BERT, have highlighted the importance of matrix\ncompression due to the large size of parameters and KV cache, which are stored\nas matrices.\n We propose Quantum Entanglement Trees (QET) to address the QEM problem by\nleveraging the local orderliness of matrix elements, involving iterative\nelement swapping to form a locally ordered matrix. This matrix is then grouped\nand quantized by columns. To enhance QET, we introduce two optimizations:\nfurther quantizing residuals to reduce MSE, and using masking and batch\nprocessing to accelerate the algorithm.\n Experimental results demonstrate that QET can effectively reduce MSE to\n5.05%, 13.33%, and 11.89% of the current best method on the LLM dataset, K\ncache, and V cache, respectively. Our contributions include the abstraction of\nthe QEM problem, the design of the QET algorithm, and the proposal of two\noptimizations to improve accuracy and speed.\n","authors":["Yanshu Wang","Wang Li","Zhaoqian Yao","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2407.03637v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04109v1","updated":"2024-09-06T08:25:03Z","published":"2024-09-06T08:25:03Z","title":"Can LLMs Generate Novel Research Ideas? 
A Large-Scale Human Study with\n 100+ NLP Researchers","summary":" Recent advancements in large language models (LLMs) have sparked optimism\nabout their potential to accelerate scientific discovery, with a growing number\nof works proposing research agents that autonomously generate and validate new\nideas. Despite this, no evaluations have shown that LLM systems can take the\nvery first step of producing novel, expert-level ideas, let alone perform the\nentire research process. We address this by establishing an experimental design\nthat evaluates research idea generation while controlling for confounders and\nperforms the first head-to-head comparison between expert NLP researchers and\nan LLM ideation agent. By recruiting over 100 NLP researchers to write novel\nideas and blind reviews of both LLM and human ideas, we obtain the first\nstatistically significant conclusion on current LLM capabilities for research\nideation: we find LLM-generated ideas are judged as more novel (p < 0.05) than\nhuman expert ideas while being judged slightly weaker on feasibility. Studying\nour agent baselines closely, we identify open problems in building and\nevaluating research agents, including failures of LLM self-evaluation and their\nlack of diversity in generation. Finally, we acknowledge that human judgements\nof novelty can be difficult, even by experts, and propose an end-to-end study\ndesign which recruits researchers to execute these ideas into full projects,\nenabling us to study whether these novelty and feasibility judgements result in\nmeaningful differences in research outcome.\n","authors":["Chenglei Si","Diyi Yang","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2409.04109v1.pdf","comment":"main paper is 20 pages"},{"id":"http://arxiv.org/abs/2409.04085v1","updated":"2024-09-06T07:53:33Z","published":"2024-09-06T07:53:33Z","title":"Structure and dynamics of growing networks of Reddit threads","summary":" Millions of people use online social networks to reinforce their sense of\nbelonging, for example by giving and asking for feedback as a form of social\nvalidation and self-recognition. It is common to observe disagreement among\npeople beliefs and points of view when expressing this feedback. Modeling and\nanalyzing such interactions is crucial to understand social phenomena that\nhappen when people face different opinions while expressing and discussing\ntheir values. In this work, we study a Reddit community in which people\nparticipate to judge or be judged with respect to some behavior, as it\nrepresents a valuable source to study how users express judgments online. We\nmodel threads of this community as complex networks of user interactions\ngrowing in time, and we analyze the evolution of their structural properties.\nWe show that the evolution of Reddit networks differ from other real social\nnetworks, despite falling in the same category. This happens because their\nglobal clustering coefficient is extremely small and the average shortest path\nlength increases over time. Such properties reveal how users discuss in\nthreads, i.e. with mostly one other user and often by a single message. We\nstrengthen such result by analyzing the role that disagreement and reciprocity\nplay in such conversations. We also show that Reddit thread's evolution over\ntime is governed by two subgraphs growing at different speeds. 
We discover\nthat, in the studied community, the difference of such speed is higher than in\nother communities because of the user guidelines enforcing specific user\ninteractions. Finally, we interpret the obtained results on user behavior\ndrawing back to Social Judgment Theory.\n","authors":["Diletta Goglia","Davide Vega"],"pdf_url":"https://arxiv.org/pdf/2409.04085v1.pdf","comment":"29 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2409.04081v1","updated":"2024-09-06T07:44:44Z","published":"2024-09-06T07:44:44Z","title":"UI-JEPA: Towards Active Perception of User Intent through Onscreen User\n Activity","summary":" Generating user intent from a sequence of user interface (UI) actions is a\ncore challenge in comprehensive UI understanding. Recent advancements in\nmultimodal large language models (MLLMs) have led to substantial progress in\nthis area, but their demands for extensive model parameters, computing power,\nand high latency makes them impractical for scenarios requiring lightweight,\non-device solutions with low latency or heightened privacy. Additionally, the\nlack of high-quality datasets has hindered the development of such lightweight\nmodels. To address these challenges, we propose UI-JEPA, a novel framework that\nemploys masking strategies to learn abstract UI embeddings from unlabeled data\nthrough self-supervised learning, combined with an LLM decoder fine-tuned for\nuser intent prediction. We also introduce two new UI-grounded multimodal\ndatasets, \"Intent in the Wild\" (IIW) and \"Intent in the Tame\" (IIT), designed\nfor few-shot and zero-shot UI understanding tasks. IIW consists of 1.7K videos\nacross 219 intent categories, while IIT contains 914 videos across 10\ncategories. We establish the first baselines for these datasets, showing that\nrepresentations learned using a JEPA-style objective, combined with an LLM\ndecoder, can achieve user intent predictions that match the performance of\nstate-of-the-art large MLLMs, but with significantly reduced annotation and\ndeployment resources. Measured by intent similarity scores, UI-JEPA outperforms\nGPT-4 Turbo and Claude 3.5 Sonnet by 10.0% and 7.2% respectively, averaged\nacross two datasets. Notably, UI-JEPA accomplishes the performance with a 50.5x\nreduction in computational cost and a 6.6x improvement in latency in the IIW\ndataset. These results underscore the effectiveness of UI-JEPA, highlighting\nits potential for lightweight, high-performance UI understanding.\n","authors":["Yicheng Fu","Raviteja Anantha","Prabal Vashisht","Jianpeng Cheng","Etai Littwin"],"pdf_url":"https://arxiv.org/pdf/2409.04081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13110v4","updated":"2024-09-06T07:40:40Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. 
From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v4.pdf","comment":"Accepted at Journal of Machine Learning Research. This paper\n integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete\n story. In this paper, we improve the writing and organization, and also add\n conceptual, empirical, and theoretical improvements over the previous work.\n V2: small typo fixes/formatting improvements. V3: improvements from journal\n revisions. V4: fix figures"},{"id":"http://arxiv.org/abs/2409.04073v1","updated":"2024-09-06T07:29:01Z","published":"2024-09-06T07:29:01Z","title":"AnyMatch -- Efficient Zero-Shot Entity Matching with a Small Language\n Model","summary":" Entity matching (EM) is the problem of determining whether two records refer\nto same real-world entity, which is crucial in data integration, e.g., for\nproduct catalogs or address databases. A major drawback of many EM approaches\nis their dependence on labelled examples. We thus focus on the challenging\nsetting of zero-shot entity matching where no labelled examples are available\nfor an unseen target dataset. Recently, large language models (LLMs) have shown\npromising results for zero-shot EM, but their low throughput and high\ndeployment cost limit their applicability and scalability.\n We revisit the zero-shot EM problem with AnyMatch, a small language model\nfine-tuned in a transfer learning setup. We propose several novel data\nselection techniques to generate fine-tuning data for our model, e.g., by\nselecting difficult pairs to match via an AutoML filter, by generating\nadditional attribute-level examples, and by controlling label imbalance in the\ndata.\n We conduct an extensive evaluation of the prediction quality and deployment\ncost of our model, in a comparison to thirteen baselines on nine benchmark\ndatasets. 
We find that AnyMatch provides competitive prediction quality despite\nits small parameter size: it achieves the second-highest F1 score overall, and\noutperforms several other approaches that employ models with hundreds of\nbillions of parameters. Furthermore, our approach exhibits major cost benefits:\nthe average prediction quality of AnyMatch is within 4.4% of the\nstate-of-the-art method MatchGPT with the proprietary trillion-parameter model\nGPT-4, yet AnyMatch requires four orders of magnitude less parameters and\nincurs a 3,899 times lower inference cost (in dollars per 1,000 tokens).\n","authors":["Zeyu Zhang","Paul Groth","Iacer Calixto","Sebastian Schelter"],"pdf_url":"https://arxiv.org/pdf/2409.04073v1.pdf","comment":"12 pages excluding references, 3 figures, and 5 tables"},{"id":"http://arxiv.org/abs/2409.04057v1","updated":"2024-09-06T06:57:04Z","published":"2024-09-06T06:57:04Z","title":"Self-Harmonized Chain of Thought","summary":" Chain-of-Thought (CoT) prompting reveals that large language models are\ncapable of performing complex reasoning via intermediate steps. CoT prompting\nis primarily categorized into three approaches. The first approach utilizes\nstraightforward prompts like ``Let's think step by step'' to generate a\nsequential thought process before yielding an answer. The second approach makes\nuse of human-crafted, step-by-step demonstrations to guide the model's\nreasoning process. The third automates the generation of reasoned\ndemonstrations with the 'Let's think step by step'.This approach sometimes\nleads to reasoning errors, highlighting the need to diversify demonstrations to\nmitigate its misleading effects. However, diverse demonstrations pose\nchallenges for effective representations. In this work, we propose ECHO, a\nself-harmonized chain-of-thought prompting method. It consolidates diverse\nsolution paths into a uniform and effective solution pattern.ECHO demonstrates\nthe best overall performance across three reasoning domains.\n","authors":["Ziqi Jin","Wei Lu"],"pdf_url":"https://arxiv.org/pdf/2409.04057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04056v1","updated":"2024-09-06T06:53:45Z","published":"2024-09-06T06:53:45Z","title":"Refining Wikidata Taxonomy using Large Language Models","summary":" Due to its collaborative nature, Wikidata is known to have a complex\ntaxonomy, with recurrent issues like the ambiguity between instances and\nclasses, the inaccuracy of some taxonomic paths, the presence of cycles, and\nthe high level of redundancy across classes. Manual efforts to clean up this\ntaxonomy are time-consuming and prone to errors or subjective decisions. We\npresent WiKC, a new version of Wikidata taxonomy cleaned automatically using a\ncombination of Large Language Models (LLMs) and graph mining techniques.\nOperations on the taxonomy, such as cutting links or merging classes, are\nperformed with the help of zero-shot prompting on an open-source LLM. 
The\nquality of the refined taxonomy is evaluated from both intrinsic and extrinsic\nperspectives, on a task of entity typing for the latter, showing the practical\ninterest of WiKC.\n","authors":["Yiwen Peng","Thomas Bonald","Mehwish Alam"],"pdf_url":"https://arxiv.org/pdf/2409.04056v1.pdf","comment":"ACM International Conference on Information and Knowledge Management,\n Oct 2024, Boise, Idaho, United States"},{"id":"http://arxiv.org/abs/2409.03659v2","updated":"2024-09-06T06:50:32Z","published":"2024-09-05T16:12:29Z","title":"LLM-based multi-agent poetry generation in non-cooperative environments","summary":" Despite substantial progress of large language models (LLMs) for automatic\npoetry generation, the generated poetry lacks diversity while the training\nprocess differs greatly from human learning. Under the rationale that the\nlearning process of the poetry generation systems should be more human-like and\ntheir output more diverse and novel, we introduce a framework based on social\nlearning where we emphasize non-cooperative interactions besides cooperative\ninteractions to encourage diversity. Our experiments are the first attempt at\nLLM-based multi-agent systems in non-cooperative environments for poetry\ngeneration employing both TRAINING-BASED agents (GPT-2) and PROMPTING-BASED\nagents (GPT-3 and GPT-4). Our evaluation based on 96k generated poems shows\nthat our framework benefits the poetry generation process for TRAINING-BASED\nagents resulting in 1) a 3.0-3.7 percentage point (pp) increase in diversity\nand a 5.6-11.3 pp increase in novelty according to distinct and novel n-grams.\nThe generated poetry from TRAINING-BASED agents also exhibits group divergence\nin terms of lexicons, styles and semantics. PROMPTING-BASED agents in our\nframework also benefit from non-cooperative environments and a more diverse\nensemble of models with non-homogeneous agents has the potential to further\nenhance diversity, with an increase of 7.0-17.5 pp according to our\nexperiments. However, PROMPTING-BASED agents show a decrease in lexical\ndiversity over time and do not exhibit the group-based divergence intended in\nthe social network. Our paper argues for a paradigm shift in creative tasks\nsuch as automatic poetry generation to include social learning processes (via\nLLM-based agent modeling) similar to human interaction.\n","authors":["Ran Zhang","Steffen Eger"],"pdf_url":"https://arxiv.org/pdf/2409.03659v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2409.04043v1","updated":"2024-09-06T06:27:35Z","published":"2024-09-06T06:27:35Z","title":"Towards Safer Online Spaces: Simulating and Assessing Intervention\n Strategies for Eating Disorder Discussions","summary":" Eating disorders are complex mental health conditions that affect millions of\npeople around the world. Effective interventions on social media platforms are\ncrucial, yet testing strategies in situ can be risky. We present a novel\nLLM-driven experimental testbed for simulating and assessing intervention\nstrategies in ED-related discussions. Our framework generates synthetic\nconversations across multiple platforms, models, and ED-related topics,\nallowing for controlled experimentation with diverse intervention approaches.\nWe analyze the impact of various intervention strategies on conversation\ndynamics across four dimensions: intervention type, generative model, social\nmedia platform, and ED-related community/topic. 
We employ cognitive domain\nanalysis metrics, including sentiment, emotions, etc., to evaluate the\neffectiveness of interventions. Our findings reveal that civility-focused\ninterventions consistently improve positive sentiment and emotional tone across\nall dimensions, while insight-resetting approaches tend to increase negative\nemotions. We also uncover significant biases in LLM-generated conversations,\nwith cognitive metrics varying notably between models (Claude-3 Haiku $>$\nMistral $>$ GPT-3.5-turbo $>$ LLaMA3) and even between versions of the same\nmodel. These variations highlight the importance of model selection in\nsimulating realistic discussions related to ED. Our work provides valuable\ninformation on the complex dynamics of ED-related discussions and the\neffectiveness of various intervention strategies.\n","authors":["Louis Penafiel","Hsien-Te Kao","Isabel Erickson","David Chu","Robert McCormack","Kristina Lerman","Svitlana Volkova"],"pdf_url":"https://arxiv.org/pdf/2409.04043v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.17844v3","updated":"2024-09-06T05:29:33Z","published":"2024-07-25T07:58:19Z","title":"Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease\n Classification: A Systematic Review","summary":" Parkinson's disease (PD), the second most prevalent neurodegenerative\ndisorder worldwide, frequently presents with early-stage speech impairments.\nRecent advancements in Artificial Intelligence (AI), particularly deep learning\n(DL), have significantly enhanced PD diagnosis through the analysis of speech\ndata. Nevertheless, the progress of research is restricted by the limited\navailability of publicly accessible speech-based PD datasets, primarily due to\nprivacy concerns. The goal of this systematic review is to explore the current\nlandscape of speech-based DL approaches for PD classification, based on 33\nscientific works published between January 2020 and March 2024. We discuss\ntheir available resources, capabilities, and potential limitations, and issues\nrelated to bias, explainability, and privacy. Furthermore, this review provides\nan overview of publicly accessible speech-based datasets and open-source\nmaterial for PD. The DL approaches identified are categorized into end-to-end\n(E2E) learning, transfer learning (TL), and deep acoustic feature extraction\n(DAFE). Among E2E approaches, Convolutional Neural Networks (CNNs) are\nprevalent, though Transformers are increasingly popular. E2E approaches face\nchallenges such as limited data and computational resources, especially with\nTransformers. TL addresses these issues by providing more robust PD diagnosis\nand better generalizability across languages. DAFE aims to improve the\nexplainability and interpretability of results by examining the specific\neffects of deep features on both other DL approaches and more traditional\nmachine learning (ML) methods. However, it often underperforms compared to E2E\nand TL approaches.\n","authors":["Lisanne van Gelderen","Cristian Tejedor-García"],"pdf_url":"https://arxiv.org/pdf/2407.17844v3.pdf","comment":"van Gelderen, L., & Tejedor-Garc\\'ia, C. (2024). Innovative\n Speech-Based Deep Learning Approaches for Parkinson's Disease Classification:\n A Systematic Review. Applied Sciences, 14(17). 
doi:10.3390/app14177873 This\n research was funded by the NWO research programme NGF AiNed Fellowship Grants\n under the project Responsible AI for Voice Diagnostics (RAIVD) - grant number\n NGF.1607.22.013"},{"id":"http://arxiv.org/abs/2311.04939v2","updated":"2024-09-06T05:06:51Z","published":"2023-11-08T01:45:37Z","title":"LooGLE: Can Long-Context Language Models Understand Long Contexts?","summary":" Large language models (LLMs), despite their impressive performance in various\nlanguage tasks, are typically limited to processing texts within context-window\nsize. This limitation has spurred significant research efforts to enhance LLMs'\nlong-context understanding with high-quality long-sequence benchmarks. However,\nprior datasets in this regard suffer from shortcomings, such as short context\nlength compared to the context window of modern LLMs; outdated documents that\nhave data leakage problems; and an emphasis on short dependency tasks rather\nthan long dependency tasks. In this paper, we present LooGLE, a Long Context\nGeneric Language Evaluation benchmark for LLMs' long context understanding.\nLooGLE features relatively new documents post-2022, with over 24,000 tokens per\ndocument and 6,000 newly generated questions spanning diverse domains. Human\nannotators meticulously crafted more than 1,100 high-quality question-answer\npairs to meet the long dependency requirements. These pairs underwent thorough\ncross-validation, yielding the most precise assessment of LLMs' long dependency\ncapabilities. The evaluation of eight state-of-the-art LLMs on LooGLE revealed\nkey findings: (i) commercial models outperformed open-sourced models; (ii) LLMs\nexcelled in short dependency tasks like short question-answering and cloze\ntasks but struggled with more intricate long dependency tasks; (iii) in-context\nlearning and chaining thoughts offered only marginal improvements; (iv)\nretrieval-based techniques demonstrated substantial benefits for short\nquestion-answering, while strategies for extending context window length had\nlimited impact on long context understanding. As such, LooGLE not only provides\na systematic and comprehensive evaluation schema on long-context LLMs, but also\nsheds light on future development of enhanced models towards \"true long-context\nunderstanding\".\n","authors":["Jiaqi Li","Mengmeng Wang","Zilong Zheng","Muhan Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.04939v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02834v2","updated":"2024-09-06T05:06:27Z","published":"2024-09-04T16:00:21Z","title":"CMM-Math: A Chinese Multimodal Math Dataset To Evaluate and Enhance the\n Mathematics Reasoning of Large Multimodal Models","summary":" Large language models (LLMs) have obtained promising results in mathematical\nreasoning, which is a foundational skill for human intelligence. Most previous\nstudies focus on improving and measuring the performance of LLMs based on\ntextual math reasoning datasets (e.g., MATH, GSM8K). Recently, a few\nresearchers have released English multimodal math datasets (e.g., MATHVISTA and\nMATH-V) to evaluate the effectiveness of large multimodal models (LMMs). In\nthis paper, we release a Chinese multimodal math (CMM-Math) dataset, including\nbenchmark and training parts, to evaluate and enhance the mathematical\nreasoning of LMMs. 
CMM-Math contains over 28,000 high-quality samples,\nfeaturing a variety of problem types (e.g., multiple-choice, fill-in-the-blank,\nand so on) with detailed solutions across 12 grade levels from elementary to\nhigh school in China. Specifically, the visual context may be present in the\nquestions or opinions, which makes this dataset more challenging. Through\ncomprehensive analysis, we discover that state-of-the-art LMMs on the CMM-Math\ndataset face challenges, emphasizing the necessity for further improvements in\nLMM development. We also propose a Multimodal Mathematical LMM (Math-LMM) to\nhandle the problems with mixed input of multiple images and text segments. We\ntrain our model using three stages, including foundational pre-training,\nfoundational fine-tuning, and mathematical fine-tuning. The extensive\nexperiments indicate that our model effectively improves math reasoning\nperformance by comparing it with the SOTA LMMs over three multimodal\nmathematical datasets.\n","authors":["Wentao Liu","Qianjun Pan","Yi Zhang","Zhuo Liu","Ji Wu","Jie Zhou","Aimin Zhou","Qin Chen","Bo Jiang","Liang He"],"pdf_url":"https://arxiv.org/pdf/2409.02834v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01345v2","updated":"2024-09-06T03:35:21Z","published":"2024-09-02T15:58:27Z","title":"Language Models Benefit from Preparation with Elicited Knowledge","summary":" The zero-shot chain of thought (CoT) approach is often used in question\nanswering (QA) by language models (LMs) for tasks that require multiple\nreasoning steps, typically enhanced by the prompt \"Let's think step by step.\"\nHowever, some QA tasks hinge more on accessing relevant knowledge than on\nchaining reasoning steps. We introduce a simple general prompting technique,\ncalled PREP, that involves using two instances of LMs: the first (LM1)\ngenerates relevant information, and the second (LM2) answers the question based\non this information. PREP is designed to be general and independent of the\nuser's domain knowledge, making it applicable across various QA tasks without\nthe need for specialized prompt engineering. To evaluate the effectiveness of\nour prompting method, we create a dataset of 100 binary-choice questions,\nderived from an extensive schematic dataset on artifact parts and material\ncomposition. These questions ask which of two artifacts is less likely to share\nmaterials with another artifact. Such questions probe the LM's knowledge of\nshared materials in the part structure of different artifacts. We test our\nmethod on our dataset and three published commonsense reasoning datasets. The\naverage accuracy of our method is consistently higher than that of all the\nother tested methods across all the tested datasets.\n","authors":["Jiacan Yu","Hannah An","Lenhart K. Schubert"],"pdf_url":"https://arxiv.org/pdf/2409.01345v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04009v1","updated":"2024-09-06T03:28:38Z","published":"2024-09-06T03:28:38Z","title":"Large Margin Prototypical Network for Few-shot Relation Classification\n with Fine-grained Features","summary":" Relation classification (RC) plays a pivotal role in both natural language\nunderstanding and knowledge graph completion. It is generally formulated as a\ntask to recognize the relationship between two entities of interest appearing\nin a free-text sentence. 
Conventional approaches on RC, regardless of feature\nengineering or deep learning based, can obtain promising performance on\ncategorizing common types of relation leaving a large proportion of\nunrecognizable long-tail relations due to insufficient labeled instances for\ntraining. In this paper, we consider few-shot learning is of great practical\nsignificance to RC and thus improve a modern framework of metric learning for\nfew-shot RC. Specifically, we adopt the large-margin ProtoNet with fine-grained\nfeatures, expecting they can generalize well on long-tail relations. Extensive\nexperiments were conducted by FewRel, a large-scale supervised few-shot RC\ndataset, to evaluate our framework: LM-ProtoNet (FGF). The results demonstrate\nthat it can achieve substantial improvements over many baseline approaches.\n","authors":["Miao Fan","Yeqi Bai","Mingming Sun","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2409.04009v1.pdf","comment":"Accepted by CIKM'19"},{"id":"http://arxiv.org/abs/2310.18365v3","updated":"2024-09-06T03:08:49Z","published":"2023-10-25T01:07:50Z","title":"Using GPT-4 to Augment Unbalanced Data for Automatic Scoring","summary":" Machine learning-based automatic scoring faces challenges with unbalanced\nstudent responses across scoring categories. To address this, we introduce a\nnovel text data augmentation framework leveraging GPT-4, a generative large\nlanguage model, specifically tailored for unbalanced datasets in automatic\nscoring. Our experimental dataset comprised student written responses to four\nscience items. We crafted prompts for GPT-4 to generate responses, especially\nfor minority scoring classes, enhancing the data set. We then finetuned\nDistillBERT for automatic scoring based on the augmented and original datasets.\nModel performance was assessed using accuracy, precision, recall, and F1\nmetrics. Our findings revealed that incorporating GPT-4-augmented data\nremarkedly improved model performance, particularly for precision and F1\nscores. Interestingly, the extent of improvement varied depending on the\nspecific dataset and the proportion of augmented data used. Notably, we found\nthat a varying amount of augmented data (20%-40%) was needed to obtain stable\nimprovement for automatic scoring. Comparisons with models trained on\nadditional student-written responses suggest that GPT-4 augmented models match\nthose trained with student data. This research underscores the potential and\neffectiveness of data augmentation techniques utilizing generative large\nlanguage models like GPT-4 in addressing unbalanced datasets within automated\nassessment.\n","authors":["Luyang Fang","Gyeong-Geon Lee","Xiaoming Zhai"],"pdf_url":"https://arxiv.org/pdf/2310.18365v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14418v2","updated":"2024-09-06T03:05:29Z","published":"2024-08-26T17:04:00Z","title":"MEDSAGE: Enhancing Robustness of Medical Dialogue Summarization to ASR\n Errors with LLM-generated Synthetic Dialogues","summary":" Automatic Speech Recognition (ASR) systems are pivotal in transcribing speech\ninto text, yet the errors they introduce can significantly degrade the\nperformance of downstream tasks like summarization. This issue is particularly\npronounced in clinical dialogue summarization, a low-resource domain where\nsupervised data for fine-tuning is scarce, necessitating the use of ASR models\nas black-box solutions. 
Employing conventional data augmentation for enhancing\nthe noise robustness of summarization models is not feasible either due to the\nunavailability of sufficient medical dialogue audio recordings and\ncorresponding ASR transcripts. To address this challenge, we propose MEDSAGE,\nan approach for generating synthetic samples for data augmentation using Large\nLanguage Models (LLMs). Specifically, we leverage the in-context learning\ncapabilities of LLMs and instruct them to generate ASR-like errors based on a\nfew available medical dialogue examples with audio recordings. Experimental\nresults show that LLMs can effectively model ASR noise, and incorporating this\nnoisy data into the training process significantly improves the robustness and\naccuracy of medical dialogue summarization systems. This approach addresses the\nchallenges of noisy ASR outputs in critical applications, offering a robust\nsolution to enhance the reliability of clinical dialogue summarization.\n","authors":["Kuluhan Binici","Abhinav Ramesh Kashyap","Viktor Schlegel","Andy T. Liu","Vijay Prakash Dwivedi","Thanh-Tung Nguyen","Xiaoxue Gao","Nancy F. Chen","Stefan Winkler"],"pdf_url":"https://arxiv.org/pdf/2408.14418v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08651v2","updated":"2024-09-06T01:52:02Z","published":"2024-08-16T10:34:50Z","title":"Reasoning Beyond Bias: A Study on Counterfactual Prompting and Chain of\n Thought Reasoning","summary":" Language models are known to absorb biases from their training data, leading\nto predictions driven by statistical regularities rather than semantic\nrelevance. We investigate the impact of these biases on answer choice\npreferences in the Massive Multi-Task Language Understanding (MMLU) task. Our\nfindings reveal that differences in learned regularities across answer options\nare predictive of model preferences and mirror human test-taking strategies. To\naddress this issue, we introduce two novel methods: Counterfactual Prompting\nwith Chain of Thought (CoT) and Counterfactual Prompting with Agnostically\nPrimed CoT (APriCoT). We demonstrate that while Counterfactual Prompting with\nCoT alone is insufficient to mitigate bias, our novel Primed Counterfactual\nPrompting with CoT approach effectively reduces the influence of base-rate\nprobabilities while improving overall accuracy. Our results suggest that\nmitigating bias requires a \"System-2\" like process and that CoT reasoning is\nsusceptible to confirmation bias under some prompting methodologies. Our\ncontributions offer practical solutions for developing more robust and fair\nlanguage models.\n","authors":["Kyle Moore","Jesse Roberts","Thao Pham","Douglas Fisher"],"pdf_url":"https://arxiv.org/pdf/2408.08651v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04298v3","updated":"2024-09-06T01:14:26Z","published":"2024-04-04T20:27:37Z","title":"SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated\n Responses","summary":" Can LLMs consistently improve their previous outputs for better results? For\nthis to be true, LLMs would need to be better at discriminating among\npreviously-generated alternatives, than generating initial responses. We\nexplore the validity of this hypothesis in practice. We first formulate a\nunified framework that allows us to compare the generative and discriminative\ncapability of any model on any task. 
In our resulting experimental analysis of\nseveral open-source and industrial LLMs, we observe that models are not\nreliably better at discriminating among previously-generated alternatives than\ngenerating initial responses. This finding challenges the notion that LLMs may\nbe able to enhance their performance only through their own judgment.\n","authors":["Dongwei Jiang","Jingyu Zhang","Orion Weller","Nathaniel Weir","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2404.04298v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09932v2","updated":"2024-09-06T00:46:40Z","published":"2024-04-15T16:58:28Z","title":"Foundational Challenges in Assuring Alignment and Safety of Large\n Language Models","summary":" This work identifies 18 foundational challenges in assuring the alignment and\nsafety of large language models (LLMs). These challenges are organized into\nthree different categories: scientific understanding of LLMs, development and\ndeployment methods, and sociotechnical challenges. Based on the identified\nchallenges, we pose $200+$ concrete research questions.\n","authors":["Usman Anwar","Abulhair Saparov","Javier Rando","Daniel Paleka","Miles Turpin","Peter Hase","Ekdeep Singh Lubana","Erik Jenner","Stephen Casper","Oliver Sourbut","Benjamin L. Edelman","Zhaowei Zhang","Mario Günther","Anton Korinek","Jose Hernandez-Orallo","Lewis Hammond","Eric Bigelow","Alexander Pan","Lauro Langosco","Tomasz Korbak","Heidi Zhang","Ruiqi Zhong","Seán Ó hÉigeartaigh","Gabriel Recchia","Giulio Corsi","Alan Chan","Markus Anderljung","Lilian Edwards","Aleksandar Petrov","Christian Schroeder de Witt","Sumeet Ramesh Motwan","Yoshua Bengio","Danqi Chen","Philip H. S. Torr","Samuel Albanie","Tegan Maharaj","Jakob Foerster","Florian Tramer","He He","Atoosa Kasirzadeh","Yejin Choi","David Krueger"],"pdf_url":"https://arxiv.org/pdf/2404.09932v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03946v1","updated":"2024-09-06T00:02:09Z","published":"2024-09-06T00:02:09Z","title":"On The Role of Prompt Construction In Enhancing Efficacy and Efficiency\n of LLM-Based Tabular Data Generation","summary":" LLM-based data generation for real-world tabular data can be challenged by\nthe lack of sufficient semantic context in feature names used to describe\ncolumns. We hypothesize that enriching prompts with domain-specific insights\ncan improve both the quality and efficiency of data generation. To test this\nhypothesis, we explore three prompt construction protocols: Expert-guided,\nLLM-guided, and Novel-Mapping. Through empirical studies with the recently\nproposed GReaT framework, we find that context-enriched prompts lead to\nsignificantly improved data generation quality and training efficiency.\n","authors":["Banooqa Banday","Kowshik Thopalli","Tanzima Z. Islam","Jayaraman J. Thiagarajan"],"pdf_url":"https://arxiv.org/pdf/2409.03946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16911v3","updated":"2024-09-06T22:45:43Z","published":"2023-08-31T17:59:46Z","title":"PointLLM: Empowering Large Language Models to Understand Point Clouds","summary":" The unprecedented advancements in Large Language Models (LLMs) have shown a\nprofound impact on natural language processing but are yet to fully embrace the\nrealm of 3D understanding. This paper introduces PointLLM, a preliminary effort\nto fill this gap, enabling LLMs to understand point clouds and offering a new\navenue beyond 2D visual data. 
PointLLM understands colored object point clouds\nwith human instructions and generates contextually appropriate responses,\nillustrating its grasp of point clouds and common sense. Specifically, it\nleverages a point cloud encoder with a powerful LLM to effectively fuse\ngeometric, appearance, and linguistic information. We collect a novel dataset\ncomprising 660K simple and 70K complex point-text instruction pairs to enable a\ntwo-stage training strategy: aligning latent spaces and subsequently\ninstruction-tuning the unified model. To rigorously evaluate the perceptual and\ngeneralization capabilities of PointLLM, we establish two benchmarks:\nGenerative 3D Object Classification and 3D Object Captioning, assessed through\nthree different methods, including human evaluation, GPT-4/ChatGPT evaluation,\nand traditional metrics. Experimental results reveal PointLLM's superior\nperformance over existing 2D and 3D baselines, with a notable achievement in\nhuman-evaluated object captioning tasks where it surpasses human annotators in\nover 50% of the samples. Codes, datasets, and benchmarks are available at\nhttps://github.com/OpenRobotLab/PointLLM .\n","authors":["Runsen Xu","Xiaolong Wang","Tai Wang","Yilun Chen","Jiangmiao Pang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.16911v3.pdf","comment":"ECCV 2024 Oral Camera Ready. This version includes clearer writing\n and additional experimental results compared to previous versions. Project\n page: https://runsenxu.com/projects/PointLLM"},{"id":"http://arxiv.org/abs/2406.10273v5","updated":"2024-09-06T22:28:37Z","published":"2024-06-11T19:20:27Z","title":"Beyond Words: On Large Language Models Actionability in Mission-Critical\n Risk Analysis","summary":" Context. Risk analysis assesses potential risks in specific scenarios. Risk\nanalysis principles are context-less; the same methodology can be applied to a\nrisk connected to health and information technology security. Risk analysis\nrequires a vast knowledge of national and international regulations and\nstandards and is time and effort-intensive. A large language model can quickly\nsummarize information in less time than a human and can be fine-tuned to\nspecific tasks.\n Aim. Our empirical study aims to investigate the effectiveness of\nRetrieval-Augmented Generation and fine-tuned LLM in risk analysis. To our\nknowledge, no prior study has explored its capabilities in risk analysis.\n Method. We manually curated 193 unique scenarios leading to 1283\nrepresentative samples from over 50 mission-critical analyses archived by the\nindustrial context team in the last five years. We compared the base GPT-3.5\nand GPT-4 models versus their Retrieval-Augmented Generation and fine-tuned\ncounterparts. We employ two human experts as competitors of the models and\nthree other human experts to review the models and the former human experts'\nanalysis. The reviewers analyzed 5,000 scenario analyses.\n Results and Conclusions. Human experts demonstrated higher accuracy, but LLMs\nare quicker and more actionable. Moreover, our findings show that RAG-assisted\nLLMs have the lowest hallucination rates, effectively uncovering hidden risks\nand complementing human expertise. Thus, the choice of model depends on\nspecific needs, with FTMs for accuracy, RAG for hidden risks discovery, and\nbase models for comprehensiveness and actionability. Therefore, experts can\nleverage LLMs as an effective complementing companion in risk analysis within a\ncondensed timeframe. 
They can also save costs by averting unnecessary expenses\nassociated with implementing unwarranted countermeasures.\n","authors":["Matteo Esposito","Francesco Palagiano","Valentina Lenarduzzi","Davide Taibi"],"pdf_url":"https://arxiv.org/pdf/2406.10273v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04617v1","updated":"2024-09-06T21:00:57Z","published":"2024-09-06T21:00:57Z","title":"Sparse Rewards Can Self-Train Dialogue Agents","summary":" Recent advancements in state-of-the-art (SOTA) Large Language Model (LLM)\nagents, especially in multi-turn dialogue tasks, have been primarily driven by\nsupervised fine-tuning and high-quality human feedback. However, as base LLM\nmodels continue to improve, acquiring meaningful human feedback has become\nincreasingly challenging and costly. In certain domains, base LLM agents may\neventually exceed human capabilities, making traditional feedback-driven\nmethods impractical. In this paper, we introduce a novel self-improvement\nparadigm that empowers LLM agents to autonomously enhance their performance\nwithout external human feedback. Our method, Juxtaposed Outcomes for Simulation\nHarvesting (JOSH), is a self-alignment algorithm that leverages a sparse reward\nsimulation environment to extract ideal behaviors and further train the LLM on\nits own outputs. We present ToolWOZ, a sparse reward tool-calling simulation\nenvironment derived from MultiWOZ. We demonstrate that models trained with\nJOSH, both small and frontier, significantly improve tool-based interactions\nwhile preserving general model capabilities across diverse benchmarks. Our code\nand data are publicly available on GitHub.\n","authors":["Barrett Martin Lattimer","Varun Gangal","Ryan McDonald","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2409.04617v1.pdf","comment":"Minor but nontrivial changes likely"},{"id":"http://arxiv.org/abs/2409.04599v1","updated":"2024-09-06T20:12:34Z","published":"2024-09-06T20:12:34Z","title":"BPE Gets Picky: Efficient Vocabulary Refinement During Tokenizer\n Training","summary":" Language models can largely benefit from efficient tokenization. However,\nthey still mostly utilize the classical BPE algorithm, a simple and reliable\nmethod. This has been shown to cause such issues as under-trained tokens and\nsub-optimal compression that may affect the downstream performance. We\nintroduce Picky BPE, a modified BPE algorithm that carries out vocabulary\nrefinement during tokenizer training. Our method improves vocabulary\nefficiency, eliminates under-trained tokens, and does not compromise text\ncompression. Our experiments show that our method does not reduce the\ndownstream performance, and in several cases improves it.\n","authors":["Pavel Chizhov","Catherine Arnett","Elizaveta Korotkova","Ivan P. Yamshchikov"],"pdf_url":"https://arxiv.org/pdf/2409.04599v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2409.04593v1","updated":"2024-09-06T20:04:04Z","published":"2024-09-06T20:04:04Z","title":"Paper Copilot: A Self-Evolving and Efficient LLM System for Personalized\n Academic Assistance","summary":" As scientific research proliferates, researchers face the daunting task of\nnavigating and reading vast amounts of literature. Existing solutions, such as\ndocument QA, fail to provide personalized and up-to-date information\nefficiently. We present Paper Copilot, a self-evolving, efficient LLM system\ndesigned to assist researchers, based on thought-retrieval, user profile and\nhigh performance optimization. 
Specifically, Paper Copilot can offer\npersonalized research services, maintaining a real-time updated database.\nQuantitative evaluation demonstrates that Paper Copilot saves 69.92\\% of time\nafter efficient deployment. This paper details the design and implementation of\nPaper Copilot, highlighting its contributions to personalized academic support\nand its potential to streamline the research process.\n","authors":["Guanyu Lin","Tao Feng","Pengrui Han","Ge Liu","Jiaxuan You"],"pdf_url":"https://arxiv.org/pdf/2409.04593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11157v2","updated":"2024-09-06T19:30:26Z","published":"2023-02-22T05:41:27Z","title":"FiNER-ORD: Financial Named Entity Recognition Open Research Dataset","summary":" Over the last two decades, the development of the CoNLL-2003 named entity\nrecognition (NER) dataset has helped enhance the capabilities of deep learning\nand natural language processing (NLP). The finance domain, characterized by its\nunique semantic and lexical variations for the same entities, presents specific\nchallenges to the NER task; thus, a domain-specific customized dataset is\ncrucial for advancing research in this field. In our work, we develop the first\nhigh-quality English Financial NER Open Research Dataset (FiNER-ORD). We\nbenchmark multiple pre-trained language models (PLMs) and large-language models\n(LLMs) on FiNER-ORD. We believe our proposed FiNER-ORD dataset will open future\nopportunities to use FiNER-ORD as a benchmark for financial domain-specific NER\nand NLP tasks. Our dataset, models, and code are publicly available on GitHub\nand Hugging Face under CC BY-NC 4.0 license.\n","authors":["Agam Shah","Abhinav Gullapalli","Ruchit Vithani","Michael Galarnyk","Sudheer Chava"],"pdf_url":"https://arxiv.org/pdf/2302.11157v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04574v1","updated":"2024-09-06T19:25:18Z","published":"2024-09-06T19:25:18Z","title":"Customizing Large Language Model Generation Style using\n Parameter-Efficient Finetuning","summary":" One-size-fits-all large language models (LLMs) are increasingly being used to\nhelp people with their writing. However, the style these models are trained to\nwrite in may not suit all users or use cases. LLMs would be more useful as\nwriting assistants if their idiolect could be customized to match each user. In\nthis paper, we explore whether parameter-efficient finetuning (PEFT) with\nLow-Rank Adaptation can effectively guide the style of LLM generations. We use\nthis method to customize LLaMA-2 to ten different authors and show that the\ngenerated text has lexical, syntactic, and surface alignment with the target\nauthor but struggles with content memorization. Our findings highlight the\npotential of PEFT to support efficient, user-level customization of LLMs.\n","authors":["Xinyue Liu","Harshita Diddee","Daphne Ippolito"],"pdf_url":"https://arxiv.org/pdf/2409.04574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03140v2","updated":"2024-09-06T18:41:50Z","published":"2024-09-05T00:25:37Z","title":"GraphEx: A Graph-based Extraction Method for Advertiser Keyphrase\n Recommendation","summary":" Online sellers and advertisers are recommended keyphrases for their listed\nproducts, which they bid on to enhance their sales. One popular paradigm that\ngenerates such recommendations is Extreme Multi-Label Classification (XMC),\nwhich involves tagging/mapping keyphrases to items. 
We outline the limitations\nof using traditional item-query based tagging or mapping techniques for\nkeyphrase recommendations on E-Commerce platforms. We introduce GraphEx, an\ninnovative graph-based approach that recommends keyphrases to sellers using\nextraction of token permutations from item titles. Additionally, we demonstrate\nthat relying on traditional metrics such as precision/recall can be misleading\nin practical applications, thereby necessitating a combination of metrics to\nevaluate performance in real-world scenarios. These metrics are designed to\nassess the relevance of keyphrases to items and the potential for buyer\noutreach. GraphEx outperforms production models at eBay, achieving the\nobjectives mentioned above. It supports near real-time inferencing in\nresource-constrained production environments and scales effectively for\nbillions of items.\n","authors":["Ashirbad Mishra","Soumik Dey","Marshall Wu","Jinyu Zhao","He Yu","Kaichen Ni","Binbin Li","Kamesh Madduri"],"pdf_url":"https://arxiv.org/pdf/2409.03140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04556v1","updated":"2024-09-06T18:33:38Z","published":"2024-09-06T18:33:38Z","title":"How Does Code Pretraining Affect Language Model Task Performance?","summary":" Large language models are increasingly trained on corpora containing both\nnatural language and non-linguistic data like source code. Aside from aiding\nprogramming-related tasks, anecdotal evidence suggests that including code in\npretraining corpora may improve performance on other, unrelated tasks, yet to\ndate no work has been able to establish a causal connection by controlling\nbetween language and code data. Here we do just this. We pretrain language\nmodels on datasets which interleave natural language and code in two different\nsettings: additive, in which the total volume of data seen during pretraining\nis held constant; and competitive, in which the volume of language data is held\nconstant. We study how the pretraining mixture affects performance on (a) a\ndiverse collection of tasks included in the BigBench benchmark, and (b)\ncompositionality, measured by generalization accuracy on semantic parsing and\nsyntactic transformations. We find that pretraining on higher proportions of\ncode improves performance on compositional tasks involving structured output\n(like semantic parsing), and mathematics. Conversely, increase code mixture can\nharm performance on other tasks, including on tasks that requires sensitivity\nto linguistic structure such as syntax or morphology, and tasks measuring\nreal-world knowledge.\n","authors":["Jackson Petty","Sjoerd van Steenkiste","Tal Linzen"],"pdf_url":"https://arxiv.org/pdf/2409.04556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04512v1","updated":"2024-09-06T17:15:17Z","published":"2024-09-06T17:15:17Z","title":"Chain-of-Translation Prompting (CoTR): A Novel Prompting Technique for\n Low Resource Languages","summary":" This paper introduces Chain of Translation Prompting (CoTR), a novel strategy\ndesigned to enhance the performance of language models in low-resource\nlanguages. CoTR restructures prompts to first translate the input context from\na low-resource language into a higher-resource language, such as English. The\nspecified task like generation, classification, or any other NLP function is\nthen performed on the translated text, with the option to translate the output\nback to the original language if needed. All these steps are specified in a\nsingle prompt. 
We demonstrate the effectiveness of this method through a case\nstudy on the low-resource Indic language Marathi. The CoTR strategy is applied\nto various tasks, including sentiment analysis, hate speech classification,\nsubject classification and text generation, and its efficacy is showcased by\ncomparing it with regular prompting methods. Our results underscore the\npotential of translation-based prompting strategies to significantly improve\nmultilingual LLM performance in low-resource languages, offering valuable\ninsights for future research and applications. We specifically see the highest\naccuracy improvements with the hate speech detection task. The technique also\nhas the potential to enhance the quality of synthetic data generation for\nunderrepresented languages using LLMs.\n","authors":["Tejas Deshpande","Nidhi Kowtal","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2409.04512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04507v1","updated":"2024-09-06T16:32:46Z","published":"2024-09-06T16:32:46Z","title":"3D Data Long-Term Preservation in Cultural Heritage","summary":" The report explores the challenges and strategies for preserving 3D digital\ndata in cultural heritage. It discusses the issue of technological\nobsolescence, emphasising the need for ustainable storage solutions and ongoing\ndata management strategies. Key topics include understanding technological\nobsolescence, the lifecycle of digital content, digital continuity, data\nmanagement plans (DMP), FAIR principles, and the use of public repositories.\nThe report also covers the importance of metadata in long-term digital\npreservation, including types of metadata and strategies for building valuable\nmetadata. It examines the evolving standards and interoperability in 3D format\npreservation and the importance of managing metadata and paradata. The document\nprovides a comprehensive overview of the challenges and solutions for\npreserving 3D cultural heritage data in the long term.\n","authors":["Nicola Amico","Achille Felicetti"],"pdf_url":"https://arxiv.org/pdf/2409.04507v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.04440v1","updated":"2024-09-06T17:59:01Z","published":"2024-09-06T17:59:01Z","title":"Synergy and Synchrony in Couple Dances","summary":" This paper asks to what extent social interaction influences one's behavior.\nWe study this in the setting of two dancers dancing as a couple. We first\nconsider a baseline in which we predict a dancer's future moves conditioned\nonly on their past motion without regard to their partner. We then investigate\nthe advantage of taking social information into account by conditioning also on\nthe motion of their dancing partner. We focus our analysis on Swing, a dance\ngenre with tight physical coupling for which we present an in-the-wild video\ndataset. We demonstrate that single-person future motion prediction in this\ncontext is challenging. Instead, we observe that prediction greatly benefits\nfrom considering the interaction partners' behavior, resulting in surprisingly\ncompelling couple dance synthesis results (see supp. video). Our contributions\nare a demonstration of the advantages of socially conditioned future motion\nprediction and an in-the-wild, couple dance video dataset to enable future\nresearch in this direction. 
Video results are available on the project website:\nhttps://von31.github.io/synNsync\n","authors":["Vongani Maluleke","Lea Müller","Jathushan Rajasegaran","Georgios Pavlakos","Shiry Ginosar","Angjoo Kanazawa","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2409.04440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04429v1","updated":"2024-09-06T17:49:56Z","published":"2024-09-06T17:49:56Z","title":"VILA-U: a Unified Foundation Model Integrating Visual Understanding and\n Generation","summary":" VILA-U is a Unified foundation model that integrates Video, Image, Language\nunderstanding and generation. Traditional visual language models (VLMs) use\nseparate modules for understanding and generating visual content, which can\nlead to misalignment and increased complexity. In contrast, VILA-U employs a\nsingle autoregressive next-token prediction framework for both tasks,\neliminating the need for additional components like diffusion models. This\napproach not only simplifies the model but also achieves near state-of-the-art\nperformance in visual language understanding and generation. The success of\nVILA-U is attributed to two main factors: the unified vision tower that aligns\ndiscrete visual tokens with textual inputs during pretraining, which enhances\nvisual perception, and autoregressive image generation can achieve similar\nquality as diffusion models with high-quality dataset. This allows VILA-U to\nperform comparably to more complex models using a fully token-based\nautoregressive framework.\n","authors":["Yecheng Wu","Zhuoyang Zhang","Junyu Chen","Haotian Tang","Dacheng Li","Yunhao Fang","Ligeng Zhu","Enze Xie","Hongxu Yin","Li Yi","Song Han","Yao Lu"],"pdf_url":"https://arxiv.org/pdf/2409.04429v1.pdf","comment":"11 pages, 7 figures, 8 tables"},{"id":"http://arxiv.org/abs/2409.04424v1","updated":"2024-09-06T17:36:08Z","published":"2024-09-06T17:36:08Z","title":"Exploring Foundation Models for Synthetic Medical Imaging: A Study on\n Chest X-Rays and Fine-Tuning Techniques","summary":" Machine learning has significantly advanced healthcare by aiding in disease\nprevention and treatment identification. However, accessing patient data can be\nchallenging due to privacy concerns and strict regulations. Generating\nsynthetic, realistic data offers a potential solution for overcoming these\nlimitations, and recent studies suggest that fine-tuning foundation models can\nproduce such data effectively. In this study, we explore the potential of\nfoundation models for generating realistic medical images, particularly chest\nx-rays, and assess how their performance improves with fine-tuning. We propose\nusing a Latent Diffusion Model, starting with a pre-trained foundation model\nand refining it through various configurations. 
Additionally, we performed\nexperiments with input from a medical professional to assess the realism of the\nimages produced by each trained model.\n","authors":["Davide Clode da Silva","Marina Musse Bernardes","Nathalia Giacomini Ceretta","Gabriel Vaz de Souza","Gabriel Fonseca Silva","Rafael Heitor Bordini","Soraia Raupp Musse"],"pdf_url":"https://arxiv.org/pdf/2409.04424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16879v2","updated":"2024-09-06T17:17:16Z","published":"2024-08-29T20:05:02Z","title":"MSLIQA: Enhancing Learning Representations for Image Quality Assessment\n through Multi-Scale Learning","summary":" No-Reference Image Quality Assessment (NR-IQA) remains a challenging task due\nto the diversity of distortions and the lack of large annotated datasets. Many\nstudies have attempted to tackle these challenges by developing more accurate\nNR-IQA models, often employing complex and computationally expensive networks,\nor by bridging the domain gap between various distortions to enhance\nperformance on test datasets. In our work, we improve the performance of a\ngeneric lightweight NR-IQA model by introducing a novel augmentation strategy\nthat boosts its performance by almost 28\\%. This augmentation strategy enables\nthe network to better discriminate between different distortions in various\nparts of the image by zooming in and out. Additionally, the inclusion of\ntest-time augmentation further enhances performance, making our lightweight\nnetwork's results comparable to the current state-of-the-art models, simply\nthrough the use of augmentations.\n","authors":["Nasim Jamshidi Avanaki","Abhijay Ghildyal","Nabajeet Barman","Saman Zadtootaghaj"],"pdf_url":"https://arxiv.org/pdf/2408.16879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17057v2","updated":"2024-09-06T17:15:49Z","published":"2024-08-30T07:32:19Z","title":"LAR-IQA: A Lightweight, Accurate, and Robust No-Reference Image Quality\n Assessment Model","summary":" Recent advancements in the field of No-Reference Image Quality Assessment\n(NR-IQA) using deep learning techniques demonstrate high performance across\nmultiple open-source datasets. However, such models are typically very large\nand complex making them not so suitable for real-world deployment, especially\non resource- and battery-constrained mobile devices. To address this\nlimitation, we propose a compact, lightweight NR-IQA model that achieves\nstate-of-the-art (SOTA) performance on ECCV AIM UHD-IQA challenge validation\nand test datasets while being also nearly 5.7 times faster than the fastest\nSOTA model. Our model features a dual-branch architecture, with each branch\nseparately trained on synthetically and authentically distorted images which\nenhances the model's generalizability across different distortion types. To\nimprove robustness under diverse real-world visual conditions, we additionally\nincorporate multiple color spaces during the training process. We also\ndemonstrate the higher accuracy of recently proposed Kolmogorov-Arnold Networks\n(KANs) for final quality regression as compared to the conventional Multi-Layer\nPerceptrons (MLPs). Our evaluation considering various open-source datasets\nhighlights the practical, high-accuracy, and robust performance of our proposed\nlightweight model. 
Code: https://github.com/nasimjamshidi/LAR-IQA.\n","authors":["Nasim Jamshidi Avanaki","Abhijay Ghildyal","Nabajeet Barman","Saman Zadtootaghaj"],"pdf_url":"https://arxiv.org/pdf/2408.17057v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04410v1","updated":"2024-09-06T17:14:53Z","published":"2024-09-06T17:14:53Z","title":"Open-MAGVIT2: An Open-Source Project Toward Democratizing\n Auto-regressive Visual Generation","summary":" We present Open-MAGVIT2, a family of auto-regressive image generation models\nranging from 300M to 1.5B. The Open-MAGVIT2 project produces an open-source\nreplication of Google's MAGVIT-v2 tokenizer, a tokenizer with a super-large\ncodebook (i.e., $2^{18}$ codes), and achieves the state-of-the-art\nreconstruction performance (1.17 rFID) on ImageNet $256 \\times 256$.\nFurthermore, we explore its application in plain auto-regressive models and\nvalidate scalability properties. To assist auto-regressive models in predicting\nwith a super-large vocabulary, we factorize it into two sub-vocabulary of\ndifferent sizes by asymmetric token factorization, and further introduce \"next\nsub-token prediction\" to enhance sub-token interaction for better generation\nquality. We release all models and codes to foster innovation and creativity in\nthe field of auto-regressive visual generation.\n","authors":["Zhuoyan Luo","Fengyuan Shi","Yixiao Ge","Yujiu Yang","Limin Wang","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2409.04410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04409v1","updated":"2024-09-06T17:13:14Z","published":"2024-09-06T17:13:14Z","title":"Train Till You Drop: Towards Stable and Robust Source-free Unsupervised\n 3D Domain Adaptation","summary":" We tackle the challenging problem of source-free unsupervised domain\nadaptation (SFUDA) for 3D semantic segmentation. It amounts to performing\ndomain adaptation on an unlabeled target domain without any access to source\ndata; the available information is a model trained to achieve good performance\non the source domain. A common issue with existing SFUDA approaches is that\nperformance degrades after some training time, which is a by product of an\nunder-constrained and ill-posed problem. We discuss two strategies to alleviate\nthis issue. First, we propose a sensible way to regularize the learning\nproblem. Second, we introduce a novel criterion based on agreement with a\nreference model. It is used (1) to stop the training when appropriate and (2)\nas validator to select hyperparameters without any knowledge on the target\ndomain. Our contributions are easy to implement and readily amenable for all\nSFUDA methods, ensuring stable improvements over all baselines. We validate our\nfindings on various 3D lidar settings, achieving state-of-the-art performance.\nThe project repository (with code) is: github.com/valeoai/TTYD.\n","authors":["Björn Michele","Alexandre Boulch","Tuan-Hung Vu","Gilles Puy","Renaud Marlet","Nicolas Courty"],"pdf_url":"https://arxiv.org/pdf/2409.04409v1.pdf","comment":"Accepted to ECCV 2024. 
Project repository: github.com/valeoai/TTYD"},{"id":"http://arxiv.org/abs/2409.02919v2","updated":"2024-09-06T16:51:09Z","published":"2024-09-04T17:58:08Z","title":"HiPrompt: Tuning-free Higher-Resolution Generation with Hierarchical\n MLLM Prompts","summary":" The potential for higher-resolution image generation using pretrained\ndiffusion models is immense, yet these models often struggle with issues of\nobject repetition and structural artifacts especially when scaling to 4K\nresolution and higher. We figure out that the problem is caused by that, a\nsingle prompt for the generation of multiple scales provides insufficient\nefficacy. In response, we propose HiPrompt, a new tuning-free solution that\ntackles the above problems by introducing hierarchical prompts. The\nhierarchical prompts offer both global and local guidance. Specifically, the\nglobal guidance comes from the user input that describes the overall content,\nwhile the local guidance utilizes patch-wise descriptions from MLLMs to\nelaborately guide the regional structure and texture generation. Furthermore,\nduring the inverse denoising process, the generated noise is decomposed into\nlow- and high-frequency spatial components. These components are conditioned on\nmultiple prompt levels, including detailed patch-wise descriptions and broader\nimage-level prompts, facilitating prompt-guided denoising under hierarchical\nsemantic guidance. It further allows the generation to focus more on local\nspatial regions and ensures the generated images maintain coherent local and\nglobal semantics, structures, and textures with high definition. Extensive\nexperiments demonstrate that HiPrompt outperforms state-of-the-art works in\nhigher-resolution image generation, significantly reducing object repetition\nand enhancing structural quality.\n","authors":["Xinyu Liu","Yingqing He","Lanqing Guo","Xiang Li","Bu Jin","Peng Li","Yan Li","Chi-Min Chan","Qifeng Chen","Wei Xue","Wenhan Luo","Qingfeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2409.02919v2.pdf","comment":"https://liuxinyv.github.io/HiPrompt/"},{"id":"http://arxiv.org/abs/2409.04398v1","updated":"2024-09-06T16:43:04Z","published":"2024-09-06T16:43:04Z","title":"HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale\n Space Using Wearable IMUs and LiDAR","summary":" We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture\nmethod, aimed at accurately and efficiently creating a dynamic digital world,\ncontaining large-scale indoor-outdoor scenes, diverse human motions, rich\nhuman-human interactions, and human-environment interactions. By utilizing\nbody-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human\nmotions in unconstrained space without the need for external devices and\npre-built maps. This affords great flexibility and accessibility for\nhuman-centered interaction and 4D scene capturing in various environments.\nTaking into account that IMUs can capture human spatially unrestricted poses\nbut are prone to drifting for long-period using, and while LiDAR is stable for\nglobal localization but rough for local positions and orientations, HiSC4D\nemploys a joint optimization method, harmonizing all sensors and utilizing\nenvironment cues, yielding promising results for long-term capture in large\nscenes. 
To promote research of egocentric human interaction in large scenes and\nfacilitate downstream tasks, we also present a dataset, containing 8 sequences\nin 4 large scenes (200 to 5,000 $m^2$), providing 36k frames of accurate 4D\nhuman motions with SMPL annotations and dynamic scenes, 31k frames of cropped\nhuman point clouds, and scene mesh of the environment. A variety of scenarios,\nsuch as the basketball gym and commercial street, alongside challenging human\nmotions, such as daily greeting, one-on-one basketball playing, and tour\nguiding, demonstrate the effectiveness and the generalization ability of\nHiSC4D. The dataset and code will be publicated on\nwww.lidarhumanmotion.net/hisc4d available for research purposes.\n","authors":["Yudi Dai","Zhiyong Wang","Xiping Lin","Chenglu Wen","Lan Xu","Siqi Shen","Yuexin Ma","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04398v1.pdf","comment":"17 pages, 10 figures, Jornal"},{"id":"http://arxiv.org/abs/2409.04390v1","updated":"2024-09-06T16:29:04Z","published":"2024-09-06T16:29:04Z","title":"Future Does Matter: Boosting 3D Object Detection with Temporal Motion\n Estimation in Point Cloud Sequences","summary":" Accurate and robust LiDAR 3D object detection is essential for comprehensive\nscene understanding in autonomous driving. Despite its importance, LiDAR\ndetection performance is limited by inherent constraints of point cloud data,\nparticularly under conditions of extended distances and occlusions. Recently,\ntemporal aggregation has been proven to significantly enhance detection\naccuracy by fusing multi-frame viewpoint information and enriching the spatial\nrepresentation of objects. In this work, we introduce a novel LiDAR 3D object\ndetection framework, namely LiSTM, to facilitate spatial-temporal feature\nlearning with cross-frame motion forecasting information. We aim to improve the\nspatial-temporal interpretation capabilities of the LiDAR detector by\nincorporating a dynamic prior, generated from a non-learnable motion estimation\nmodel. Specifically, Motion-Guided Feature Aggregation (MGFA) is proposed to\nutilize the object trajectory from previous and future motion states to model\nspatial-temporal correlations into gaussian heatmap over a driving sequence.\nThis motion-based heatmap then guides the temporal feature fusion, enriching\nthe proposed object features. Moreover, we design a Dual Correlation Weighting\nModule (DCWM) that effectively facilitates the interaction between past and\nprospective frames through scene- and channel-wise feature abstraction. In the\nend, a cascade cross-attention-based decoder is employed to refine the 3D\nprediction. We have conducted experiments on the Waymo and nuScenes datasets to\ndemonstrate that the proposed framework achieves superior 3D detection\nperformance with effective spatial-temporal feature learning.\n","authors":["Rui Yu","Runkai Zhao","Cong Nie","Heng Wang","HuaiCheng Yan","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04388v1","updated":"2024-09-06T16:27:52Z","published":"2024-09-06T16:27:52Z","title":"Question-Answering Dense Video Events","summary":" Multimodal Large Language Models (MLLMs) have shown excellent performance in\nquestion-answering of single-event videos. 
In this paper, we present\nquestion-answering dense video events, a novel task that requires answering and\ngrounding the dense-event questions in long videos, thus challenging MLLMs to\nfaithfully comprehend and reason about multiple events occurring over extended\ntime periods. To facilitate the study, we construct DeVE-QA - a dataset\nfeaturing 78K questions about 26K events on 10.6K long videos. We then\nbenchmark and show that existing MLLMs excelling at single-event QA struggle to\nperform well in DeVE-QA. For improvement, we propose DeVi, a novel\ntraining-free MLLM approach that highlights a hierarchical captioning module, a\ntemporal event memory module, and a self-consistency checking module to\nrespectively detect, contextualize and memorize, and ground dense-events in\nlong videos for question answering. Extensive experiments show that DeVi is\nsuperior at answering dense-event questions and grounding relevant video\nmoments. Compared with existing MLLMs, it achieves a remarkable increase of 4.1\npercent and 3.7 percent for G(round)QA accuracy on DeVE-QA and NExT-GQA\nrespectively.\n","authors":["Hangyu Qin","Junbin Xiao","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2409.04388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15761v3","updated":"2024-09-06T16:23:26Z","published":"2024-02-24T08:20:39Z","title":"Res-VMamba: Fine-Grained Food Category Visual Classification Using\n Selective State Space Models with Deep Residual Learning","summary":" Food classification is the foundation for developing food vision tasks and\nplays a key role in the burgeoning field of computational nutrition. Due to the\ncomplexity of food requiring fine-grained classification, recent academic\nresearch mainly modifies Convolutional Neural Networks (CNNs) and/or Vision\nTransformers (ViTs) to perform food category classification. However, to learn\nfine-grained features, the CNN backbone needs additional structural design,\nwhereas ViT, containing the self-attention module, has increased computational\ncomplexity. In recent months, a new Sequence State Space (S4) model, through a\nSelection mechanism and computation with a Scan (S6), colloquially termed\nMamba, has demonstrated superior performance and computation efficiency\ncompared to the Transformer architecture. The VMamba model, which incorporates\nthe Mamba mechanism into image tasks (such as classification), currently\nestablishes the state-of-the-art (SOTA) on the ImageNet dataset. In this\nresearch, we introduce an academically underestimated food dataset CNFOOD-241,\nand pioneer the integration of a residual learning framework within the VMamba\nmodel to concurrently harness both global and local state features inherent in\nthe original VMamba architectural design. The research results show that VMamba\nsurpasses current SOTA models in fine-grained and food classification. The\nproposed Res-VMamba further improves the classification accuracy to 79.54\\%\nwithout pretrained weight. Our findings elucidate that our proposed methodology\nestablishes a new benchmark for SOTA performance in food recognition on the\nCNFOOD-241 dataset. 
The code can be obtained on GitHub:\nhttps://github.com/ChiShengChen/ResVMamba.\n","authors":["Chi-Sheng Chen","Guan-Ying Chen","Dong Zhou","Di Jiang","Dai-Shi Chen"],"pdf_url":"https://arxiv.org/pdf/2402.15761v3.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.00381v2","updated":"2024-09-06T16:20:24Z","published":"2024-08-31T08:17:24Z","title":"3D Gaussian Splatting for Large-scale 3D Surface Reconstruction from\n Aerial Images","summary":" Recently, 3D Gaussian Splatting (3DGS) has garnered significant attention.\nHowever, the unstructured nature of 3DGS poses challenges for large-scale\nsurface reconstruction from aerial images. To address this gap, we propose the\nfirst large-scale surface reconstruction method for multi-view stereo (MVS)\naerial images based on 3DGS, named Aerial Gaussian Splatting (AGS). Initially,\nwe introduce a data chunking method tailored for large-scale aerial imagery,\nmaking the modern 3DGS technology feasible for surface reconstruction over\nextensive scenes. Additionally, we integrate the Ray-Gaussian Intersection\nmethod to obtain normal and depth information, facilitating geometric\nconstraints. Finally, we introduce a multi-view geometric consistency\nconstraint to enhance global geometric consistency and improve reconstruction\naccuracy. Our experiments on multiple datasets demonstrate for the first time\nthat the GS-based technique can match traditional aerial MVS methods on\ngeometric accuracy, and beat state-of-the-art GS-based methods on geometry and\nrendering quality.\n","authors":["YuanZheng Wu","Jin Liu","Shunping Ji"],"pdf_url":"https://arxiv.org/pdf/2409.00381v2.pdf","comment":"In the writing, some parts of the book were wrong and needed a large\n revision"},{"id":"http://arxiv.org/abs/2409.04384v1","updated":"2024-09-06T16:20:24Z","published":"2024-09-06T16:20:24Z","title":"Empirical Bayesian image restoration by Langevin sampling with a\n denoising diffusion implicit prior","summary":" Score-based diffusion methods provide a powerful strategy to solve image\nrestoration tasks by flexibly combining a pre-trained foundational prior model\nwith a likelihood function specified during test time. Such methods are\npredominantly derived from two stochastic processes: reversing\nOrnstein-Uhlenbeck, which underpins the celebrated denoising diffusion\nprobabilistic models (DDPM) and denoising diffusion implicit models (DDIM), and\nthe Langevin diffusion process. The solutions delivered by DDPM and DDIM are\noften remarkably realistic, but they are not always consistent with\nmeasurements because of likelihood intractability issues and the associated\nrequired approximations. Alternatively, using a Langevin process circumvents\nthe intractable likelihood issue, but usually leads to restoration results of\ninferior quality and longer computing times. This paper presents a novel and\nhighly computationally efficient image restoration method that carefully embeds\na foundational DDPM denoiser within an empirical Bayesian Langevin algorithm,\nwhich jointly calibrates key model hyper-parameters as it estimates the model's\nposterior mean. 
Extensive experimental results on three canonical tasks (image\ndeblurring, super-resolution, and inpainting) demonstrate that the proposed\napproach improves on state-of-the-art strategies both in image estimation\naccuracy and computing time.\n","authors":["Charlesquin Kemajou Mbakam","Jean-Francois Giovannelli","Marcelo Pereyra"],"pdf_url":"https://arxiv.org/pdf/2409.04384v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2409.04381v1","updated":"2024-09-06T16:19:01Z","published":"2024-09-06T16:19:01Z","title":"Enhancing Skin Lesion Diagnosis with Ensemble Learning","summary":" Skin lesions are an increasingly significant medical concern, varying widely\nin severity from benign to cancerous. Accurate diagnosis is essential for\nensuring timely and appropriate treatment. This study examines the\nimplementation of deep learning methods to assist in the diagnosis of skin\nlesions using the HAM10000 dataset, which contains seven distinct types of\nlesions. First, we evaluated three pre-trained models: MobileNetV2, ResNet18,\nand VGG11, achieving accuracies of 0.798, 0.802, and 0.805, respectively. To\nfurther enhance classification accuracy, we developed ensemble models employing\nmax voting, average voting, and stacking, resulting in accuracies of 0.803,\n0.82, and 0.83. Building on the best-performing ensemble learning model,\nstacking, we developed our proposed model, SkinNet, which incorporates a\ncustomized architecture and fine-tuning, achieving an accuracy of 0.867 and an\nAUC of 0.96. This substantial improvement over individual models demonstrates\nthe effectiveness of ensemble learning in improving skin lesion classification.\n","authors":["Xiaoyi Liu","Zhou Yu","Lianghao Tan","Yafeng Yan","Ge Shi"],"pdf_url":"https://arxiv.org/pdf/2409.04381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04368v1","updated":"2024-09-06T15:59:30Z","published":"2024-09-06T15:59:30Z","title":"The Impact of Scanner Domain Shift on Deep Learning Performance in\n Medical Imaging: an Experimental Study","summary":" Purpose: Medical images acquired using different scanners and protocols can\ndiffer substantially in their appearance. This phenomenon, scanner domain\nshift, can result in a drop in the performance of deep neural networks which\nare trained on data acquired by one scanner and tested on another. This\nsignificant practical issue is well-acknowledged, however, no systematic study\nof the issue is available across different modalities and diagnostic tasks.\nMaterials and Methods: In this paper, we present a broad experimental study\nevaluating the impact of scanner domain shift on convolutional neural network\nperformance for different automated diagnostic tasks. We evaluate this\nphenomenon in common radiological modalities, including X-ray, CT, and MRI.\nResults: We find that network performance on data from a different scanner is\nalmost always worse than on same-scanner data, and we quantify the degree of\nperformance drop across different datasets. Notably, we find that this drop is\nmost severe for MRI, moderate for X-ray, and quite small for CT, on average,\nwhich we attribute to the standardized nature of CT acquisition systems which\nis not present in MRI or X-ray. We also study how injecting varying amounts of\ntarget domain data into the training set, as well as adding noise to the\ntraining data, helps with generalization. 
Conclusion: Our results provide\nextensive experimental evidence and quantification of the extent of performance\ndrop caused by scanner domain shift in deep learning across different\nmodalities, with the goal of guiding the future development of robust deep\nlearning models for medical image analysis.\n","authors":["Gregory Szumel","Brian Guo","Darui Lu","Rongze Gui","Tingyu Wang","Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2409.04368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03035v2","updated":"2024-09-06T15:52:16Z","published":"2024-08-06T08:31:34Z","title":"Training-Free Condition Video Diffusion Models for single frame\n Spatial-Semantic Echocardiogram Synthesis","summary":" Conditional video diffusion models (CDM) have shown promising results for\nvideo synthesis, potentially enabling the generation of realistic\nechocardiograms to address the problem of data scarcity. However, current CDMs\nrequire a paired segmentation map and echocardiogram dataset. We present a new\nmethod called Free-Echo for generating realistic echocardiograms from a single\nend-diastolic segmentation map without additional training data. Our method is\nbased on the 3D-Unet with Temporal Attention Layers model and is conditioned on\nthe segmentation map using a training-free conditioning method based on SDEdit.\nWe evaluate our model on two public echocardiogram datasets, CAMUS and\nEchoNet-Dynamic. We show that our model can generate plausible echocardiograms\nthat are spatially aligned with the input segmentation map, achieving\nperformance comparable to training-based CDMs. Our work opens up new\npossibilities for generating echocardiograms from a single segmentation map,\nwhich can be used for data augmentation, domain adaptation, and other\napplications in medical imaging. Our code is available at\n\\url{https://github.com/gungui98/echo-free}\n","authors":["Van Phi Nguyen","Tri Nhan Luong Ha","Huy Hieu Pham","Quoc Long Tran"],"pdf_url":"https://arxiv.org/pdf/2408.03035v2.pdf","comment":"Accepted to MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.04363v1","updated":"2024-09-06T15:49:49Z","published":"2024-09-06T15:49:49Z","title":"RCNet: Deep Recurrent Collaborative Network for Multi-View Low-Light\n Image Enhancement","summary":" Scene observation from multiple perspectives would bring a more comprehensive\nvisual experience. However, in the context of acquiring multiple views in the\ndark, the highly correlated views are seriously alienated, making it\nchallenging to improve scene understanding with auxiliary views. Recent single\nimage-based enhancement methods may not be able to provide consistently\ndesirable restoration performance for all views due to the ignorance of\npotential feature correspondence among different views. To alleviate this\nissue, we make the first attempt to investigate multi-view low-light image\nenhancement. First, we construct a new dataset called Multi-View Low-light\nTriplets (MVLT), including 1,860 pairs of triple images with large illumination\nranges and wide noise distribution. Each triplet is equipped with three\ndifferent viewpoints towards the same scene. Second, we propose a deep\nmulti-view enhancement framework based on the Recurrent Collaborative Network\n(RCNet). 
Specifically, in order to benefit from similar texture correspondence\nacross different views, we design the recurrent feature enhancement, alignment\nand fusion (ReEAF) module, in which intra-view feature enhancement (Intra-view\nEN) followed by inter-view feature alignment and fusion (Inter-view AF) is\nperformed to model the intra-view and inter-view feature propagation\nsequentially via multi-view collaboration. In addition, two different modules\nfrom enhancement to alignment (E2A) and from alignment to enhancement (A2E) are\ndeveloped to enable the interactions between Intra-view EN and Inter-view AF,\nwhich explicitly utilize attentive feature weighting and sampling for\nenhancement and alignment, respectively. Experimental results demonstrate that\nour RCNet significantly outperforms other state-of-the-art methods. All of our\ndataset, code, and model will be available at https://github.com/hluo29/RCNet.\n","authors":["Hao Luo","Baoliang Chen","Lingyu Zhu","Peilin Chen","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04363v1.pdf","comment":"14 Pages, 10 Figures, Under Review"},{"id":"http://arxiv.org/abs/2409.04360v1","updated":"2024-09-06T15:42:10Z","published":"2024-09-06T15:42:10Z","title":"Connectivity-Inspired Network for Context-Aware Recognition","summary":" The aim of this paper is threefold. We inform the AI practitioner about the\nhuman visual system with an extensive literature review; we propose a novel\nbiologically motivated neural network for image classification; and, finally,\nwe present a new plug-and-play module to model context awareness. We focus on\nthe effect of incorporating circuit motifs found in biological brains to\naddress visual recognition. Our convolutional architecture is inspired by the\nconnectivity of human cortical and subcortical streams, and we implement\nbottom-up and top-down modulations that mimic the extensive afferent and\nefferent connections between visual and cognitive areas. Our Contextual\nAttention Block is simple and effective and can be integrated with any\nfeed-forward neural network. It infers weights that multiply the feature maps\naccording to their causal influence on the scene, modeling the co-occurrence of\ndifferent objects in the image. We place our module at different bottlenecks to\ninfuse a hierarchical context awareness into the model. We validated our\nproposals through image classification experiments on benchmark data and found\na consistent improvement in performance and the robustness of the produced\nexplanations via class activation. Our code is available at\nhttps://github.com/gianlucarloni/CoCoReco.\n","authors":["Gianluca Carloni","Sara Colantonio"],"pdf_url":"https://arxiv.org/pdf/2409.04360v1.pdf","comment":"ECCV 2024 - HCV Workshop, Accepted for presentation, Submitted\n Manuscript Version (adapted to include author names, Acknowledgements, and\n reference DOIs): the version of the manuscript improved after peer review\n will appear in the Proceedings later"},{"id":"http://arxiv.org/abs/2409.04356v1","updated":"2024-09-06T15:40:47Z","published":"2024-09-06T15:40:47Z","title":"Serp-Mamba: Advancing High-Resolution Retinal Vessel Segmentation with\n Selective State-Space Model","summary":" Ultra-Wide-Field Scanning Laser Ophthalmoscopy (UWF-SLO) images capture\nhigh-resolution views of the retina with typically 200 spanning degrees.\nAccurate segmentation of vessels in UWF-SLO images is essential for detecting\nand diagnosing fundus disease. 
Recent studies have revealed that the selective\nState Space Model (SSM) in Mamba performs well in modeling long-range\ndependencies, which is crucial for capturing the continuity of elongated vessel\nstructures. Inspired by this, we propose the first Serpentine Mamba\n(Serp-Mamba) network to address this challenging task. Specifically, we\nrecognize the intricate, varied, and delicate nature of the tubular structure\nof vessels. Furthermore, the high-resolution of UWF-SLO images exacerbates the\nimbalance between the vessel and background categories. Based on the above\nobservations, we first devise a Serpentine Interwoven Adaptive (SIA) scan\nmechanism, which scans UWF-SLO images along curved vessel structures in a\nsnake-like crawling manner. This approach, consistent with vascular texture\ntransformations, ensures the effective and continuous capture of curved\nvascular structure features. Second, we propose an Ambiguity-Driven Dual\nRecalibration (ADDR) module to address the category imbalance problem\nintensified by high-resolution images. Our ADDR module delineates pixels by two\nlearnable thresholds and refines ambiguous pixels through a dual-driven\nstrategy, thereby accurately distinguishing vessels and background regions.\nExperiment results on three datasets demonstrate the superior performance of\nour Serp-Mamba on high-resolution vessel segmentation. We also conduct a series\nof ablation studies to verify the impact of our designs. Our code shall be\nreleased upon publication of this work.\n","authors":["Hongqiu Wang","Yixian Chen","Wu Chen","Huihui Xu","Haoyu Zhao","Bin Sheng","Huazhu Fu","Guang Yang","Lei Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.04356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04345v1","updated":"2024-09-06T15:27:51Z","published":"2024-09-06T15:27:51Z","title":"Computer-Generated Sand Mixtures and Sand-based Images","summary":" This paper aims to verify the effectiveness of the software implementation of\nthe proposed algorithm in creating computer-generated images of sand mixtures\nusing a photograph of sand as an input and its effectiveness in converting\ndigital pictures into sand-based images out of the mixtures it generated. The\nmethod of this paper is to visually compare the photographed image of the\nactual mixtures to its computer-generated counterpart to verify if the mixture\ngeneration produces results as expected and compare the computer-generated\nsand-based images with its source to verify image reproduction maintains same\nimage content. The results of the mixture comparison shows that the actual and\nthe computer-generated ones have similar overall shade and color. Still, the\ngenerated one has a rougher texture and higher contrast due to the method of\ninheriting visual features by pixel, not by individual sand particles. The\ncomparison of the sand-based image and its source has demonstrated the\nsoftware's ability to maintain the essence of its contents during conversion\nwhile replacing its texture with the visual properties of the generated sand\nmixture. The result have shown that the software implementation of the proposed\nalgorithm can effectively use the images of sand to generate images of its\nmixtures and use those mixture images to convert a digital picture into a\ncomputer-generated sand-based image.\n","authors":["Ryan A. Subong","Alma Jean D. 
Subong"],"pdf_url":"https://arxiv.org/pdf/2409.04345v1.pdf","comment":"12 pages, 8 figures, 2nd International Research Conference on\n Computer Engineering and Technology Education"},{"id":"http://arxiv.org/abs/2408.16005v3","updated":"2024-09-06T15:15:08Z","published":"2024-08-13T20:00:36Z","title":"Many-Worlds Inverse Rendering","summary":" Discontinuous visibility changes remain a major bottleneck when optimizing\nsurfaces within a physically-based inverse renderer. Many previous works have\nproposed sophisticated algorithms and data structures to sample visibility\nsilhouettes more efficiently.\n Our work presents another solution: instead of differentiating a tentative\nsurface locally, we differentiate a volumetric perturbation of a surface. We\nrefer this as a many-worlds representation because it models a non-interacting\nsuperposition of conflicting explanations (worlds) of the input dataset. Each\nworld is optically isolated from others, leading to a new transport law that\ndistinguishes our method from prior work based on exponential random media.\n The resulting Monte Carlo algorithm is simpler and more efficient than prior\nmethods. We demonstrate that our method promotes rapid convergence, both in\nterms of the total iteration count and the cost per iteration.\n","authors":["Ziyi Zhang","Nicolas Roussel","Wenzel Jakob"],"pdf_url":"https://arxiv.org/pdf/2408.16005v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14874v4","updated":"2024-09-06T15:11:19Z","published":"2024-04-01T14:18:15Z","title":"Open-Vocabulary Object Detectors: Robustness Challenges under\n Distribution Shifts","summary":" The challenge of Out-Of-Distribution (OOD) robustness remains a critical\nhurdle towards deploying deep vision models. Vision-Language Models (VLMs) have\nrecently achieved groundbreaking results. VLM-based open-vocabulary object\ndetection extends the capabilities of traditional object detection frameworks,\nenabling the recognition and classification of objects beyond predefined\ncategories. Investigating OOD robustness in recent open-vocabulary object\ndetection is essential to increase the trustworthiness of these models. This\nstudy presents a comprehensive robustness evaluation of the zero-shot\ncapabilities of three recent open-vocabulary (OV) foundation object detection\nmodels: OWL-ViT, YOLO World, and Grounding DINO. Experiments carried out on the\nrobustness benchmarks COCO-O, COCO-DC, and COCO-C encompassing distribution\nshifts due to information loss, corruption, adversarial attacks, and\ngeometrical deformation, highlighting the challenges of the model's robustness\nto foster the research for achieving robustness. Project page:\nhttps://prakashchhipa.github.io/projects/ovod_robustness\n","authors":["Prakash Chandra Chhipa","Kanjar De","Meenakshi Subhash Chippa","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2405.14874v4.pdf","comment":"Accepted at 2024 European Conference on Computer Vision Workshops\n (ECCVW). Project page -\n https://prakashchhipa.github.io/projects/ovod_robustness"},{"id":"http://arxiv.org/abs/2401.08281v2","updated":"2024-09-06T15:08:03Z","published":"2024-01-16T11:12:36Z","title":"The Faiss library","summary":" Vector databases typically manage large collections of embedding vectors.\nCurrently, AI applications are growing rapidly, and so is the number of\nembeddings that need to be stored and indexed. The Faiss library is dedicated\nto vector similarity search, a core functionality of vector databases. 
Faiss is\na toolkit of indexing methods and related primitives used to search, cluster,\ncompress and transform vectors. This paper describes the trade-off space of\nvector search and the design principles of Faiss in terms of structure,\napproach to optimization and interfacing. We benchmark key features of the\nlibrary and discuss a few selected applications to highlight its broad\napplicability.\n","authors":["Matthijs Douze","Alexandr Guzhva","Chengqi Deng","Jeff Johnson","Gergely Szilvasy","Pierre-Emmanuel Mazaré","Maria Lomeli","Lucas Hosseini","Hervé Jégou"],"pdf_url":"https://arxiv.org/pdf/2401.08281v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04330v1","updated":"2024-09-06T15:05:32Z","published":"2024-09-06T15:05:32Z","title":"How to Identify Good Superpixels for Deforestation Detection on Tropical\n Rainforests","summary":" The conservation of tropical forests is a topic of significant social and\necological relevance due to their crucial role in the global ecosystem.\nUnfortunately, deforestation and degradation impact millions of hectares\nannually, requiring government or private initiatives for effective forest\nmonitoring. However, identifying deforested regions in satellite images is\nchallenging due to data imbalance, image resolution, low-contrast regions, and\nocclusion. Superpixel segmentation can overcome these drawbacks, reducing\nworkload and preserving important image boundaries. However, most works for\nremote sensing images do not exploit recent superpixel methods. In this work,\nwe evaluate 16 superpixel methods in satellite images to support a\ndeforestation detection system in tropical forests. We also assess the\nperformance of superpixel methods for the target task, establishing a\nrelationship with segmentation methodological evaluation. According to our\nresults, ERS, GMMSP, and DISF perform best on UE, BR, and SIRS, respectively,\nwhereas ERS has the best trade-off with CO and Reg. In classification, SH,\nDISF, and ISF perform best on RGB, UMDA, and PCA compositions, respectively.\nAccording to our experiments, superpixel methods with better trade-offs between\ndelineation, homogeneity, compactness, and regularity are more suitable for\nidentifying good superpixels for deforestation detection tasks.\n","authors":["Isabela Borlido","Eduardo Bouhid","Victor Sundermann","Hugo Resende","Alvaro Luiz Fazenda","Fabio Faria","Silvio Jamil F. Guimarães"],"pdf_url":"https://arxiv.org/pdf/2409.04330v1.pdf","comment":"8 pages, 3 figures, paper accepted for publication at the IEEE GRSL"},{"id":"http://arxiv.org/abs/2406.04888v2","updated":"2024-09-06T14:55:48Z","published":"2024-06-07T12:33:59Z","title":"Zero-Shot Video Editing through Adaptive Sliding Score Distillation","summary":" The rapidly evolving field of Text-to-Video generation (T2V) has catalyzed\nrenewed interest in controllable video editing research. While the application\nof editing prompts to guide diffusion model denoising has gained prominence,\nmirroring advancements in image editing, this noise-based inference process\ninherently compromises the original video's integrity, resulting in unintended\nover-editing and temporal discontinuities. To address these challenges, this\nstudy proposes a novel paradigm of video-based score distillation, facilitating\ndirect manipulation of original video content. 
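As a usage illustration of the library described in "The Faiss library" abstract above, a minimal sketch of exact L2 similarity search, assuming the faiss-cpu package and random placeholder data; index choice and sizes are arbitrary:

```python
import numpy as np
import faiss  # assumes the faiss-cpu package is installed

d = 64                                                    # vector dimensionality
database = np.random.random((10000, d)).astype("float32")
queries = np.random.random((5, d)).astype("float32")

index = faiss.IndexFlatL2(d)          # exact (brute-force) L2 index
index.add(database)                   # index the database vectors
distances, neighbors = index.search(queries, 4)   # 4 nearest neighbors per query
print(neighbors.shape)                # (5, 4)
```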
Specifically, distinguishing it\nfrom image-based score distillation, we propose an Adaptive Sliding Score\nDistillation strategy, which incorporates both global and local video guidance\nto reduce the impact of editing errors. Combined with our proposed Image-based\nJoint Guidance mechanism, it has the ability to mitigate the inherent\ninstability of the T2V model and single-step sampling. Additionally, we design\na Weighted Attention Fusion module to further preserve the key features of the\noriginal video and avoid over-editing. Extensive experiments demonstrate that\nthese strategies effectively address existing challenges, achieving superior\nperformance compared to current state-of-the-art methods.\n","authors":["Lianghan Zhu","Yanqi Bao","Jing Huo","Jing Wu","Yu-Kun Lai","Wenbin Li","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2406.04888v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.05939v2","updated":"2024-09-06T14:44:12Z","published":"2024-08-12T06:27:29Z","title":"UniPortrait: A Unified Framework for Identity-Preserving Single- and\n Multi-Human Image Personalization","summary":" This paper presents UniPortrait, an innovative human image personalization\nframework that unifies single- and multi-ID customization with high face\nfidelity, extensive facial editability, free-form input description, and\ndiverse layout generation. UniPortrait consists of only two plug-and-play\nmodules: an ID embedding module and an ID routing module. The ID embedding\nmodule extracts versatile editable facial features with a decoupling strategy\nfor each ID and embeds them into the context space of diffusion models. The ID\nrouting module then combines and distributes these embeddings adaptively to\ntheir respective regions within the synthesized image, achieving the\ncustomization of single and multiple IDs. With a carefully designed two-stage\ntraining scheme, UniPortrait achieves superior performance in both single- and\nmulti-ID customization. Quantitative and qualitative experiments demonstrate\nthe advantages of our method over existing approaches as well as its good\nscalability, e.g., the universal compatibility with existing generative control\ntools. The project page is at\nhttps://aigcdesigngroup.github.io/UniPortrait-Page/ .\n","authors":["Junjie He","Yifeng Geng","Liefeng Bo"],"pdf_url":"https://arxiv.org/pdf/2408.05939v2.pdf","comment":"Tech report; Project page:\n https://aigcdesigngroup.github.io/UniPortrait-Page/"},{"id":"http://arxiv.org/abs/2407.08061v3","updated":"2024-09-06T14:35:57Z","published":"2024-07-10T21:51:50Z","title":"Geospecific View Generation -- Geometry-Context Aware High-resolution\n Ground View Inference from Satellite Views","summary":" Predicting realistic ground views from satellite imagery in urban scenes is a\nchallenging task due to the significant view gaps between satellite and\nground-view images. We propose a novel pipeline to tackle this challenge, by\ngenerating geospecifc views that maximally respect the weak geometry and\ntexture from multi-view satellite images. Different from existing approaches\nthat hallucinate images from cues such as partial semantics or geometry from\noverhead satellite images, our method directly predicts ground-view images at\ngeolocation by using a comprehensive set of information from the satellite\nimage, resulting in ground-level images with a resolution boost at a factor of\nten or more. 
We leverage a novel building refinement method to reduce geometric\ndistortions in satellite data at ground level, which ensures the creation of\naccurate conditions for view synthesis using diffusion networks. Moreover, we\nproposed a novel geospecific prior, which prompts distribution learning of\ndiffusion models to respect image samples that are closer to the geolocation of\nthe predicted images. We demonstrate our pipeline is the first to generate\nclose-to-real and geospecific ground views merely based on satellite images.\n","authors":["Ningli Xu","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2407.08061v3.pdf","comment":"11 figures"},{"id":"http://arxiv.org/abs/2409.04310v1","updated":"2024-09-06T14:35:04Z","published":"2024-09-06T14:35:04Z","title":"Advancing SEM Based Nano-Scale Defect Analysis in Semiconductor\n Manufacturing for Advanced IC Nodes","summary":" In this research, we introduce a unified end-to-end Automated Defect\nClassification-Detection-Segmentation (ADCDS) framework for classifying,\ndetecting, and segmenting multiple instances of semiconductor defects for\nadvanced nodes. This framework consists of two modules: (a) a defect detection\nmodule, followed by (b) a defect segmentation module. The defect detection\nmodule employs Deformable DETR to aid in the classification and detection of\nnano-scale defects, while the segmentation module utilizes BoxSnake. BoxSnake\nfacilitates box-supervised instance segmentation of nano-scale defects,\nsupported by the former module. This simplifies the process by eliminating the\nlaborious requirement for ground-truth pixel-wise mask annotation by human\nexperts, which is typically associated with training conventional segmentation\nmodels. We have evaluated the performance of our ADCDS framework using two\ndistinct process datasets from real wafers, as ADI and AEI, specifically\nfocusing on Line-space patterns. We have demonstrated the applicability and\nsignificance of our proposed methodology, particularly in the nano-scale\nsegmentation and generation of binary defect masks, using the challenging ADI\nSEM dataset where ground-truth pixelwise segmentation annotations were\nunavailable. Furthermore, we have presented a comparative analysis of our\nproposed framework against previous approaches to demonstrate its\neffectiveness. Our proposed framework achieved an overall mAP@IoU0.5 of 72.19\nfor detection and 78.86 for segmentation on the ADI dataset. Similarly, for the\nAEI dataset, these metrics were 90.38 for detection and 95.48 for segmentation.\nThus, our proposed framework effectively fulfils the requirements of advanced\ndefect analysis while addressing significant constraints.\n","authors":["Bappaditya Dey","Matthias Monden","Victor Blanco","Sandip Halder","Stefan De Gendt"],"pdf_url":"https://arxiv.org/pdf/2409.04310v1.pdf","comment":"Accepted in ECCV 2024 2nd workshop on Vision-based InduStrial\n InspectiON (VISION)"},{"id":"http://arxiv.org/abs/2409.04298v1","updated":"2024-09-06T14:17:09Z","published":"2024-09-06T14:17:09Z","title":"FS-MedSAM2: Exploring the Potential of SAM2 for Few-Shot Medical Image\n Segmentation without Fine-tuning","summary":" The Segment Anything Model 2 (SAM2) has recently demonstrated exceptional\nperformance in zero-shot prompt segmentation for natural images and videos.\nHowever, it faces significant challenges when applied to medical images. Since\nits release, many attempts have been made to adapt SAM2's segmentation\ncapabilities to the medical imaging domain. 
These efforts typically involve\nusing a substantial amount of labeled data to fine-tune the model's weights. In\nthis paper, we explore SAM2 from a different perspective, making full use of\nits trained memory attention module and its ability to process mask prompts. We\nintroduce FS-MedSAM2, a simple yet effective framework that enables SAM2 to\nachieve superior medical image segmentation in a few-shot setting, without the\nneed for fine-tuning. Our framework outperforms the current state of the art on\ntwo publicly available medical image datasets. The code is available at\nhttps://github.com/DeepMed-Lab-ECNU/FS_MedSAM2.\n","authors":["Yunhao Bai","Qinji Yu","Boxiang Yun","Dakai Jin","Yingda Xia","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04298v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.03209v2","updated":"2024-09-06T14:15:29Z","published":"2024-09-05T03:07:26Z","title":"iSeg: An Iterative Refinement-based Framework for Training-free\n Segmentation","summary":" Stable diffusion has demonstrated a strong ability to synthesize images from\ngiven text descriptions, suggesting that it contains strong semantic cues for\ngrouping objects. Inspired by this, researchers have explored employing stable\ndiffusion for training-free segmentation. Most existing approaches either\nsimply employ the cross-attention map or refine it with the self-attention map\nto generate segmentation masks. We believe that iterative refinement with the\nself-attention map would lead to better results. However, we empirically\ndemonstrate that such refinement is sub-optimal, likely because the\nself-attention map contains irrelevant global information that hampers\naccurately refining the cross-attention map over multiple iterations. To\naddress this, we propose an iterative refinement framework for training-free\nsegmentation, named iSeg, featuring an entropy-reduced self-attention module\nthat utilizes a gradient descent scheme to reduce the entropy of the\nself-attention map, thereby suppressing the weak responses corresponding to\nirrelevant global information. Leveraging the entropy-reduced self-attention\nmodule, our iSeg stably improves the refined cross-attention map through\niterative refinement. Further, we design a category-enhanced cross-attention\nmodule to generate an accurate cross-attention map, providing a better initial\ninput for iterative refinement. Extensive experiments across different datasets\nand diverse segmentation tasks reveal the merits of the proposed contributions,\nleading to promising performance on diverse segmentation tasks. For\nunsupervised semantic segmentation on Cityscapes, our iSeg achieves an absolute\ngain of 3.8% in terms of mIoU compared to the best existing training-free\napproach in the literature. 
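A rough sketch of the entropy-reduction idea in the iSeg abstract above, namely lowering the entropy of a self-attention map by gradient descent on its logits; the step count, learning rate, and plain softmax parameterization are assumptions, not the paper's exact scheme.

```python
import torch

def reduce_attention_entropy(attn_logits: torch.Tensor, steps: int = 10, lr: float = 0.1):
    # attn_logits: [N, N] raw self-attention scores (pre-softmax)
    logits = attn_logits.clone().detach().requires_grad_(True)
    for _ in range(steps):
        attn = torch.softmax(logits, dim=-1)
        # Shannon entropy of each row of the attention map, averaged over rows
        entropy = -(attn * torch.log(attn + 1e-8)).sum(dim=-1).mean()
        entropy.backward()
        with torch.no_grad():
            logits -= lr * logits.grad      # gradient step that sharpens the map
        logits.grad = None
    return torch.softmax(logits, dim=-1).detach()
```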
Moreover, our proposed iSeg can\nsupport segmentation with different kind of images and interactions.\n","authors":["Lin Sun","Jiale Cao","Jin Xie","Fahad Shahbaz Khan","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2409.03209v2.pdf","comment":"Project Page: https://linsun449.github.io/iSeg/ Code:\n https://github.com/linsun449/iseg.code"},{"id":"http://arxiv.org/abs/2310.12092v2","updated":"2024-09-06T14:06:30Z","published":"2023-10-18T16:37:01Z","title":"HSTR-Net: Reference Based Video Super-resolution with Dual Cameras","summary":" High-spatio-temporal resolution (HSTR) video recording plays a crucial role\nin enhancing various imagery tasks that require fine-detailed information.\nState-of-the-art cameras provide this required high frame-rate and high spatial\nresolution together, albeit at a high cost. To alleviate this issue, this paper\nproposes a dual camera system for the generation of HSTR video using\nreference-based super-resolution (RefSR). One camera captures high spatial\nresolution low frame rate (HSLF) video while the other captures low spatial\nresolution high frame rate (LSHF) video simultaneously for the same scene. A\nnovel deep learning architecture is proposed to fuse HSLF and LSHF video feeds\nand synthesize HSTR video frames. The proposed model combines optical flow\nestimation and (channel-wise and spatial) attention mechanisms to capture the\nfine motion and complex dependencies between frames of the two video feeds.\nSimulations show that the proposed model provides significant improvement over\nexisting reference-based SR techniques in terms of PSNR and SSIM metrics. The\nmethod also exhibits sufficient frames per second (FPS) for aerial monitoring\nwhen deployed on a power-constrained drone equipped with dual cameras.\n","authors":["H. Umut Suluhan","Abdullah Enes Doruk","Hasan F. Ates","Bahadir K. Gunturk"],"pdf_url":"https://arxiv.org/pdf/2310.12092v2.pdf","comment":"15 pages, 8 figures, 8 tables"},{"id":"http://arxiv.org/abs/2212.12741v2","updated":"2024-09-06T13:49:30Z","published":"2022-12-24T14:19:44Z","title":"LMFLOSS: A Hybrid Loss For Imbalanced Medical Image Classification","summary":" With advances in digital technology, the classification of medical images has\nbecome a crucial step for image-based clinical decision support systems.\nAutomatic medical image classification represents a pivotal domain where the\nuse of AI holds the potential to create a significant social impact. However,\nseveral challenges act as obstacles to the development of practical and\neffective solutions. One of these challenges is the prevalent class imbalance\nproblem in most medical imaging datasets. As a result, existing AI techniques,\nparticularly deep-learning-based methodologies, often underperform in such\nscenarios. In this study, we propose a novel framework called Large Margin\naware Focal (LMF) loss to mitigate the class imbalance problem in medical\nimaging. The LMF loss represents a linear combination of two loss functions\noptimized by two hyperparameters. This framework harnesses the distinct\ncharacteristics of both loss functions by enforcing wider margins for minority\nclasses while simultaneously emphasizing challenging samples found in the\ndatasets. We perform rigorous experiments on three neural network architectures\nand with four medical imaging datasets. We provide empirical evidence that our\nproposed framework consistently outperforms other baseline methods, showing an\nimprovement of 2%-9% in macro-f1 scores. 
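A minimal sketch of a hybrid loss in the spirit of the LMF loss described above: a weighted sum of a focal term and a margin-aware term, with the two weights as hyperparameters. The specific margin formula (LDAM-style, derived from class counts) and the default values are assumptions for illustration, not the paper's exact components.

```python
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, gamma=2.0):
    ce = F.cross_entropy(logits, targets, reduction="none")
    pt = torch.exp(-ce)                       # probability assigned to the true class
    return ((1.0 - pt) ** gamma * ce).mean()  # down-weight easy, well-classified samples

def margin_loss(logits, targets, class_counts, scale=30.0, max_margin=0.5):
    # Larger margins for rarer classes, normalized so the largest margin is max_margin.
    margins = 1.0 / class_counts.float() ** 0.25
    margins = max_margin * margins / margins.max()
    adjusted = logits.clone()
    adjusted[torch.arange(logits.size(0)), targets] -= margins[targets]
    return F.cross_entropy(scale * adjusted, targets)

def lmf_style_loss(logits, targets, class_counts, alpha=1.0, beta=1.0):
    # Linear combination of the two terms, weighted by two hyperparameters.
    return alpha * focal_loss(logits, targets) + beta * margin_loss(logits, targets, class_counts)
```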
Through class-wise analysis of f1\nscores, we also demonstrate how the proposed framework can significantly\nimprove performance for minority classes. The results of our experiments show\nthat our proposed framework can perform consistently well across different\narchitectures and datasets. Overall, our study demonstrates a simple and\neffective approach to addressing the class imbalance problem in medical imaging\ndatasets. We hope our work will inspire new research toward a more generalized\napproach to medical image classification.\n","authors":["Abu Adnan Sadi","Labib Chowdhury","Nusrat Jahan","Mohammad Newaz Sharif Rafi","Radeya Chowdhury","Faisal Ahamed Khan","Nabeel Mohammed"],"pdf_url":"https://arxiv.org/pdf/2212.12741v2.pdf","comment":"21 pages, 4 figures, a detailed version of our previous submission\n with additional findings"},{"id":"http://arxiv.org/abs/2409.04272v1","updated":"2024-09-06T13:28:05Z","published":"2024-09-06T13:28:05Z","title":"Cycle Pixel Difference Network for Crisp Edge Detection","summary":" Edge detection, as a fundamental task in computer vision, has garnered\nincreasing attention. The advent of deep learning has significantly advanced\nthis field. However, recent deep learning-based methods which rely on\nlarge-scale pre-trained weights cannot be trained from scratch, with very\nlimited research addressing this issue. This paper proposes a novel cycle pixel\ndifference convolution (CPDC), which effectively integrates image gradient\ninformation with modern convolution operations. Based on the CPDC, we develop a\nU-shape encoder-decoder model named CPD-Net, which is a purely end-to-end\nnetwork. Additionally, to address the issue of edge thickness produced by most\nexisting methods, we construct a multi-scale information enhancement module\n(MSEM) to enhance the discriminative ability of the model, thereby generating\ncrisp and clean contour maps. Comprehensive experiments conducted on three\nstandard benchmarks demonstrate that our method achieves competitive\nperformance on the BSDS500 dataset (ODS=0.813), NYUD-V2 (ODS=0.760), and BIPED\ndataset (ODS=0.898). Our approach provides a novel perspective for addressing\nthese challenges in edge detection.\n","authors":["Changsong Liu","Wei Zhang","Yanyan Liu","Mingyang Li","Wenlin Li","Yimeng Fan","Xiangnan Bai","Liang Zhangd"],"pdf_url":"https://arxiv.org/pdf/2409.04272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12908v2","updated":"2024-09-06T13:25:56Z","published":"2023-12-20T10:45:22Z","title":"A Unified Representation Framework for the Evaluation of Optical Music\n Recognition Systems","summary":" Modern-day Optical Music Recognition (OMR) is a fairly fragmented field. Most\nOMR approaches use datasets that are independent and incompatible between each\nother, making it difficult to both combine them and compare recognition systems\nbuilt upon them. In this paper we identify the need of a common music\nrepresentation language and propose the Music Tree Notation (MTN) format, with\nthe idea to construct a common endpoint for OMR research that allows\ncoordination, reuse of technology and fair evaluation of community efforts.\nThis format represents music as a set of primitives that group together into\nhigher-abstraction nodes, a compromise between the expression of fully\ngraph-based and sequential notation formats. 
We have also developed a specific\nset of OMR metrics and a typeset score dataset as a proof of concept of this\nidea.\n","authors":["Pau Torras","Sanket Biswas","Alicia Fornés"],"pdf_url":"https://arxiv.org/pdf/2312.12908v2.pdf","comment":"18 pages, 4 figures, 3 tables, submitted (under review) for the\n International Journal in Document Analysis and Recognition"},{"id":"http://arxiv.org/abs/2309.15638v2","updated":"2024-09-06T13:21:12Z","published":"2023-09-27T13:14:57Z","title":"RSF-Conv: Rotation-and-Scale Equivariant Fourier Parameterized\n Convolution for Retinal Vessel Segmentation","summary":" Retinal vessel segmentation is of great clinical significance for the\ndiagnosis of many eye-related diseases, but it is still a formidable challenge\ndue to the intricate vascular morphology. With the skillful characterization of\nthe translation symmetry existing in retinal vessels, convolutional neural\nnetworks (CNNs) have achieved great success in retinal vessel segmentation.\nHowever, the rotation-and-scale symmetry, as a more widespread image prior in\nretinal vessels, fails to be characterized by CNNs. Therefore, we propose a\nrotation-and-scale equivariant Fourier parameterized convolution (RSF-Conv)\nspecifically for retinal vessel segmentation, and provide the corresponding\nequivariance analysis. As a general module, RSF-Conv can be integrated into\nexisting networks in a plug-and-play manner while significantly reducing the\nnumber of parameters. For instance, we replace the traditional convolution\nfilters in U-Net and Iter-Net with RSF-Convs, and faithfully conduct\ncomprehensive experiments. RSF-Conv+U-Net and RSF-Conv+Iter-Net not only have\nslight advantages under in-domain evaluation, but more importantly, outperform\nall comparison methods by a significant margin under out-of-domain evaluation.\nIt indicates the remarkable generalization of RSF-Conv, which holds greater\npractical clinical significance for the prevalent cross-device and\ncross-hospital challenges in clinical practice. To comprehensively demonstrate\nthe effectiveness of RSF-Conv, we also apply RSF-Conv+U-Net and\nRSF-Conv+Iter-Net to retinal artery/vein classification and achieve promising\nperformance as well, indicating its clinical application potential.\n","authors":["Zihong Sun","Hong Wang","Qi Xie","Yefeng Zheng","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2309.15638v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00342v2","updated":"2024-09-06T13:00:56Z","published":"2024-08-31T03:53:57Z","title":"AdaNAT: Exploring Adaptive Policy for Token-Based Image Generation","summary":" Recent studies have demonstrated the effectiveness of token-based methods for\nvisual content generation. As a representative work, non-autoregressive\nTransformers (NATs) are able to synthesize images with decent quality in a\nsmall number of steps. However, NATs usually necessitate configuring a\ncomplicated generation policy comprising multiple manually-designed scheduling\nrules. These heuristic-driven rules are prone to sub-optimality and come with\nthe requirements of expert knowledge and labor-intensive efforts. Moreover,\ntheir one-size-fits-all nature cannot flexibly adapt to the diverse\ncharacteristics of each individual sample. To address these issues, we propose\nAdaNAT, a learnable approach that automatically configures a suitable policy\ntailored for every sample to be generated. In specific, we formulate the\ndetermination of generation policies as a Markov decision process. 
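A compact sketch of the policy-learning idea in the AdaNAT abstract above: a small network picks a generation policy per sample and is trained with a policy-gradient (REINFORCE) update using a discriminator-style, adversarial reward. The `generate` and `discriminator` callables, the candidate-policy count, and the one-step formulation are stand-ins, not the paper's implementation.

```python
import torch
import torch.nn as nn

policy = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 8))  # scores 8 candidate policies
optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)

def reinforce_step(sample_features, generate, discriminator):
    # sample_features: [B, 128] conditioning features for the samples to generate
    dist = torch.distributions.Categorical(logits=policy(sample_features))
    actions = dist.sample()                            # chosen generation policy per sample
    with torch.no_grad():
        images = generate(sample_features, actions)    # run the generator under that policy
        reward = discriminator(images)                 # adversarial reward, e.g. a realism score per sample
    advantage = reward - reward.mean()                 # simple baseline to reduce variance
    loss = -(dist.log_prob(actions) * advantage).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```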
Under this\nframework, a lightweight policy network for generation can be learned via\nreinforcement learning. Importantly, we demonstrate that simple reward designs\nsuch as FID or pre-trained reward models, may not reliably guarantee the\ndesired quality or diversity of generated samples. Therefore, we propose an\nadversarial reward design to guide the training of policy networks effectively.\nComprehensive experiments on four benchmark datasets, i.e., ImageNet-256 & 512,\nMS-COCO, and CC3M, validate the effectiveness of AdaNAT. Code and pre-trained\nmodels will be released at https://github.com/LeapLabTHU/AdaNAT.\n","authors":["Zanlin Ni","Yulin Wang","Renping Zhou","Rui Lu","Jiayi Guo","Jinyi Hu","Zhiyuan Liu","Yuan Yao","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2409.00342v2.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2409.04243v1","updated":"2024-09-06T12:49:34Z","published":"2024-09-06T12:49:34Z","title":"Hybrid Cost Volume for Memory-Efficient Optical Flow","summary":" Current state-of-the-art flow methods are mostly based on dense all-pairs\ncost volumes. However, as image resolution increases, the computational and\nspatial complexity of constructing these cost volumes grows at a quartic rate,\nmaking these methods impractical for high-resolution images. In this paper, we\npropose a novel Hybrid Cost Volume for memory-efficient optical flow, named\nHCV. To construct HCV, we first propose a Top-k strategy to separate the 4D\ncost volume into two global 3D cost volumes. These volumes significantly reduce\nmemory usage while retaining a substantial amount of matching information. We\nfurther introduce a local 4D cost volume with a local search space to\nsupplement the local information for HCV. Based on HCV, we design a\nmemory-efficient optical flow network, named HCVFlow. Compared to the recurrent\nflow methods based the all-pairs cost volumes, our HCVFlow significantly\nreduces memory consumption while ensuring high accuracy. We validate the\neffectiveness and efficiency of our method on the Sintel and KITTI datasets and\nreal-world 4K (2160*3840) resolution images. Extensive experiments show that\nour HCVFlow has very low memory usage and outperforms other memory-efficient\nmethods in terms of accuracy. The code is publicly available at\nhttps://github.com/gangweiX/HCVFlow.\n","authors":["Yang Zhao","Gangwei Xu","Gang Wu"],"pdf_url":"https://arxiv.org/pdf/2409.04243v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.04241v1","updated":"2024-09-06T12:46:43Z","published":"2024-09-06T12:46:43Z","title":"Calibration of Network Confidence for Unsupervised Domain Adaptation\n Using Estimated Accuracy","summary":" This study addresses the problem of calibrating network confidence while\nadapting a model that was originally trained on a source domain to a target\ndomain using unlabeled samples from the target domain. The absence of labels\nfrom the target domain makes it impossible to directly calibrate the adapted\nnetwork on the target domain. To tackle this challenge, we introduce a\ncalibration procedure that relies on estimating the network's accuracy on the\ntarget domain. The network accuracy is first computed on the labeled source\ndata and then is modified to represent the actual accuracy of the model on the\ntarget domain. The proposed algorithm calibrates the prediction confidence\ndirectly in the target domain by minimizing the disparity between the estimated\naccuracy and the computed confidence. 
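One hedged way to realize the calibration step described above is to pick a softmax temperature whose average confidence on the unlabeled target data matches the estimated target accuracy; the grid search and the use of temperature scaling are assumptions, not necessarily the paper's exact procedure.

```python
import torch

def calibrate_to_estimated_accuracy(target_logits, estimated_accuracy):
    # target_logits: [N, K] logits on unlabeled target-domain samples
    best_t, best_gap = 1.0, float("inf")
    for t in torch.linspace(0.5, 5.0, steps=200):
        conf = torch.softmax(target_logits / t, dim=1).max(dim=1).values.mean().item()
        gap = abs(conf - estimated_accuracy)   # disparity between confidence and estimated accuracy
        if gap < best_gap:
            best_t, best_gap = float(t), gap
    return best_t   # temperature to divide logits by at prediction time
```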
The experimental results show that our\nmethod significantly outperforms existing methods, which rely on importance\nweighting, across several standard datasets.\n","authors":["Coby Penso","Jacob Goldberger"],"pdf_url":"https://arxiv.org/pdf/2409.04241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04234v1","updated":"2024-09-06T12:40:19Z","published":"2024-09-06T12:40:19Z","title":"UniDet3D: Multi-dataset Indoor 3D Object Detection","summary":" Growing customer demand for smart solutions in robotics and augmented reality\nhas attracted considerable attention to 3D object detection from point clouds.\nYet, existing indoor datasets taken individually are too small and\ninsufficiently diverse to train a powerful and general 3D object detection\nmodel. In the meantime, more general approaches utilizing foundation models are\nstill inferior in quality to those based on supervised training for a specific\ntask. In this work, we propose \\ours{}, a simple yet effective 3D object\ndetection model, which is trained on a mixture of indoor datasets and is\ncapable of working in various indoor environments. By unifying different label\nspaces, \\ours{} enables learning a strong representation across multiple\ndatasets through a supervised joint training scheme. The proposed network\narchitecture is built upon a vanilla transformer encoder, making it easy to\nrun, customize and extend the prediction pipeline for practical use. Extensive\nexperiments demonstrate that \\ours{} obtains significant gains over existing 3D\nobject detection methods in 6 indoor benchmarks: ScanNet (+1.1 mAP50),\nARKitScenes (+19.4 mAP25), S3DIS (+9.1 mAP50), MultiScan (+9.3 mAP50), 3RScan\n(+3.2 mAP50), and ScanNet++ (+2.7 mAP50). Code is available at\nhttps://github.com/filapro/unidet3d .\n","authors":["Maksim Kolodiazhnyi","Anna Vorontsova","Matvey Skripkin","Danila Rukhovich","Anton Konushin"],"pdf_url":"https://arxiv.org/pdf/2409.04234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04218v1","updated":"2024-09-06T12:17:23Z","published":"2024-09-06T12:17:23Z","title":"MpoxMamba: A Grouped Mamba-based Lightweight Hybrid Network for Mpox\n Detection","summary":" Due to the lack of effective mpox detection tools, the mpox virus continues\nto spread worldwide and has once again been declared a public health emergency\nof international concern by the World Health Organization. Deep learning-based\nmpox detection tools are crucial to alleviate mpox outbreak. However, existing\nmethods have difficulty in achieving a good trade-off between detection\nperformance, parameter size, and model complexity, which is crucial for\npractical applications and widespread deployment, especially in\nresource-limited scenarios. Given that the success of Mamba in modeling\nlong-range dependencies and its linear complexity, we proposed a lightweight\nhybrid architecture called MpoxMamba. MpoxMamba utilizes deep separable\nconvolutions to extract local feature representations in mpox skin lesions, and\ngreatly enhances the model's ability to model the global contextual information\nby grouped Mamba modules. Experimental results on two widely recognized mpox\ndatasets demonstrate that MpoxMamba outperforms existing mpox detection methods\nand state-of-the-art lightweight models. We also developed a web-based online\napplication to provide free mpox detection services to the public in the\nepidemic areas (http://5227i971s5.goho.co:30290). 
The source codes of MpoxMamba\nare available at https://github.com/YubiaoYue/MpoxMamba.\n","authors":["Yubiao Yue","Jun Xue","Haihuang Liang","Zhenzhang Li","Yufeng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04214v1","updated":"2024-09-06T12:11:06Z","published":"2024-09-06T12:11:06Z","title":"Diagram Formalization Enhanced Multi-Modal Geometry Problem Solver","summary":" Mathematical reasoning remains an ongoing challenge for AI models, especially\nfor geometry problems that require both linguistic and visual signals. As the\nvision encoders of most MLLMs are trained on natural scenes, they often\nstruggle to understand geometric diagrams, performing no better in geometry\nproblem solving than LLMs that only process text. This limitation is amplified\nby the lack of effective methods for representing geometric relationships. To\naddress these issues, we introduce the Diagram Formalization Enhanced Geometry\nProblem Solver (DFE-GPS), a new framework that integrates visual features,\ngeometric formal language, and natural language representations. We propose a\nnovel synthetic data approach and create a large-scale geometric dataset,\nSynthGeo228K, annotated with both formal and natural language captions,\ndesigned to enhance the vision encoder for a better understanding of geometric\nstructures. Our framework improves MLLMs' ability to process geometric diagrams\nand extends their application to open-ended tasks on the formalgeo7k dataset.\n","authors":["Zeren Zhang","Jo-Ku Cheng","Jingyang Deng","Lu Tian","Jinwen Ma","Ziran Qin","Xiaokai Zhang","Na Zhu","Tuo Leng"],"pdf_url":"https://arxiv.org/pdf/2409.04214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08108v2","updated":"2024-09-06T12:10:50Z","published":"2024-03-12T22:33:02Z","title":"TaskCLIP: Extend Large Vision-Language Model for Task Oriented Object\n Detection","summary":" Task-oriented object detection aims to find objects suitable for\naccomplishing specific tasks. As a challenging task, it requires simultaneous\nvisual data processing and reasoning under ambiguous semantics. Recent\nsolutions are mainly all-in-one models. However, the object detection backbones\nare pre-trained without text supervision. Thus, to incorporate task\nrequirements, their intricate models undergo extensive learning on a highly\nimbalanced and scarce dataset, resulting in capped performance, laborious\ntraining, and poor generalizability. In contrast, we propose TaskCLIP, a more\nnatural two-stage design composed of general object detection and task-guided\nobject selection. Particularly for the latter, we resort to the recently\nsuccessful large Vision-Language Models (VLMs) as our backbone, which provides\nrich semantic knowledge and a uniform embedding space for images and texts.\nNevertheless, the naive application of VLMs leads to sub-optimal quality, due\nto the misalignment between embeddings of object images and their visual\nattributes, which are mainly adjective phrases. To this end, we design a\ntransformer-based aligner after the pre-trained VLMs to re-calibrate both\nembeddings. Finally, we employ a trainable score function to post-process the\nVLM matching results for object selection. 
Experimental results demonstrate\nthat our TaskCLIP outperforms the state-of-the-art DETR-based model TOIST by\n3.5% and only requires a single NVIDIA RTX 4090 for both training and\ninference.\n","authors":["Hanning Chen","Wenjun Huang","Yang Ni","Sanggeon Yun","Yezi Liu","Fei Wen","Alvaro Velasquez","Hugo Latapie","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2403.08108v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06841v3","updated":"2024-09-06T12:09:03Z","published":"2023-04-13T22:20:54Z","title":"Video alignment using unsupervised learning of local and global features","summary":" In this paper, we tackle the problem of video alignment, the process of\nmatching the frames of a pair of videos containing similar actions. The main\nchallenge in video alignment is that accurate correspondence should be\nestablished despite the differences in the execution processes and appearances\nbetween the two videos. We introduce an unsupervised method for alignment that\nuses global and local features of the frames. In particular, we introduce\neffective features for each video frame by means of three machine vision tools:\nperson detection, pose estimation, and VGG network. Then the features are\nprocessed and combined to construct a multidimensional time series that\nrepresent the video. The resulting time series are used to align videos of the\nsame actions using a novel version of dynamic time warping named Diagonalized\nDynamic Time Warping(DDTW). The main advantage of our approach is that no\ntraining is required, which makes it applicable for any new type of action\nwithout any need to collect training samples for it. Additionally, our approach\ncan be used for framewise labeling of action phases in a dataset with only a\nfew labeled videos. For evaluation, we considered video synchronization and\nphase classification tasks on the Penn action and subset of UCF101 datasets.\nAlso, for an effective evaluation of the video synchronization task, we present\na new metric called Enclosed Area Error(EAE). The results show that our method\noutperforms previous state-of-the-art methods, such as TCC, and other\nself-supervised and weakly supervised methods.\n","authors":["Niloufar Fakhfour","Mohammad ShahverdiKondori","Sajjad Hashembeiki","Mohammadjavad Norouzi","Hoda Mohammadzade"],"pdf_url":"https://arxiv.org/pdf/2304.06841v3.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.04208v1","updated":"2024-09-06T11:57:17Z","published":"2024-09-06T11:57:17Z","title":"Learning to Learn Transferable Generative Attack for Person\n Re-Identification","summary":" Deep learning-based person re-identification (re-id) models are widely\nemployed in surveillance systems and inevitably inherit the vulnerability of\ndeep networks to adversarial attacks. Existing attacks merely consider\ncross-dataset and cross-model transferability, ignoring the cross-test\ncapability to perturb models trained in different domains. To powerfully\nexamine the robustness of real-world re-id models, the Meta Transferable\nGenerative Attack (MTGA) method is proposed, which adopts meta-learning\noptimization to promote the generative attacker producing highly transferable\nadversarial examples by learning comprehensively simulated transfer-based\ncross-model\\&dataset\\&test black-box meta attack tasks. Specifically,\ncross-model\\&dataset black-box attack tasks are first mimicked by selecting\ndifferent re-id models and datasets for meta-train and meta-test attack\nprocesses. 
As different models may focus on different feature regions, the\nPerturbation Random Erasing module is further devised to prevent the attacker\nfrom learning to only corrupt model-specific features. To boost the attacker\nlearning to possess cross-test transferability, the Normalization Mix strategy\nis introduced to imitate diverse feature embedding spaces by mixing\nmulti-domain statistics of target models. Extensive experiments show the\nsuperiority of MTGA, especially in cross-model\\&dataset and\ncross-model\\&dataset\\&test attacks, our MTGA outperforms the SOTA methods by\n21.5\\% and 11.3\\% on mean mAP drop rate, respectively. The code of MTGA will be\nreleased after the paper is accepted.\n","authors":["Yuan Bian","Min Liu","Xueping Wang","Yunfeng Ma","Yaonan Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04205v1","updated":"2024-09-06T11:52:42Z","published":"2024-09-06T11:52:42Z","title":"Introducing Gating and Context into Temporal Action Detection","summary":" Temporal Action Detection (TAD), the task of localizing and classifying\nactions in untrimmed video, remains challenging due to action overlaps and\nvariable action durations. Recent findings suggest that TAD performance is\ndependent on the structural design of transformers rather than on the\nself-attention mechanism. Building on this insight, we propose a refined\nfeature extraction process through lightweight, yet effective operations.\nFirst, we employ a local branch that employs parallel convolutions with varying\nwindow sizes to capture both fine-grained and coarse-grained temporal features.\nThis branch incorporates a gating mechanism to select the most relevant\nfeatures. Second, we introduce a context branch that uses boundary frames as\nkey-value pairs to analyze their relationship with the central frame through\ncross-attention. The proposed method captures temporal dependencies and\nimproves contextual understanding. Evaluations of the gating mechanism and\ncontext branch on challenging datasets (THUMOS14 and EPIC-KITCHEN 100) show a\nconsistent improvement over the baseline and existing methods.\n","authors":["Aglind Reka","Diana Laura Borza","Dominick Reilly","Michal Balazia","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2409.04205v1.pdf","comment":"Accepted for publication at the ECCV 2024 ABAW Workshop"},{"id":"http://arxiv.org/abs/2409.04196v1","updated":"2024-09-06T11:34:24Z","published":"2024-09-06T11:34:24Z","title":"GST: Precise 3D Human Body from a Single Image with Gaussian Splatting\n Transformers","summary":" Reconstructing realistic 3D human models from monocular images has\nsignificant applications in creative industries, human-computer interfaces, and\nhealthcare. We base our work on 3D Gaussian Splatting (3DGS), a scene\nrepresentation composed of a mixture of Gaussians. Predicting such mixtures for\na human from a single input image is challenging, as it is a non-uniform\ndensity (with a many-to-one relationship with input pixels) with strict\nphysical constraints. At the same time, it needs to be flexible to accommodate\na variety of clothes and poses. Our key observation is that the vertices of\nstandardized human meshes (such as SMPL) can provide an adequate density and\napproximate initial position for Gaussians. We can then train a transformer\nmodel to jointly predict comparatively small adjustments to these positions, as\nwell as the other Gaussians' attributes and the SMPL parameters. 
We show\nempirically that this combination (using only multi-view supervision) can\nachieve fast inference of 3D human models from a single image without test-time\noptimization, expensive diffusion models, or 3D points supervision. We also\nshow that it can improve 3D pose estimation by better fitting human models that\naccount for clothes and other variations. The code is available on the project\nwebsite https://abdullahamdi.com/gst/ .\n","authors":["Lorenza Prospero","Abdullah Hamdi","Joao F. Henriques","Christian Rupprecht"],"pdf_url":"https://arxiv.org/pdf/2409.04196v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2308.06493v3","updated":"2024-09-06T11:28:04Z","published":"2023-08-12T07:46:50Z","title":"EgoPoser: Robust Real-Time Egocentric Pose Estimation from Sparse and\n Intermittent Observations Everywhere","summary":" Full-body egocentric pose estimation from head and hand poses alone has\nbecome an active area of research to power articulate avatar representations on\nheadset-based platforms. However, existing methods over-rely on the indoor\nmotion-capture spaces in which datasets were recorded, while simultaneously\nassuming continuous joint motion capture and uniform body dimensions. We\npropose EgoPoser to overcome these limitations with four main contributions. 1)\nEgoPoser robustly models body pose from intermittent hand position and\norientation tracking only when inside a headset's field of view. 2) We rethink\ninput representations for headset-based ego-pose estimation and introduce a\nnovel global motion decomposition method that predicts full-body pose\nindependent of global positions. 3) We enhance pose estimation by capturing\nlonger motion time series through an efficient SlowFast module design that\nmaintains computational efficiency. 4) EgoPoser generalizes across various body\nshapes for different users. We experimentally evaluate our method and show that\nit outperforms state-of-the-art methods both qualitatively and quantitatively\nwhile maintaining a high inference speed of over 600fps. EgoPoser establishes a\nrobust baseline for future work where full-body pose estimation no longer needs\nto rely on outside-in capture and can scale to large-scale and unseen\nenvironments.\n","authors":["Jiaxi Jiang","Paul Streli","Manuel Meier","Christian Holz"],"pdf_url":"https://arxiv.org/pdf/2308.06493v3.pdf","comment":"Accepted by ECCV 2024, Code: https://siplab.org/projects/EgoPoser"},{"id":"http://arxiv.org/abs/2408.08632v2","updated":"2024-09-06T11:20:13Z","published":"2024-08-16T09:52:02Z","title":"A Survey on Benchmarks of Multimodal Large Language Models","summary":" Multimodal Large Language Models (MLLMs) are gaining increasing popularity in\nboth academia and industry due to their remarkable performance in various\napplications such as visual question answering, visual perception,\nunderstanding, and reasoning. Over the past few years, significant efforts have\nbeen made to examine MLLMs from multiple perspectives. This paper presents a\ncomprehensive review of 200 benchmarks and evaluations for MLLMs, focusing on\n(1)perception and understanding, (2)cognition and reasoning, (3)specific\ndomains, (4)key capabilities, and (5)other modalities. Finally, we discuss the\nlimitations of the current evaluation methods for MLLMs and explore promising\nfuture directions. Our key argument is that evaluation should be regarded as a\ncrucial discipline to support the development of MLLMs better. 
For more\ndetails, please visit our GitHub repository:\nhttps://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.\n","authors":["Jian Li","Weiheng Lu","Hao Fei","Meng Luo","Ming Dai","Min Xia","Yizhang Jin","Zhenye Gan","Ding Qi","Chaoyou Fu","Ying Tai","Wankou Yang","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2408.08632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09225v2","updated":"2024-09-06T11:15:23Z","published":"2024-02-14T15:09:01Z","title":"Is my Data in your AI Model? Membership Inference Test with Application\n to Face Images","summary":" This article introduces the Membership Inference Test (MINT), a novel\napproach that aims to empirically assess if given data was used during the\ntraining of AI/ML models. Specifically, we propose two MINT architectures\ndesigned to learn the distinct activation patterns that emerge when an Audited\nModel is exposed to data used during its training process. These architectures\nare based on Multilayer Perceptrons (MLPs) and Convolutional Neural Networks\n(CNNs). The experimental framework focuses on the challenging task of Face\nRecognition, considering three state-of-the-art Face Recognition systems.\nExperiments are carried out using six publicly available databases, comprising\nover 22 million face images in total. Different experimental scenarios are\nconsidered depending on the context of the AI model to test. Our proposed MINT\napproach achieves promising results, with up to 90% accuracy, indicating the\npotential to recognize if an AI model has been trained with specific data. The\nproposed MINT approach can serve to enforce privacy and fairness in several AI\napplications, e.g., revealing if sensitive or private data was used for\ntraining or tuning Large Language Models (LLMs).\n","authors":["Daniel DeAlcala","Aythami Morales","Julian Fierrez","Gonzalo Mancera","Ruben Tolosana","Javier Ortega-Garcia"],"pdf_url":"https://arxiv.org/pdf/2402.09225v2.pdf","comment":"12 pages including references and authors"},{"id":"http://arxiv.org/abs/2409.04187v1","updated":"2024-09-06T11:05:12Z","published":"2024-09-06T11:05:12Z","title":"LITE: A Paradigm Shift in Multi-Object Tracking with Efficient ReID\n Feature Integration","summary":" The Lightweight Integrated Tracking-Feature Extraction (LITE) paradigm is\nintroduced as a novel multi-object tracking (MOT) approach. It enhances\nReID-based trackers by eliminating inference, pre-processing, post-processing,\nand ReID model training costs. LITE uses real-time appearance features without\ncompromising speed. By integrating appearance feature extraction directly into\nthe tracking pipeline using standard CNN-based detectors such as YOLOv8m, LITE\ndemonstrates significant performance improvements. The simplest implementation\nof LITE on top of classic DeepSORT achieves a HOTA score of 43.03% at 28.3 FPS\non the MOT17 benchmark, making it twice as fast as DeepSORT on MOT17 and four\ntimes faster on the more crowded MOT20 dataset, while maintaining similar\naccuracy. Additionally, a new evaluation framework for tracking-by-detection\napproaches reveals that conventional trackers like DeepSORT remain competitive\nwith modern state-of-the-art trackers when evaluated under fair conditions. 
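A minimal sketch of the LITE idea described above, reusing a detector's backbone feature map to obtain appearance embeddings for association instead of running a separate ReID model; the feature-map stride, pooling size, and embedding head are assumptions for illustration.

```python
import torch
import torch.nn.functional as F
from torchvision.ops import roi_align

def appearance_embeddings(feature_map, boxes_xyxy, stride=8):
    # feature_map: [1, C, H/stride, W/stride] taken from the detector backbone
    # boxes_xyxy:  [N, 4] float detections in image coordinates
    batch_idx = torch.zeros(len(boxes_xyxy), 1, dtype=boxes_xyxy.dtype)
    rois = torch.cat([batch_idx, boxes_xyxy], dim=1)        # (batch_index, x1, y1, x2, y2)
    crops = roi_align(feature_map, rois, output_size=(7, 7), spatial_scale=1.0 / stride)
    embeddings = crops.mean(dim=(2, 3))                     # [N, C] pooled appearance vectors
    return F.normalize(embeddings, dim=1)                   # ready for cosine-similarity matching
```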
The\ncode will be available post-publication at https://github.com/Jumabek/LITE.\n","authors":["Jumabek Alikhanov","Dilshod Obidov","Hakil Kim"],"pdf_url":"https://arxiv.org/pdf/2409.04187v1.pdf","comment":"15 pages, 6 figures, to be published in ICONIP-2024"},{"id":"http://arxiv.org/abs/2409.04178v1","updated":"2024-09-06T10:43:34Z","published":"2024-09-06T10:43:34Z","title":"Reprojection Errors as Prompts for Efficient Scene Coordinate Regression","summary":" Scene coordinate regression (SCR) methods have emerged as a promising area of\nresearch due to their potential for accurate visual localization. However, many\nexisting SCR approaches train on samples from all image regions, including\ndynamic objects and texture-less areas. Utilizing these areas for optimization\nduring training can potentially hamper the overall performance and efficiency\nof the model. In this study, we first perform an in-depth analysis to validate\nthe adverse impacts of these areas. Drawing inspiration from our analysis, we\nthen introduce an error-guided feature selection (EGFS) mechanism, in tandem\nwith the use of the Segment Anything Model (SAM). This mechanism seeds low\nreprojection areas as prompts and expands them into error-guided masks, and\nthen utilizes these masks to sample points and filter out problematic areas in\nan iterative manner. The experiments demonstrate that our method outperforms\nexisting SCR approaches that do not rely on 3D information on the Cambridge\nLandmarks and Indoor6 datasets.\n","authors":["Ting-Ru Liu","Hsuan-Kung Yang","Jou-Min Liu","Chun-Wei Huang","Tsung-Chih Chiang","Quan Kong","Norimasa Kobori","Chun-Yi Lee"],"pdf_url":"https://arxiv.org/pdf/2409.04178v1.pdf","comment":"ECCV2024"},{"id":"http://arxiv.org/abs/2409.04175v1","updated":"2024-09-06T10:34:06Z","published":"2024-09-06T10:34:06Z","title":"CISCA and CytoDArk0: a Cell Instance Segmentation and Classification\n method for histo(patho)logical image Analyses and a new, open, Nissl-stained\n dataset for brain cytoarchitecture studies","summary":" Delineating and classifying individual cells in microscopy tissue images is a\ncomplex task, yet it is a pivotal endeavor in various medical and biological\ninvestigations. We propose a new deep learning framework (CISCA) for automatic\ncell instance segmentation and classification in histological slices to support\ndetailed morphological and structural analysis or straightforward cell counting\nin digital pathology workflows and brain cytoarchitecture studies. At the core\nof CISCA lies a network architecture featuring a lightweight U-Net with three\nheads in the decoder. The first head classifies pixels into boundaries between\nneighboring cells, cell bodies, and background, while the second head regresses\nfour distance maps along four directions. The network outputs from the first\nand second heads are integrated through a tailored post-processing step, which\nultimately yields the segmentation of individual cells. A third head enables\nsimultaneous classification of cells into relevant classes, if required. We\nshowcase the effectiveness of our method using four datasets, including CoNIC,\nPanNuke, and MoNuSeg, which are publicly available H\\&E datasets. Additionally,\nwe introduce CytoDArk0, a novel dataset consisting of Nissl-stained images of\nthe cortex, cerebellum, and hippocampus from mammals belonging to the orders\nCetartiodactyla and Primates. 
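A rough sketch of the prompt-seeding step described in the "Reprojection Errors as Prompts" abstract above: pixels with low reprojection error are sampled as point prompts for the Segment Anything Model and expanded into a mask. The checkpoint path, threshold, and seed count are assumptions, and the iterative expansion and filtering loop is omitted.

```python
import numpy as np
from segment_anything import sam_model_registry, SamPredictor  # assumes segment-anything is installed

sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b.pth")   # hypothetical local checkpoint
predictor = SamPredictor(sam)

def error_guided_mask(image_rgb, reproj_error, threshold=1.0, num_seeds=16):
    # image_rgb: HxWx3 uint8 image; reproj_error: HxW per-pixel reprojection error
    ys, xs = np.nonzero(reproj_error < threshold)               # reliable, low-error pixels
    keep = np.random.choice(len(xs), size=min(num_seeds, len(xs)), replace=False)
    points = np.stack([xs[keep], ys[keep]], axis=1)             # SAM expects (x, y) coordinates
    predictor.set_image(image_rgb)
    masks, scores, _ = predictor.predict(point_coords=points,
                                         point_labels=np.ones(len(points), dtype=np.int64),
                                         multimask_output=False)
    return masks[0]                                              # error-guided region mask
```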
We evaluate CISCA in comparison to other\nstate-of-the-art methods, demonstrating CISCA's robustness and accuracy in\nsegmenting and classifying cells across diverse tissue types, magnifications,\nand staining techniques.\n","authors":["Valentina Vadori","Jean-Marie Graïc","Antonella Peruffo","Giulia Vadori","Livio Finos","Enrico Grisan"],"pdf_url":"https://arxiv.org/pdf/2409.04175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10126v2","updated":"2024-09-06T10:25:23Z","published":"2024-06-14T15:33:00Z","title":"Training-free Camera Control for Video Generation","summary":" We propose a training-free and robust solution to offer camera movement\ncontrol for off-the-shelf video diffusion models. Unlike previous work, our\nmethod does not require any supervised finetuning on camera-annotated datasets\nor self-supervised training via data augmentation. Instead, it can be plugged\nand played with most pretrained video diffusion models and generate camera\ncontrollable videos with a single image or text prompt as input. The\ninspiration of our work comes from the layout prior that intermediate latents\nhold towards generated results, thus rearranging noisy pixels in them will make\noutput content reallocated as well. As camera move could also be seen as a kind\nof pixel rearrangement caused by perspective change, videos could be\nreorganized following specific camera motion if their noisy latents change\naccordingly. Established on this, we propose our method CamTrol, which enables\nrobust camera control for video diffusion models. It is achieved by a two-stage\nprocess. First, we model image layout rearrangement through explicit camera\nmovement in 3D point cloud space. Second, we generate videos with camera motion\nusing layout prior of noisy latents formed by a series of rearranged images.\nExtensive experiments have demonstrated the robustness our method holds in\ncontrolling camera motion of generated videos. Furthermore, we show that our\nmethod can produce impressive results in generating 3D rotation videos with\ndynamic content. Project page at https://lifedecoder.github.io/CamTrol/.\n","authors":["Chen Hou","Guoqiang Wei","Yan Zeng","Zhibo Chen"],"pdf_url":"https://arxiv.org/pdf/2406.10126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13567v3","updated":"2024-09-06T10:16:17Z","published":"2024-07-18T14:40:33Z","title":"Hyp2Nav: Hyperbolic Planning and Curiosity for Crowd Navigation","summary":" Autonomous robots are increasingly becoming a strong fixture in social\nenvironments. Effective crowd navigation requires not only safe yet fast\nplanning, but should also enable interpretability and computational efficiency\nfor working in real-time on embedded devices. In this work, we advocate for\nhyperbolic learning to enable crowd navigation and we introduce Hyp2Nav.\nDifferent from conventional reinforcement learning-based crowd navigation\nmethods, Hyp2Nav leverages the intrinsic properties of hyperbolic geometry to\nbetter encode the hierarchical nature of decision-making processes in\nnavigation tasks. We propose a hyperbolic policy model and a hyperbolic\ncuriosity module that results in effective social navigation, best success\nrates, and returns across multiple simulation settings, using up to 6 times\nfewer parameters than competitor state-of-the-art models. With our approach, it\nbecomes even possible to obtain policies that work in 2-dimensional embedding\nspaces, opening up new possibilities for low-resource crowd navigation and\nmodel interpretability. 
Insightfully, the internal hyperbolic representation of\nHyp2Nav correlates with how much attention the robot pays to the surrounding\ncrowds, e.g. due to multiple people occluding its pathway or to a few of them\nshowing colliding plans, rather than to its own planned route. The code is\navailable at https://github.com/GDam90/hyp2nav.\n","authors":["Guido Maria D'Amely di Melendugno","Alessandro Flaborea","Pascal Mettes","Fabio Galasso"],"pdf_url":"https://arxiv.org/pdf/2407.13567v3.pdf","comment":"Accepted as oral at IROS 2024"},{"id":"http://arxiv.org/abs/2403.08498v2","updated":"2024-09-06T10:06:46Z","published":"2024-03-13T13:06:31Z","title":"Gaussian Splatting in Style","summary":" 3D scene stylization extends the work of neural style transfer to 3D. A vital\nchallenge in this problem is to maintain the uniformity of the stylized\nappearance across multiple views. A vast majority of the previous works achieve\nthis by training a 3D model for every stylized image and a set of multi-view\nimages. In contrast, we propose a novel architecture trained on a collection of\nstyle images that, at test time, produces real time high-quality stylized novel\nviews. We choose the underlying 3D scene representation for our model as 3D\nGaussian splatting. We take the 3D Gaussians and process them using a\nmulti-resolution hash grid and a tiny MLP to obtain stylized views. The MLP is\nconditioned on different style codes for generalization to different styles\nduring test time. The explicit nature of 3D Gaussians gives us inherent\nadvantages over NeRF-based methods, including geometric consistency and a fast\ntraining and rendering regime. This enables our method to be useful for various\npractical use cases, such as augmented or virtual reality. We demonstrate that\nour method achieves state-of-the-art performance with superior visual quality\non various indoor and outdoor real-world data.\n","authors":["Abhishek Saroha","Mariia Gladkova","Cecilia Curreli","Dominik Muhle","Tarun Yenamandra","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2403.08498v2.pdf","comment":"GCPR 2024"},{"id":"http://arxiv.org/abs/2408.09126v3","updated":"2024-09-06T09:27:39Z","published":"2024-08-17T07:27:14Z","title":"Barbie: Text to Barbie-Style 3D Avatars","summary":" Recent advances in text-guided 3D avatar generation have made substantial\nprogress by distilling knowledge from diffusion models. Despite the plausible\ngenerated appearance, existing methods cannot achieve fine-grained\ndisentanglement or high-fidelity modeling between inner body and outfit. In\nthis paper, we propose Barbie, a novel framework for generating 3D avatars that\ncan be dressed in diverse and high-quality Barbie-like garments and\naccessories. Instead of relying on a holistic model, Barbie achieves\nfine-grained disentanglement on avatars by semantic-aligned separated models\nfor human body and outfits. These disentangled 3D representations are then\noptimized by different expert models to guarantee the domain-specific fidelity.\nTo balance geometry diversity and reasonableness, we propose a series of losses\nfor template-preserving and human-prior evolving. The final avatar is enhanced\nby unified texture refinement for superior texture consistency. Extensive\nexperiments demonstrate that Barbie outperforms existing methods in both\ndressed human and outfit generation, supporting flexible apparel combination\nand animation. The code will be released for research purposes. 
Our project\npage is: https://xiaokunsun.github.io/Barbie.github.io/.\n","authors":["Xiaokun Sun","Zhenyu Zhang","Ying Tai","Qian Wang","Hao Tang","Zili Yi","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2408.09126v3.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2301.06084v2","updated":"2024-09-06T09:22:17Z","published":"2022-12-16T09:00:42Z","title":"Scattering-induced entropy boost for highly-compressed optical sensing\n and encryption","summary":" Image sensing often relies on a high-quality machine vision system with a\nlarge field of view and high resolution. It requires fine imaging optics, has\nhigh computational costs, and requires a large communication bandwidth between\nimage sensors and computing units. In this paper, we propose a novel image-free\nsensing framework for resource-efficient image classification, where the\nrequired number of measurements can be reduced by up to two orders of\nmagnitude. In the proposed framework for single-pixel detection, the optical\nfield for a target is first scattered by an optical diffuser and then\ntwo-dimensionally modulated by a spatial light modulator. The optical diffuser\nsimultaneously serves as a compressor and an encryptor for the target\ninformation, effectively narrowing the field of view and improving the system's\nsecurity. The one-dimensional sequence of intensity values, which is measured\nwith time-varying patterns on the spatial light modulator, is then used to\nextract semantic information based on end-to-end deep learning. The proposed\nsensing framework is shown to obtain over a 95\\% accuracy at sampling rates of\n1% and 5% for classification on the MNIST dataset and the recognition of\nChinese license plates, respectively, and the framework is up to 24% more\nefficient than the approach without an optical diffuser. The proposed framework\nrepresents a significant breakthrough in high-throughput machine intelligence\nfor scene analysis with low bandwidth, low costs, and strong encryption.\n","authors":["Xinrui Zhan","Xuyang Chang","Daoyu Li","Rong Yan","Yinuo Zhang","Liheng Bian"],"pdf_url":"https://arxiv.org/pdf/2301.06084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04137v1","updated":"2024-09-06T09:04:02Z","published":"2024-09-06T09:04:02Z","title":"Optical Coherence Tomography Angiography-OCTA dataset for the study of\n Diabetic Retinopathy","summary":" This study presents a dataset consisting of 268 retinal images from 179\nindividuals, including 133 left-eye and 135 right-eye images, collected from\nNatasha Eye Care and Research Institute in Pune, Maharashtra, India. The images\nwere captured using a nonmydriatic Optical Coherence Tomography Angiography\n(OCTA) device, specifically the Optovue Avanti Edition machine as per the\nprotocol mentioned in this paper. Two ophthalmologists then annotated the\nimages. This dataset can be used by researchers and doctors to develop\nautomated diagnostic tools for early detection of diabetic retinopathy (DR).\n","authors":["Pooja Bidwai","Shilpa Gite","Biswajeet Pradhan","Aditi Gupta","Kishore pahuja"],"pdf_url":"https://arxiv.org/pdf/2409.04137v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2409.04133v1","updated":"2024-09-06T08:58:21Z","published":"2024-09-06T08:58:21Z","title":"Secure Traffic Sign Recognition: An Attention-Enabled Universal Image\n Inpainting Mechanism against Light Patch Attacks","summary":" Traffic sign recognition systems play a crucial role in assisting drivers to\nmake informed decisions while driving. 
However, due to the heavy reliance on\ndeep learning technologies, particularly for future connected and autonomous\ndriving, these systems are susceptible to adversarial attacks that pose\nsignificant safety risks to both personal and public transportation. Notably,\nresearchers recently identified a new attack vector to deceive sign recognition\nsystems: projecting well-designed adversarial light patches onto traffic signs.\nIn comparison with traditional adversarial stickers or graffiti, these emerging\nlight patches exhibit heightened aggression due to their ease of implementation\nand outstanding stealthiness. To effectively counter this security threat, we\npropose a universal image inpainting mechanism, namely, SafeSign. It relies on\nattention-enabled multi-view image fusion to repair traffic signs contaminated\nby adversarial light patches, thereby ensuring accurate sign recognition.\nHere, we initially explore the fundamental impact of malicious light patches on\nthe local and global feature spaces of authentic traffic signs. Then, we design\na binary mask-based U-Net image generation pipeline outputting diverse\ncontaminated sign patterns, to provide our image inpainting model with needed\ntraining data. Following this, we develop an attention mechanism-enabled neural\nnetwork to jointly utilize the complementary information from multi-view images\nto repair contaminated signs. Finally, extensive experiments are conducted to\nevaluate SafeSign's effectiveness in resisting potential light patch-based\nattacks, bringing an average accuracy improvement of 54.8% in three widely-used\nsign recognition models.\n","authors":["Hangcheng Cao","Longzhi Yuan","Guowen Xu","Ziyang He","Zhengru Fang","Yuguang Fang"],"pdf_url":"https://arxiv.org/pdf/2409.04133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16410v2","updated":"2024-09-06T08:54:50Z","published":"2023-12-27T04:47:03Z","title":"Segment Change Model (SCM) for Unsupervised Change detection in VHR\n Remote Sensing Images: a Case Study of Buildings","summary":" The field of Remote Sensing (RS) widely employs Change Detection (CD) on\nvery-high-resolution (VHR) images. A majority of extant deep-learning-based\nmethods hinge on annotated samples to complete the CD process. Recently, the\nemergence of Vision Foundation Model (VFM) enables zero-shot predictions in\nparticular vision tasks. In this work, we propose an unsupervised CD method\nnamed Segment Change Model (SCM), built upon the Segment Anything Model (SAM)\nand Contrastive Language-Image Pre-training (CLIP). Our method recalibrates\nfeatures extracted at different scales and integrates them in a top-down manner\nto enhance discriminative change edges. We further design an innovative\nPiecewise Semantic Attention (PSA) scheme, which can offer semantic\nrepresentation without training, thereby minimizing the pseudo-change phenomenon.\nThrough conducting experiments on two public datasets, the proposed SCM\nincreases the mIoU from 46.09% to 53.67% on the LEVIR-CD dataset, and from\n47.56% to 52.14% on the WHU-CD dataset. 
Our codes are available at\nhttps://github.com/StephenApX/UCD-SCM.\n","authors":["Xiaoliang Tan","Guanzhou Chen","Tong Wang","Jiaqi Wang","Xiaodong Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.16410v2.pdf","comment":"Published in: IGARSS 2024 - 2024 IEEE International Geoscience and\n Remote Sensing Symposium"},{"id":"http://arxiv.org/abs/2409.04117v1","updated":"2024-09-06T08:35:28Z","published":"2024-09-06T08:35:28Z","title":"Confidence-Aware Document OCR Error Detection","summary":" Optical Character Recognition (OCR) continues to face accuracy challenges\nthat impact subsequent applications. To address these errors, we explore the\nutility of OCR confidence scores for enhancing post-OCR error detection. Our\nstudy involves analyzing the correlation between confidence scores and error\nrates across different OCR systems. We develop ConfBERT, a BERT-based model\nthat incorporates OCR confidence scores into token embeddings and offers an\noptional pre-training phase for noise adjustment. Our experimental results\ndemonstrate that integrating OCR confidence scores can enhance error detection\ncapabilities. This work underscores the importance of OCR confidence scores in\nimproving detection accuracy and reveals substantial disparities in performance\nbetween commercial and open-source OCR technologies.\n","authors":["Arthur Hemmer","Mickaël Coustaty","Nicola Bartolo","Jean-Marc Ogier"],"pdf_url":"https://arxiv.org/pdf/2409.04117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04116v1","updated":"2024-09-06T08:33:26Z","published":"2024-09-06T08:33:26Z","title":"Smooth-edged Perturbations Improve Perturbation-based Image Explanations","summary":" Perturbation-based post-hoc image explanation methods are commonly used to\nexplain image prediction models by perturbing parts of the input to measure how\nthose parts affect the output. Due to the intractability of perturbing each\npixel individually, images are typically attributed to larger segments. The\nRandomized Input Sampling for Explanations (RISE) method solved this issue by\nusing smooth perturbation masks.\n While this method has proven effective and popular, it has not been\ninvestigated which parts of the method are responsible for its success. This\nwork tests many combinations of mask sampling, segmentation techniques,\nsmoothing, and attribution calculation. The results show that the RISE-style\npixel attribution is beneficial to all evaluated methods. Furthermore, it is\nshown that attribution calculation is the least impactful parameter.\n The implementation of this work is available online:\nhttps://github.com/guspih/post-hoc-image-perturbation.\n","authors":["Gustav Grund Pihlgren","Kary Främling"],"pdf_url":"https://arxiv.org/pdf/2409.04116v1.pdf","comment":"This manuscript have been submitted to NLDL 2025"},{"id":"http://arxiv.org/abs/2409.04104v1","updated":"2024-09-06T08:14:58Z","published":"2024-09-06T08:14:58Z","title":"MixNet: Joining Force of Classical and Modern Approaches Toward the\n Comprehensive Pipeline in Motor Imagery EEG Classification","summary":" Recent advances in deep learning (DL) have significantly impacted motor\nimagery (MI)-based brain-computer interface (BCI) systems, enhancing the\ndecoding of electroencephalography (EEG) signals. However, most studies\nstruggle to identify discriminative patterns across subjects during MI tasks,\nlimiting MI classification performance. 
In this article, we propose MixNet, a\nnovel classification framework designed to overcome this limitation by\nutilizing spectral-spatial signals from MI data, along with a multitask\nlearning architecture named MIN2Net, for classification. Here, the\nspectral-spatial signals are generated using the filter-bank common spatial\npatterns (FBCSPs) method on MI data. Since the multitask learning architecture\nis used for the classification task, the learning in each task may exhibit\ndifferent generalization rates and potential overfitting across tasks. To\naddress this issue, we implement adaptive gradient blending, simultaneously\nregulating multiple loss weights and adjusting the learning pace for each task\nbased on its generalization/overfitting tendencies. Experimental results on six\nbenchmark data sets of different data sizes demonstrate that MixNet\nconsistently outperforms all state-of-the-art algorithms in subject-dependent\nand -independent settings. Finally, the low-density EEG MI classification\nresults show that MixNet outperforms all state-of-the-art algorithms, offering\npromising implications for Internet of Things (IoT) applications, such as\nlightweight and portable EEG wearable devices based on low-density montages.\n","authors":["Phairot Autthasan","Rattanaphon Chaisaen","Huy Phan","Maarten De Vos","Theerawit Wilaiprasitporn"],"pdf_url":"https://arxiv.org/pdf/2409.04104v1.pdf","comment":"Supplementary materials and source codes are available on-line at\n https://github.com/Max-Phairot-A/MixNet"},{"id":"http://arxiv.org/abs/2409.04095v1","updated":"2024-09-06T08:02:43Z","published":"2024-09-06T08:02:43Z","title":"UNIT: Unifying Image and Text Recognition in One Vision Encoder","summary":" Currently, vision encoder models like Vision Transformers (ViTs) typically\nexcel at image recognition tasks but cannot simultaneously support text\nrecognition like human visual recognition. To address this limitation, we\npropose UNIT, a novel training framework aimed at UNifying Image and Text\nrecognition within a single model. Starting with a vision encoder pre-trained\nwith image recognition tasks, UNIT introduces a lightweight language decoder\nfor predicting text outputs and a lightweight vision decoder to prevent\ncatastrophic forgetting of the original image encoding capabilities. The\ntraining process comprises two stages: intra-scale pretraining and inter-scale\nfinetuning. During intra-scale pretraining, UNIT learns unified representations\nfrom multi-scale inputs, where images and documents are at their commonly used\nresolution, to enable fundamental recognition capability. In the inter-scale\nfinetuning stage, the model introduces scale-exchanged data, featuring images\nand documents at resolutions different from the most commonly used ones, to\nenhance its scale robustness. 
Notably, UNIT retains the original vision encoder\narchitecture, making it cost-free in terms of inference and deployment.\nExperiments across multiple benchmarks confirm that our method significantly\noutperforms existing methods on document-related tasks (e.g., OCR and DocQA)\nwhile maintaining performance on natural images, demonstrating its ability\nto substantially enhance text recognition without compromising its core image\nrecognition capabilities.\n","authors":["Yi Zhu","Yanpeng Zhou","Chunwei Wang","Yang Cao","Jianhua Han","Lu Hou","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2409.04095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11625v2","updated":"2024-09-06T07:56:21Z","published":"2024-07-16T11:41:24Z","title":"Beware of Validation by Eye: Visual Validation of Linear Trends in\n Scatterplots","summary":" Visual validation of regression models in scatterplots is a common practice\nfor assessing model quality, yet its efficacy remains unquantified. We\nconducted two empirical experiments to investigate individuals' ability to\nvisually validate linear regression models (linear trends) and to examine the\nimpact of common visualization designs on validation quality. The first\nexperiment showed that the level of accuracy for visual estimation of slope\n(i.e., fitting a line to data) is higher than for visual validation of slope\n(i.e., accepting a shown line). Notably, we found bias toward slopes that are\n\"too steep\" in both cases. This led to the novel insight that participants\nnaturally assessed regression with orthogonal distances between the points and\nthe line (i.e., ODR regression) rather than the common vertical distances (OLS\nregression). In the second experiment, we investigated whether incorporating\ncommon designs for regression visualization (error lines, bounding boxes, and\nconfidence intervals) would improve visual validation. Even though error lines\nreduced validation bias, results failed to show the desired improvements in\naccuracy for any design. Overall, our findings suggest caution in using visual\nmodel validation for linear trends in scatterplots.\n","authors":["Daniel Braun","Remco Chang","Michael Gleicher","Tatiana von Landesberger"],"pdf_url":"https://arxiv.org/pdf/2407.11625v2.pdf","comment":"Preprint and Author Version of a Full Paper, accepted to the 2024\n IEEE Visualization Conference (VIS)"},{"id":"http://arxiv.org/abs/2409.04086v1","updated":"2024-09-06T07:55:24Z","published":"2024-09-06T07:55:24Z","title":"Introducing a Class-Aware Metric for Monocular Depth Estimation: An\n Automotive Perspective","summary":" The increasing accuracy reports of metric monocular depth estimation models\nlead to a growing interest from the automotive domain. Current model\nevaluations do not provide deeper insights into the models' performance, also\nin relation to safety-critical or unseen classes. Within this paper, we present\na novel approach for the evaluation of depth estimation models. Our proposed\nmetric leverages three components, a class-wise component, an edge and corner\nimage feature component, and a global consistency retaining component. Classes\nare further weighted on their distance in the scene and on criticality for\nautomotive applications. In the evaluation, we present the benefits of our\nmetric through comparison to classical metrics, class-wise analytics, and the\nretrieval of critical situations. 
The results show that our metric provides\ndeeper insights into model results while fulfilling safety-critical\nrequirements. We release the code and weights on the following repository:\n\\href{https://github.com/leisemann/ca_mmde}\n","authors":["Tim Bader","Leon Eisemann","Adrian Pogorzelski","Namrata Jangid","Attila-Balazs Kis"],"pdf_url":"https://arxiv.org/pdf/2409.04086v1.pdf","comment":"Accepted at the European Conference on Computer Vision (ECCV) 2024\n Workshop on Out Of Distribution Generalization in Computer Vision"},{"id":"http://arxiv.org/abs/2409.04082v1","updated":"2024-09-06T07:48:18Z","published":"2024-09-06T07:48:18Z","title":"SDformerFlow: Spatiotemporal swin spikeformer for event-based optical\n flow estimation","summary":" Event cameras generate asynchronous and sparse event streams capturing\nchanges in light intensity. They offer significant advantages over conventional\nframe-based cameras, such as a higher dynamic range and an extremely faster\ndata rate, making them particularly useful in scenarios involving fast motion\nor challenging lighting conditions. Spiking neural networks (SNNs) share\nsimilar asynchronous and sparse characteristics and are well-suited for\nprocessing data from event cameras. Inspired by the potential of transformers\nand spike-driven transformers (spikeformers) in other computer vision tasks, we\npropose two solutions for fast and robust optical flow estimation for event\ncameras: STTFlowNet and SDformerFlow. STTFlowNet adopts a U-shaped artificial\nneural network (ANN) architecture with spatiotemporal shifted window\nself-attention (swin) transformer encoders, while SDformerFlow presents its\nfully spiking counterpart, incorporating swin spikeformer encoders.\nFurthermore, we present two variants of the spiking version with different\nneuron models. Our work is the first to make use of spikeformers for dense\noptical flow estimation. We conduct end-to-end training for all models using\nsupervised learning. Our results yield state-of-the-art performance among\nSNN-based event optical flow methods on both the DSEC and MVSEC datasets, and\nshow significant reduction in power consumption compared to the equivalent\nANNs.\n","authors":["Yi Tian","Juan Andrade-Cetto"],"pdf_url":"https://arxiv.org/pdf/2409.04082v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2408.02049v3","updated":"2024-09-06T07:48:03Z","published":"2024-08-04T14:57:28Z","title":"3D Single-object Tracking in Point Clouds with High Temporal Variation","summary":" The high temporal variation of the point clouds is the key challenge of 3D\nsingle-object tracking (3D SOT). Existing approaches rely on the assumption\nthat the shape variation of the point clouds and the motion of the objects\nacross neighboring frames are smooth, failing to cope with high temporal\nvariation data. In this paper, we present a novel framework for 3D SOT in point\nclouds with high temporal variation, called HVTrack. HVTrack proposes three\nnovel components to tackle the challenges in the high temporal variation\nscenario: 1) A Relative-Pose-Aware Memory module to handle temporal point cloud\nshape variations; 2) a Base-Expansion Feature Cross-Attention module to deal\nwith similar object distractions in expanded search areas; 3) a Contextual\nPoint Guided Self-Attention module for suppressing heavy background noise. We\nconstruct a dataset with high temporal variation (KITTI-HV) by setting\ndifferent frame intervals for sampling in the KITTI dataset. 
On the KITTI-HV\nwith 5 frame intervals, our HVTrack surpasses the state-of-the-art tracker\nCXTracker by 11.3%/15.7% in Success/Precision.\n","authors":["Qiao Wu","Kun Sun","Pei An","Mathieu Salzmann","Yanning Zhang","Jiaqi Yang"],"pdf_url":"https://arxiv.org/pdf/2408.02049v3.pdf","comment":"Accepted by ECCV24"},{"id":"http://arxiv.org/abs/2312.00826v3","updated":"2024-09-06T07:42:55Z","published":"2023-11-30T18:58:44Z","title":"DEVIAS: Learning Disentangled Video Representations of Action and Scene","summary":" Video recognition models often learn scene-biased action representation due\nto the spurious correlation between actions and scenes in the training data.\nSuch models show poor performance when the test data consists of videos with\nunseen action-scene combinations. Although scene-debiased action recognition\nmodels might address the issue, they often overlook valuable scene information\nin the data. To address this challenge, we propose to learn DisEntangled VIdeo\nrepresentations of Action and Scene (DEVIAS), for more holistic video\nunderstanding. We propose an encoder-decoder architecture to learn disentangled\naction and scene representations with a single model. The architecture consists\nof a disentangling encoder (DE), an action mask decoder (AMD), and a prediction\nhead. The key to achieving the disentanglement is employing both DE and AMD\nduring training time. The DE uses the slot attention mechanism to learn\ndisentangled action and scene representations. For further disentanglement, an\nAMD learns to predict action masks, given an action slot. With the resulting\ndisentangled representations, we can achieve robust performance across diverse\nscenarios, including both seen and unseen action-scene combinations. We\nrigorously validate the proposed method on the UCF-101, Kinetics-400, and HVU\ndatasets for the seen, and the SCUBA, HAT, and HVU datasets for unseen\naction-scene combination scenarios. Furthermore, DEVIAS provides flexibility to\nadjust the emphasis on action or scene information depending on dataset\ncharacteristics for downstream tasks. DEVIAS shows favorable performance in\nvarious downstream tasks: Diving48, Something-Something-V2, UCF-101, and\nActivityNet. The code is available at https://github.com/KHU-VLL/DEVIAS.\n","authors":["Kyungho Bae","Geo Ahn","Youngrae Kim","Jinwoo Choi"],"pdf_url":"https://arxiv.org/pdf/2312.00826v3.pdf","comment":"Accepted to ECCV 2024 (Oral). Project page :\n https://khu-vll.github.io/DEVIAS/"},{"id":"http://arxiv.org/abs/2311.13110v4","updated":"2024-09-06T07:40:40Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. 
From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v4.pdf","comment":"Accepted at Journal of Machine Learning Research. This paper\n integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete\n story. In this paper, we improve the writing and organization, and also add\n conceptual, empirical, and theoretical improvements over the previous work.\n V2: small typo fixes/formatting improvements. V3: improvements from journal\n revisions. V4: fix figures"},{"id":"http://arxiv.org/abs/2408.09928v2","updated":"2024-09-06T07:20:10Z","published":"2024-08-19T12:07:24Z","title":"DiscoNeRF: Class-Agnostic Object Field for 3D Object Discovery","summary":" Neural Radiance Fields (NeRFs) have become a powerful tool for modeling 3D\nscenes from multiple images. However, NeRFs remain difficult to segment into\nsemantically meaningful regions. Previous approaches to 3D segmentation of\nNeRFs either require user interaction to isolate a single object, or they rely\non 2D semantic masks with a limited number of classes for supervision. As a\nconsequence, they generalize poorly to class-agnostic masks automatically\ngenerated in real scenes. This is attributable to the ambiguity arising from\nzero-shot segmentation, yielding inconsistent masks across views. In contrast,\nwe propose a method that is robust to inconsistent segmentations and\nsuccessfully decomposes the scene into a set of objects of any class. By\nintroducing a limited number of competing object slots against which masks are\nmatched, a meaningful object representation emerges that best explains the 2D\nsupervision and minimizes an additional regularization term. 
Our experiments\ndemonstrate the ability of our method to generate 3D panoptic segmentations on\ncomplex scenes, and extract high-quality 3D assets from NeRFs that can then be\nused in virtual 3D environments.\n","authors":["Corentin Dumery","Aoxiang Fan","Ren Li","Nicolas Talabot","Pascal Fua"],"pdf_url":"https://arxiv.org/pdf/2408.09928v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04068v1","updated":"2024-09-06T07:19:26Z","published":"2024-09-06T07:19:26Z","title":"Site-Specific Color Features of Green Coffee Beans","summary":" Coffee is one of the most valuable primary commodities. Despite this, the\ncommon selection technique of green coffee beans relies on personnel visual\ninspection, which is labor-intensive and subjective. Therefore, an efficient\nway to evaluate the quality of beans is needed. In this paper, we demonstrate a\nsite-independent approach to find site-specific color features of the seed coat\nin qualified green coffee beans. We then propose two evaluation schemes for\ngreen coffee beans based on this site-specific color feature of qualified\nbeans. Due to the site-specific properties of these color features, machine\nlearning classifiers indicate that compared with the existing evaluation\nschemes of beans, our evaluation schemes have the advantages of being simple,\nhaving less computational costs, and having universal applicability. Finally,\nthis site-specific color feature can distinguish qualified beans from different\ngrowing sites. Moreover, this function can prevent cheating in the coffee\nbusiness and is unique to our evaluation scheme of beans.\n","authors":["Shu-Min Tan","Shih-Hsun Hung","Je-Chiang Tsai"],"pdf_url":"https://arxiv.org/pdf/2409.04068v1.pdf","comment":"21 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.04060v1","updated":"2024-09-06T07:04:27Z","published":"2024-09-06T07:04:27Z","title":"D4: Text-guided diffusion model-based domain adaptive data augmentation\n for vineyard shoot detection","summary":" In an agricultural field, plant phenotyping using object detection models is\ngaining attention. However, collecting the training data necessary to create\ngeneric and high-precision models is extremely challenging due to the\ndifficulty of annotation and the diversity of domains. Furthermore, it is\ndifficult to transfer training data across different crops, and although\nmachine learning models effective for specific environments, conditions, or\ncrops have been developed, they cannot be widely applied in actual fields. In\nthis study, we propose a generative data augmentation method (D4) for vineyard\nshoot detection. D4 uses a pre-trained text-guided diffusion model based on a\nlarge number of original images culled from video data collected by unmanned\nground vehicles or other means, and a small number of annotated datasets. The\nproposed method generates new annotated images with background information\nadapted to the target domain while retaining annotation information necessary\nfor object detection. In addition, D4 overcomes the lack of training data in\nagriculture, including the difficulty of annotation and diversity of domains.\nWe confirmed that this generative data augmentation method improved the mean\naverage precision by up to 28.65% for the BBox detection task and the average\nprecision by up to 13.73% for the keypoint detection task for vineyard shoot\ndetection. 
Our generative data augmentation method D4 is expected to\nsimultaneously solve the cost and domain diversity issues of training data\ngeneration in agriculture and improve the generalization performance of\ndetection models.\n","authors":["Kentaro Hirahara","Chikahito Nakane","Hajime Ebisawa","Tsuyoshi Kuroda","Yohei Iwaki","Tomoyoshi Utsumi","Yuichiro Nomura","Makoto Koike","Hiroshi Mineno"],"pdf_url":"https://arxiv.org/pdf/2409.04060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03431v2","updated":"2024-09-06T06:57:59Z","published":"2024-09-05T11:23:41Z","title":"UV-Mamba: A DCN-Enhanced State Space Model for Urban Village Boundary\n Identification in High-Resolution Remote Sensing Images","summary":" Owing to the diverse geographical environments, intricate landscapes, and\nhigh-density settlements, the automatic identification of urban village\nboundaries using remote sensing images is a highly challenging task. This paper\nproposes a novel and efficient neural network model called UV-Mamba for\naccurate boundary detection in high-resolution remote sensing images. UV-Mamba\nmitigates the memory loss problem in long sequence modeling, which arises in\nstate space model (SSM) with increasing image size, by incorporating deformable\nconvolutions (DCN). Its architecture utilizes an encoder-decoder framework,\nincludes an encoder with four deformable state space augmentation (DSSA) blocks\nfor efficient multi-level semantic extraction and a decoder to integrate the\nextracted semantic information. We conducted experiments on the Beijing and\nXi'an datasets, and the results show that UV-Mamba achieves state-of-the-art\nperformance. Specifically, our model achieves 73.3% and 78.1% IoU on the\nBeijing and Xi'an datasets, respectively, representing improvements of 1.2% and\n3.4% IoU over the previous best model, while also being 6x faster in inference\nspeed and 40x smaller in parameter count. Source code and pre-trained models\nare available in the supplementary material.\n","authors":["Lulin Li","Ben Chen","Xuechao Zou","Junliang Xing","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2409.03431v2.pdf","comment":"5 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2409.04053v1","updated":"2024-09-06T06:49:55Z","published":"2024-09-06T06:49:55Z","title":"COLUMBUS: Evaluating COgnitive Lateral Understanding through\n Multiple-choice reBUSes","summary":" While visual question-answering (VQA) benchmarks have catalyzed the\ndevelopment of reasoning techniques, they have focused on vertical thinking.\nEffective problem-solving also necessitates lateral thinking, which remains\nunderstudied in AI and has not been used to test visual perception systems. To\nbridge this gap, we formulate visual lateral thinking as a multiple-choice\nquestion-answering task and describe a three-step taxonomy-driven methodology\nfor instantiating task examples. Then, we develop COLUMBUS, a synthetic\nbenchmark that applies the task pipeline to create QA sets with text and icon\nrebus puzzles based on publicly available collections of compounds and common\nphrases. COLUMBUS comprises over 1,000 puzzles, each with four answer\ncandidates. While the SotA vision-language models (VLMs) achieve decent\nperformance, our evaluation demonstrates a substantial gap between humans and\nmodels. 
VLMs benefit from human-curated descriptions but struggle to\nself-generate such representations at the right level of abstraction.\n","authors":["Koen Kraaijveld","Yifan Jiang","Kaixin Ma","Filip Ilievski"],"pdf_url":"https://arxiv.org/pdf/2409.04053v1.pdf","comment":"18 pages, 10 figures, submitted to AAAI-25"},{"id":"http://arxiv.org/abs/2409.04050v1","updated":"2024-09-06T06:46:01Z","published":"2024-09-06T06:46:01Z","title":"EigenSR: Eigenimage-Bridged Pre-Trained RGB Learners for Single\n Hyperspectral Image Super-Resolution","summary":" Single hyperspectral image super-resolution (single-HSI-SR) aims to improve\nthe resolution of a single input low-resolution HSI. Due to the bottleneck of\ndata scarcity, the development of single-HSI-SR lags far behind that of RGB\nnatural images. In recent years, research on RGB SR has shown that models\npre-trained on large-scale benchmark datasets can greatly improve performance\non unseen data, which may stand as a remedy for HSI. But how can we transfer\nthe pre-trained RGB model to HSI, to overcome the data-scarcity bottleneck?\nBecause of the significant difference in the channels between the pre-trained\nRGB model and the HSI, the model cannot focus on the correlation along the\nspectral dimension, thus limiting its ability to utilize on HSI. Inspired by\nthe HSI spatial-spectral decoupling, we propose a new framework that first\nfine-tunes the pre-trained model with the spatial components (known as\neigenimages), and then infers on unseen HSI using an iterative spectral\nregularization (ISR) to maintain the spectral correlation. The advantages of\nour method lie in: 1) we effectively inject the spatial texture processing\ncapabilities of the pre-trained RGB model into HSI while keeping spectral\nfidelity, 2) learning in the spectral-decorrelated domain can improve the\ngeneralizability to spectral-agnostic data, and 3) our inference in the\neigenimage domain naturally exploits the spectral low-rank property of HSI,\nthereby reducing the complexity. This work bridges the gap between pre-trained\nRGB models and HSI via eigenimages, addressing the issue of limited HSI\ntraining data, hence the name EigenSR. Extensive experiments show that EigenSR\noutperforms the state-of-the-art (SOTA) methods in both spatial and spectral\nmetrics. Our code will be released.\n","authors":["Xi Su","Xiangfei Shen","Mingyang Wan","Jing Nie","Lihui Chen","Haijun Liu","Xichuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.04050v1.pdf","comment":"Submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2403.17712v2","updated":"2024-09-06T06:40:16Z","published":"2024-03-26T13:58:47Z","title":"Invisible Gas Detection: An RGB-Thermal Cross Attention Network and A\n New Benchmark","summary":" The widespread use of various chemical gases in industrial processes\nnecessitates effective measures to prevent their leakage during transportation\nand storage, given their high toxicity. Thermal infrared-based computer vision\ndetection techniques provide a straightforward approach to identify gas leakage\nareas. However, the development of high-quality algorithms has been challenging\ndue to the low texture in thermal images and the lack of open-source datasets.\nIn this paper, we present the RGB-Thermal Cross Attention Network (RT-CAN),\nwhich employs an RGB-assisted two-stream network architecture to integrate\ntexture information from RGB images and gas area information from thermal\nimages. 
Additionally, to facilitate the research of invisible gas detection, we\nintroduce Gas-DB, an extensive open-source gas detection database including\nabout 1.3K well-annotated RGB-thermal images with eight variant collection\nscenes. Experimental results demonstrate that our method successfully leverages\nthe advantages of both modalities, achieving state-of-the-art (SOTA)\nperformance among RGB-thermal methods, surpassing single-stream SOTA models in\nterms of accuracy, Intersection over Union (IoU), and F2 metrics by 4.86%, 5.65%,\nand 4.88%, respectively. The code and data can be found at\nhttps://github.com/logic112358/RT-CAN.\n","authors":["Jue Wang","Yuxiang Lin","Qi Zhao","Dong Luo","Shuaibao Chen","Wei Chen","Xiaojiang Peng"],"pdf_url":"https://arxiv.org/pdf/2403.17712v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04041v1","updated":"2024-09-06T06:20:11Z","published":"2024-09-06T06:20:11Z","title":"On Evaluation of Vision Datasets and Models using Human Competency\n Frameworks","summary":" Evaluating models and datasets in computer vision remains a challenging task,\nwith most leaderboards relying solely on accuracy. While accuracy is a popular\nmetric for model evaluation, it provides only a coarse assessment by\nconsidering a single model's score on all dataset items. This paper explores\nItem Response Theory (IRT), a framework that infers interpretable latent\nparameters for an ensemble of models and each dataset item, enabling richer\nevaluation and analysis beyond the single accuracy number. Leveraging IRT, we\nassess model calibration, select informative data subsets, and demonstrate the\nusefulness of its latent parameters for analyzing and comparing models and\ndatasets in computer vision.\n","authors":["Rahul Ramachandran","Tejal Kulkarni","Charchit Sharma","Deepak Vijaykeerthy","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2409.04041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04038v1","updated":"2024-09-06T06:11:28Z","published":"2024-09-06T06:11:28Z","title":"PlantSeg: A Large-Scale In-the-wild Dataset for Plant Disease\n Segmentation","summary":" Plant diseases pose significant threats to agriculture. They necessitate\nproper diagnosis and effective treatment to safeguard crop yields. To automate\nthe diagnosis process, image segmentation is usually adopted for precisely\nidentifying diseased regions, thereby advancing precision agriculture.\nDeveloping robust image segmentation models for plant diseases demands\nhigh-quality annotations across numerous images. However, existing plant\ndisease datasets typically lack segmentation labels and are often confined to\ncontrolled laboratory settings, which do not adequately reflect the complexity\nof natural environments. Motivated by this fact, we established PlantSeg, a\nlarge-scale segmentation dataset for plant diseases. PlantSeg distinguishes\nitself from existing datasets in three key aspects. (1) Annotation type: Unlike\nthe majority of existing datasets that only contain class labels or bounding\nboxes, each image in PlantSeg includes detailed and high-quality segmentation\nmasks, associated with plant types and disease names. (2) Image source: Unlike\ntypical datasets that contain images from laboratory settings, PlantSeg\nprimarily comprises in-the-wild plant disease images. This choice enhances the\npractical applicability, as the trained models can be applied for integrated\ndisease management. 
(3) Scale: PlantSeg is extensive, featuring 11,400 images\nwith disease segmentation masks and an additional 8,000 healthy plant images\ncategorized by plant type. Extensive technical experiments validate the high\nquality of PlantSeg's annotations. This dataset not only allows researchers to\nevaluate their image classification methods but also provides a critical\nfoundation for developing and benchmarking advanced plant disease segmentation\nalgorithms.\n","authors":["Tianqi Wei","Zhi Chen","Xin Yu","Scott Chapman","Paul Melloy","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.04038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05953v5","updated":"2024-09-06T06:08:46Z","published":"2024-05-09T17:46:22Z","title":"Frame Interpolation with Consecutive Brownian Bridge Diffusion","summary":" Recent work in Video Frame Interpolation (VFI) tries to formulate VFI as a\ndiffusion-based conditional image generation problem, synthesizing the\nintermediate frame given a random noise and neighboring frames. Due to the\nrelatively high resolution of videos, Latent Diffusion Models (LDMs) are\nemployed as the conditional generation model, where the autoencoder compresses\nimages into latent representations for diffusion and then reconstructs images\nfrom these latent representations. Such a formulation poses a crucial\nchallenge: VFI expects that the output is deterministically equal to the ground\ntruth intermediate frame, but LDMs randomly generate a diverse set of different\nimages when the model runs multiple times. The reason for the diverse\ngeneration is that the cumulative variance (variance accumulated at each step\nof generation) of generated latent representations in LDMs is large. This makes\nthe sampling trajectory random, resulting in diverse rather than deterministic\ngenerations. To address this problem, we propose our unique solution: Frame\nInterpolation with Consecutive Brownian Bridge Diffusion. Specifically, we\npropose consecutive Brownian Bridge diffusion that takes a deterministic\ninitial value as input, resulting in a much smaller cumulative variance of\ngenerated latent representations. Our experiments suggest that our method can\nimprove together with the improvement of the autoencoder and achieve\nstate-of-the-art performance in VFI, leaving strong potential for further\nenhancement.\n","authors":["Zonglin Lyu","Ming Li","Jianbo Jiao","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2405.05953v5.pdf","comment":"corrected typo"},{"id":"http://arxiv.org/abs/2407.11633v2","updated":"2024-09-06T06:00:42Z","published":"2024-07-16T11:55:23Z","title":"Scaling Diffusion Transformers to 16 Billion Parameters","summary":" In this paper, we present DiT-MoE, a sparse version of the diffusion\nTransformer, that is scalable and competitive with dense networks while\nexhibiting highly optimized inference. The DiT-MoE includes two simple designs:\nshared expert routing and expert-level balance loss, thereby capturing common\nknowledge and reducing redundancy among the different routed experts. When\napplied to conditional image generation, a deep analysis of expert\nspecialization yields some interesting observations: (i) Expert selection shows\na preference for spatial position and denoising time step, while being insensitive\nto different class-conditional information; (ii) As the MoE layers go deeper,\nthe selection of experts gradually shifts from specific spatial positions to\ndispersion and balance. 
(iii) Expert specialization tends to be more\nconcentrated at the early time step and then gradually uniform after half. We\nattribute it to the diffusion process that first models the low-frequency\nspatial information and then high-frequency complex information. Based on the\nabove guidance, a series of DiT-MoE experimentally achieves performance on par\nwith dense networks yet requires much less computational load during inference.\nMore encouragingly, we demonstrate the potential of DiT-MoE with synthesized\nimage data, scaling diffusion model at a 16.5B parameter that attains a new\nSoTA FID-50K score of 1.80 in 512$\\times$512 resolution settings. The project\npage: https://github.com/feizc/DiT-MoE.\n","authors":["Zhengcong Fei","Mingyuan Fan","Changqian Yu","Debang Li","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2407.11633v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04035v1","updated":"2024-09-06T05:57:49Z","published":"2024-09-06T05:57:49Z","title":"MultiCounter: Multiple Action Agnostic Repetition Counting in Untrimmed\n Videos","summary":" Multi-instance Repetitive Action Counting (MRAC) aims to estimate the number\nof repetitive actions performed by multiple instances in untrimmed videos,\ncommonly found in human-centric domains like sports and exercise. In this\npaper, we propose MultiCounter, a fully end-to-end deep learning framework that\nenables simultaneous detection, tracking, and counting of repetitive actions of\nmultiple human instances. Specifically, MultiCounter incorporates two novel\nmodules: 1) mixed spatiotemporal interaction for efficient context correlation\nacross consecutive frames, and 2) task-specific heads for accurate perception\nof periodic boundaries and generalization for action-agnostic human instances.\nWe train MultiCounter on a synthetic dataset called MultiRep generated from\nannotated real-world videos. Experiments on the MultiRep dataset validate the\nfundamental challenge of MRAC tasks and showcase the superiority of our\nproposed model. Compared to ByteTrack+RepNet, a solution that combines an\nadvanced tracker with a single repetition counter, MultiCounter substantially\nimproves Period-mAP by 41.0%, reduces AvgMAE by 58.6%, and increases AvgOBO\n1.48 times. This sets a new benchmark in the field of MRAC. Moreover,\nMultiCounter runs in real-time on a commodity GPU server and is insensitive to\nthe number of human instances in a video.\n","authors":["Yin Tang","Wei Luo","Jinrui Zhang","Wei Huang","Ruihai Jing","Deyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.04035v1.pdf","comment":"Accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2407.05278v3","updated":"2024-09-06T05:56:39Z","published":"2024-07-07T06:36:09Z","title":"HyperKAN: Kolmogorov-Arnold Networks make Hyperspectral Image\n Classificators Smarter","summary":" In traditional neural network architectures, a multilayer perceptron (MLP) is\ntypically employed as a classification block following the feature extraction\nstage. However, the Kolmogorov-Arnold Network (KAN) presents a promising\nalternative to MLP, offering the potential to enhance prediction accuracy. In\nthis paper, we propose the replacement of linear and convolutional layers of\ntraditional networks with KAN-based counterparts. These modifications allowed\nus to significantly increase the per-pixel classification accuracy for\nhyperspectral remote-sensing images. 
We modified seven different neural network\narchitectures for hyperspectral image classification and observed a substantial\nimprovement in the classification accuracy across all the networks. The\narchitectures considered in the paper include baseline MLP, state-of-the-art 1D\n(1DCNN) and 3D convolutional (two different 3DCNN, NM3DCNN), and transformer\n(SSFTT) architectures, as well as newly proposed M1DCNN. The greatest effect\nwas achieved for convolutional networks working exclusively on spectral data,\nand the best classification quality was achieved using a KAN-based transformer\narchitecture. All the experiments were conducted using seven openly available\nhyperspectral datasets. Our code is available at\nhttps://github.com/f-neumann77/HyperKAN.\n","authors":["Valeriy Lobanov","Nikita Firsov","Evgeny Myasnikov","Roman Khabibullin","Artem Nikonorov"],"pdf_url":"https://arxiv.org/pdf/2407.05278v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17400v2","updated":"2024-09-06T05:50:16Z","published":"2024-04-26T13:21:31Z","title":"Spatial-frequency Dual-Domain Feature Fusion Network for Low-Light\n Remote Sensing Image Enhancement","summary":" Low-light remote sensing images generally feature high resolution and high\nspatial complexity, with continuously distributed surface features in space.\nThis continuity in scenes leads to extensive long-range correlations in spatial\ndomains within remote sensing images. Convolutional Neural Networks, which rely\non local correlations for long-distance modeling, struggle to establish\nlong-range correlations in such images. On the other hand, transformer-based\nmethods that focus on global information face high computational complexities\nwhen processing high-resolution remote sensing images. From another\nperspective, Fourier transform can compute global information without\nintroducing a large number of parameters, enabling the network to more\nefficiently capture the overall image structure and establish long-range\ncorrelations. Therefore, we propose a Dual-Domain Feature Fusion Network (DFFN)\nfor low-light remote sensing image enhancement. Specifically, this challenging\ntask of low-light enhancement is divided into two more manageable sub-tasks:\nthe first phase learns amplitude information to restore image brightness, and\nthe second phase learns phase information to refine details. To facilitate\ninformation exchange between the two phases, we designed an information fusion\naffine block that combines data from different phases and scales. Additionally,\nwe have constructed two dark light remote sensing datasets to address the\ncurrent lack of datasets in dark light remote sensing image enhancement.\nExtensive evaluations show that our method outperforms existing\nstate-of-the-art methods. The code is available at\nhttps://github.com/iijjlk/DFFN.\n","authors":["Zishu Yao","Guodong Fan","Jinfu Fan","Min Gan","C. L. Philip Chen"],"pdf_url":"https://arxiv.org/pdf/2404.17400v2.pdf","comment":"14 page"},{"id":"http://arxiv.org/abs/2409.04033v1","updated":"2024-09-06T05:49:38Z","published":"2024-09-06T05:49:38Z","title":"Dense Hand-Object(HO) GraspNet with Full Grasping Taxonomy and Dynamics","summary":" Existing datasets for 3D hand-object interaction are limited either in the\ndata cardinality, data variations in interaction scenarios, or the quality of\nannotations. In this work, we present a comprehensive new training dataset for\nhand-object interaction called HOGraspNet. 
It is the only real dataset that\ncaptures full grasp taxonomies, providing grasp annotation and wide intraclass\nvariations. Using grasp taxonomies as atomic actions, their combinations in space\nand time can represent complex hand activities around objects. We select\n22 rigid objects from the YCB dataset and 8 other compound objects using shape\nand size taxonomies, ensuring coverage of all hand grasp configurations. The\ndataset includes diverse hand shapes from 99 participants aged 10 to 74,\ncontinuous video frames, and 1.5M sparse RGB-Depth frames with\nannotations. It offers labels for 3D hand and object meshes, 3D keypoints,\ncontact maps, and \\emph{grasp labels}. Accurate hand and object 3D meshes are\nobtained by fitting the hand parametric model (MANO) and the hand implicit\nfunction (HALO) to multi-view RGBD frames, with the MoCap system only for\nobjects. Note that HALO fitting does not require any parameter tuning, enabling\nscalability to the dataset's size with comparable accuracy to MANO. We evaluate\nHOGraspNet on relevant tasks: grasp classification and 3D hand pose estimation.\nThe result shows performance variations based on grasp type and object class,\nindicating the potential importance of the interaction space captured by our\ndataset. The provided data aims at learning universal shape priors or\nfoundation models for 3D hand-object interaction. Our dataset and code are\navailable at https://hograspnet2024.github.io/.\n","authors":["Woojin Cho","Jihyun Lee","Minjae Yi","Minje Kim","Taeyun Woo","Donghwan Kim","Taewook Ha","Hyokeun Lee","Je-Hwan Ryu","Woontack Woo","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2409.04033v1.pdf","comment":"14 pages except for references. It will be published at European\n Conference on Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2409.04025v1","updated":"2024-09-06T04:44:52Z","published":"2024-09-06T04:44:52Z","title":"BFA-YOLO: Balanced multiscale object detection network for multi-view\n building facade attachments detection","summary":" Detection of building facade attachments such as doors, windows, balconies,\nair conditioner units, billboards, and glass curtain walls plays a pivotal role\nin numerous applications. Building facade attachments detection aids in\nbuilding information modeling (BIM) construction and meeting Level of Detail 3\n(LOD3) standards. Yet, it faces challenges like uneven object distribution,\nsmall object detection difficulty, and background interference. To counter\nthese, we propose BFA-YOLO, a model for detecting facade attachments in\nmulti-view images. BFA-YOLO incorporates three novel innovations: the Feature\nBalanced Spindle Module (FBSM) for addressing uneven distribution, the Target\nDynamic Alignment Task Detection Head (TDATH) aimed at improving small object\ndetection, and the Position Memory Enhanced Self-Attention Mechanism (PMESA) to\ncombat background interference, with each component specifically designed to\nsolve its corresponding challenge. Detection efficacy of deep network models\ndeeply depends on the dataset's characteristics. Existing open source datasets\nrelated to building facades are limited by their single perspective, small\nimage pool, and incomplete category coverage. We propose a novel method for\nbuilding facade attachments detection dataset construction and construct the\nBFA-3D dataset for facade attachments detection. 
The BFA-3D dataset features\nmulti-view, accurate labels, diverse categories, and detailed classification.\nBFA-YOLO surpasses YOLOv8 by 1.8% and 2.9% in mAP@0.5 on the multi-view BFA-3D\nand street-view Facade-WHU datasets, respectively. These results underscore\nBFA-YOLO's superior performance in detecting facade attachments.\n","authors":["Yangguang Chen","Tong Wang","Guanzhou Chen","Kun Zhu","Xiaoliang Tan","Jiaqi Wang","Hong Xie","Wenlin Zhou","Jingyi Zhao","Qing Wang","Xiaolong Luo","Xiaodong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.04025v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2409.04018v1","updated":"2024-09-06T04:10:33Z","published":"2024-09-06T04:10:33Z","title":"Towards Energy-Efficiency by Navigating the Trilemma of Energy, Latency,\n and Accuracy","summary":" Extended Reality (XR) enables immersive experiences through untethered\nheadsets but suffers from stringent battery and resource constraints.\nEnergy-efficient design is crucial to ensure both longevity and high\nperformance in XR devices. However, latency and accuracy are often prioritized\nover energy, leading to a gap in achieving energy efficiency. This paper\nexamines scene reconstruction, a key building block for immersive XR\nexperiences, and demonstrates how energy efficiency can be achieved by\nnavigating the trilemma of energy, latency, and accuracy.\n We explore three classes of energy-oriented optimizations, covering the\nalgorithm, execution, and data, that reveal a broad design space through\nconfigurable parameters. Our resulting 72 designs expose a wide range of\nlatency and energy trade-offs, with a smaller range of accuracy loss. We\nidentify a Pareto-optimal curve and show that the designs on the curve are\nachievable only through synergistic co-optimization of all three optimization\nclasses and by considering the latency and accuracy needs of downstream scene\nreconstruction consumers. Our analysis covering various use cases and\nmeasurements on an embedded class system shows that, relative to the baseline,\nour designs offer energy benefits of up to 60X with potential latency range of\n4X slowdown to 2X speedup. Detailed exploration of a use case across\nrepresentative data sequences from ScanNet showed about 25X energy savings with\n1.5X latency reduction and negligible reconstruction quality loss.\n","authors":["Boyuan Tian","Yihan Pang","Muhammad Huzaifa","Shenlong Wang","Sarita Adve"],"pdf_url":"https://arxiv.org/pdf/2409.04018v1.pdf","comment":"ISMAR 2024"},{"id":"http://arxiv.org/abs/2409.04013v1","updated":"2024-09-06T03:53:59Z","published":"2024-09-06T03:53:59Z","title":"3D-GP-LMVIC: Learning-based Multi-View Image Coding with 3D Gaussian\n Geometric Priors","summary":" Multi-view image compression is vital for 3D-related applications. To\neffectively model correlations between views, existing methods typically\npredict disparity between two views on a 2D plane, which works well for small\ndisparities, such as in stereo images, but struggles with larger disparities\ncaused by significant view changes. To address this, we propose a novel\napproach: learning-based multi-view image coding with 3D Gaussian geometric\npriors (3D-GP-LMVIC). Our method leverages 3D Gaussian Splatting to derive\ngeometric priors of the 3D scene, enabling more accurate disparity estimation\nacross views within the compression model. Additionally, we introduce a depth\nmap compression model to reduce redundancy in geometric information between\nviews. 
A multi-view sequence ordering method is also proposed to enhance\ncorrelations between adjacent views. Experimental results demonstrate that\n3D-GP-LMVIC surpasses both traditional and learning-based methods in\nperformance, while maintaining fast encoding and decoding speed.\n","authors":["Yujun Huang","Bin Chen","Niu Lian","Baoyi An","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2409.04013v1.pdf","comment":"19pages, 8 figures, conference"},{"id":"http://arxiv.org/abs/2409.04011v1","updated":"2024-09-06T03:34:44Z","published":"2024-09-06T03:34:44Z","title":"Hybrid Mask Generation for Infrared Small Target Detection with\n Single-Point Supervision","summary":" Single-frame infrared small target (SIRST) detection poses a significant\nchallenge due to the requirement to discern minute targets amidst complex\ninfrared background clutter. Recently, deep learning approaches have shown\npromising results in this domain. However, these methods heavily rely on\nextensive manual annotations, which are particularly cumbersome and\nresource-intensive for infrared small targets owing to their minute sizes. To\naddress this limitation, we introduce a Hybrid Mask Generation (HMG) approach\nthat recovers high-quality masks for each target from only a single-point label\nfor network training. Specifically, our HMG approach consists of a handcrafted\nPoints-to-Mask Generation strategy coupled with a pseudo mask updating strategy\nto recover and refine pseudo masks from point labels. The Points-to-Mask\nGeneration strategy divides two distinct stages: Points-to-Box conversion,\nwhere individual point labels are transformed into bounding boxes, and\nsubsequently, Box-to-Mask prediction, where these bounding boxes are elaborated\ninto precise masks. The mask updating strategy integrates the complementary\nstrengths of handcrafted and deep-learning algorithms to iteratively refine the\ninitial pseudo masks. Experimental results across three datasets demonstrate\nthat our method outperforms the existing methods for infrared small target\ndetection with single-point supervision.\n","authors":["Weijie He","Mushui Liu","Yunlong Yu","Zheming Lu","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2409.04011v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.04005v1","updated":"2024-09-06T03:13:45Z","published":"2024-09-06T03:13:45Z","title":"Qihoo-T2X: An Efficiency-Focused Diffusion Transformer via Proxy Tokens\n for Text-to-Any-Task","summary":" The global self-attention mechanism in diffusion transformers involves\nredundant computation due to the sparse and redundant nature of visual\ninformation, and the attention map of tokens within a spatial window shows\nsignificant similarity. To address this redundancy, we propose the Proxy Token\nDiffusion Transformer (PT-DiT), which employs sparse representative token\nattention (where the number of representative tokens is much smaller than the\ntotal number of tokens) to model global visual information efficiently.\nSpecifically, in each transformer block, we randomly sample one token from each\nspatial-temporal window to serve as a proxy token for that region. The global\nsemantics are captured through the self-attention of these proxy tokens and\nthen injected into all latent tokens via cross-attention. Simultaneously, we\nintroduce window and shift window attention to address the limitations in\ndetail modeling caused by the sparse attention mechanism. 
Building on the\nwell-designed PT-DiT, we further develop the Qihoo-T2X family, which includes a\nvariety of models for T2I, T2V, and T2MV tasks. Experimental results show that\nPT-DiT achieves competitive performance while reducing the computational\ncomplexity in both image and video generation tasks (e.g., a 48% reduction\ncompared to DiT and a 35% reduction compared to Pixart-alpha). Our source code\nis available at https://github.com/360CVGroup/Qihoo-T2X.\n","authors":["Jing Wang","Ao Ma","Jiasong Feng","Dawei Leng","Yuhui Yin","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2409.04005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04004v1","updated":"2024-09-06T03:10:59Z","published":"2024-09-06T03:10:59Z","title":"One-Shot Diffusion Mimicker for Handwritten Text Generation","summary":" Existing handwritten text generation methods often require more than ten\nhandwriting samples as style references. However, in practical applications,\nusers tend to prefer a handwriting generation model that operates with just a\nsingle reference sample for its convenience and efficiency. This approach,\nknown as \"one-shot generation\", significantly simplifies the process but poses\na significant challenge due to the difficulty of accurately capturing a\nwriter's style from a single sample, especially when extracting fine details\nfrom the characters' edges amidst sparse foreground and undesired background\nnoise. To address this problem, we propose a One-shot Diffusion Mimicker\n(One-DM) to generate handwritten text that can mimic any calligraphic style\nwith only one reference sample. Inspired by the fact that high-frequency\ninformation of the individual sample often contains distinct style patterns\n(e.g., character slant and letter joining), we develop a novel style-enhanced\nmodule to improve the style extraction by incorporating high-frequency\ncomponents from a single sample. We then fuse the style features with the text\ncontent as a merged condition for guiding the diffusion model to produce\nhigh-quality handwritten text images. Extensive experiments demonstrate that\nour method can successfully generate handwriting scripts with just one sample\nreference in multiple languages, even outperforming previous methods using over\nten samples. Our source code is available at\nhttps://github.com/dailenson/One-DM.\n","authors":["Gang Dai","Yifan Zhang","Quhui Ke","Qiangya Guo","Shuangping Huang"],"pdf_url":"https://arxiv.org/pdf/2409.04004v1.pdf","comment":"To appear in ECCV 2024"},{"id":"http://arxiv.org/abs/2409.04003v1","updated":"2024-09-06T03:09:58Z","published":"2024-09-06T03:09:58Z","title":"DreamForge: Motion-Aware Autoregressive Video Generation for Multi-View\n Driving Scenes","summary":" Recent advances in diffusion models have significantly enhanced the\ncontrollable generation of streetscapes and facilitated downstream\nperception and planning tasks. However, challenges such as maintaining temporal\ncoherence, generating long videos, and accurately modeling driving scenes\npersist. Accordingly, we propose DreamForge, an advanced diffusion-based\nautoregressive video generation model designed for the long-term generation of\n3D-controllable and extensible video. In terms of controllability, our\nDreamForge supports flexible conditions such as text descriptions, camera\nposes, 3D bounding boxes, and road layouts, while also providing perspective\nguidance to produce driving scenes that are both geometrically and contextually\naccurate. 
For consistency, we ensure inter-view consistency through cross-view\nattention and temporal coherence via an autoregressive architecture enhanced\nwith motion cues. Codes will be available at\nhttps://github.com/PJLab-ADG/DriveArena.\n","authors":["Jianbiao Mei","Yukai Ma","Xuemeng Yang","Licheng Wen","Tiantian Wei","Min Dou","Botian Shi","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2409.04003v1.pdf","comment":"Second place solution for W-CODA-Track2"},{"id":"http://arxiv.org/abs/2312.02432v2","updated":"2024-09-06T03:00:09Z","published":"2023-12-05T02:17:48Z","title":"Orthogonal Adaptation for Modular Customization of Diffusion Models","summary":" Customization techniques for text-to-image models have paved the way for a\nwide range of previously unattainable applications, enabling the generation of\nspecific concepts across diverse contexts and styles. While existing methods\nfacilitate high-fidelity customization for individual concepts or a limited,\npre-defined set of them, they fall short of achieving scalability, where a\nsingle model can seamlessly render countless concepts. In this paper, we\naddress a new problem called Modular Customization, with the goal of\nefficiently merging customized models that were fine-tuned independently for\nindividual concepts. This allows the merged model to jointly synthesize\nconcepts in one image without compromising fidelity or incurring any additional\ncomputational costs. To address this problem, we introduce Orthogonal\nAdaptation, a method designed to encourage the customized models, which do not\nhave access to each other during fine-tuning, to have orthogonal residual\nweights. This ensures that during inference time, the customized models can be\nsummed with minimal interference. Our proposed method is both simple and\nversatile, applicable to nearly all optimizable weights in the model\narchitecture. Through an extensive set of quantitative and qualitative\nevaluations, our method consistently outperforms relevant baselines in terms of\nefficiency and identity preservation, demonstrating a significant leap toward\nscalable customization of diffusion models.\n","authors":["Ryan Po","Guandao Yang","Kfir Aberman","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2312.02432v2.pdf","comment":"Project page: https://ryanpo.com/ortha/; Hugging Face Demo:\n https://huggingface.co/spaces/ujin-song/ortha"},{"id":"http://arxiv.org/abs/2409.03982v1","updated":"2024-09-06T02:12:21Z","published":"2024-09-06T02:12:21Z","title":"Boundary feature fusion network for tooth image segmentation","summary":" Tooth segmentation is a critical technology in the field of medical image\nsegmentation, with applications ranging from orthodontic treatment to human\nbody identification and dental pathology assessment. Despite the development of\nnumerous tooth image segmentation models by researchers, a common shortcoming\nis the failure to account for the challenges of blurred tooth boundaries.\nDental diagnostics require precise delineation of tooth boundaries. This paper\nintroduces an innovative tooth segmentation network that integrates boundary\ninformation to address the issue of indistinct boundaries between teeth and\nadjacent tissues. This network's core is its boundary feature extraction\nmodule, which is designed to extract detailed boundary information from\nhigh-level features. 
Concurrently, the feature cross-fusion module merges\ndetailed boundary and global semantic information in a synergistic way,\nallowing for stepwise layer transfer of feature information. This method\nresults in precise tooth segmentation. In the most recent STS Data Challenge,\nour methodology was rigorously tested and received a commendable overall score\nof 0.91. When compared to other existing approaches, this score demonstrates\nour method's significant superiority in segmenting tooth boundaries.\n","authors":["Dongping Zhang","Zheng Li","Fangao Zeng","Yutong Wei"],"pdf_url":"https://arxiv.org/pdf/2409.03982v1.pdf","comment":"MICCAI workshop,see https://link.springer.com/book/9783031723957"},{"id":"http://arxiv.org/abs/2402.03666v3","updated":"2024-09-06T02:02:41Z","published":"2024-02-06T03:39:44Z","title":"QuEST: Low-bit Diffusion Model Quantization via Efficient Selective\n Finetuning","summary":" The practical deployment of diffusion models still suffers from the high\nmemory and time overhead. While quantization paves a way for compression and\nacceleration, existing methods unfortunately fail when the models are quantized\nto low-bits. In this paper, we empirically unravel three properties in\nquantized diffusion models that compromise the efficacy of current methods:\nimbalanced activation distributions, imprecise temporal information, and\nvulnerability to perturbations of specific modules. To alleviate the\nintensified low-bit quantization difficulty stemming from the distribution\nimbalance, we propose finetuning the quantized model to better adapt to the\nactivation distribution. Building on this idea, we identify two critical types\nof quantized layers: those holding vital temporal information and those\nsensitive to reduced bit-width, and finetune them to mitigate performance\ndegradation with efficiency. We empirically verify that our approach modifies\nthe activation distribution and provides meaningful temporal information,\nfacilitating easier and more accurate quantization. Our method is evaluated\nover three high-resolution image generation tasks and achieves state-of-the-art\nperformance under various bit-width settings, as well as being the first method\nto generate readable images on full 4-bit (i.e. W4A4) Stable Diffusion. Code is\navailable \\href{https://github.com/hatchetProject/QuEST}{here}.\n","authors":["Haoxuan Wang","Yuzhang Shang","Zhihang Yuan","Junyi Wu","Junchi Yan","Yan Yan"],"pdf_url":"https://arxiv.org/pdf/2402.03666v3.pdf","comment":"Code available at https://github.com/hatchetProject/QuEST"},{"id":"http://arxiv.org/abs/2409.03977v1","updated":"2024-09-06T01:54:35Z","published":"2024-09-06T01:54:35Z","title":"Bi-modality Images Transfer with a Discrete Process Matching Method","summary":" Recently, medical image synthesis gains more and more popularity, along with\nthe rapid development of generative models. Medical image synthesis aims to\ngenerate an unacquired image modality, often from other observed data\nmodalities. Synthesized images can be used for clinical diagnostic assistance,\ndata augmentation for model training and validation or image quality improving.\nIn the meanwhile, the flow-based models are among the successful generative\nmodels for the ability of generating realistic and high-quality synthetic\nimages. 
However, most flow-based models require calculating flow ordinary\ndifferential equation (ODE) evolution steps in the transfer process, for which\nperformance is significantly limited by heavy computation time due to a large\nnumber of time iterations. In this paper, we propose a novel flow-based model,\nnamely Discrete Process Matching (DPM), to accomplish bi-modality image\ntransfer tasks. Different from other flow-matching-based models, we propose to\nutilize both forward and backward ODE flow and enhance the consistency of the\nintermediate images at a few discrete time steps, resulting in a transfer process\nwith far fewer iteration steps while maintaining high-quality generations for\nboth modalities. Our experiments on three datasets of MRI T1/T2 and CT/MRI\ndemonstrate that DPM outperforms other state-of-the-art flow-based methods for\nbi-modality image synthesis, achieving higher image quality with lower\ncomputation time cost.\n","authors":["Zhe Xiong","Qiaoqiao Ding","Xiaoqun Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.08621v4","updated":"2024-09-06T01:05:22Z","published":"2024-05-14T14:01:15Z","title":"RMT-BVQA: Recurrent Memory Transformer-based Blind Video Quality\n Assessment for Enhanced Video Content","summary":" With recent advances in deep learning, numerous algorithms have been\ndeveloped to enhance video quality, reduce visual artifacts, and improve\nperceptual quality. However, little research has been reported on the quality\nassessment of enhanced content - the evaluation of enhancement methods is often\nbased on quality metrics that were designed for compression applications. In\nthis paper, we propose a novel blind deep video quality assessment (VQA) method\nspecifically for enhanced video content. It employs a new Recurrent Memory\nTransformer (RMT) based network architecture to obtain video quality\nrepresentations, which is optimized through a novel content-quality-aware\ncontrastive learning strategy based on a new database containing 13K training\npatches with enhanced content. The extracted quality representations are then\ncombined through linear regression to generate video-level quality indices. The\nproposed method, RMT-BVQA, has been evaluated on the VDPVE (VQA Dataset for\nPerceptual Video Enhancement) database through a five-fold cross validation.\nThe results show its superior correlation performance when compared to ten\nexisting no-reference quality metrics.\n","authors":["Tianhao Peng","Chen Feng","Duolikun Danier","Fan Zhang","Benoit Vallade","Alex Mackin","David Bull"],"pdf_url":"https://arxiv.org/pdf/2405.08621v4.pdf","comment":"This paper has been accepted by the ECCV 2024 AIM Advances in Image\n Manipulation workshop"},{"id":"http://arxiv.org/abs/2409.03961v1","updated":"2024-09-06T00:59:10Z","published":"2024-09-06T00:59:10Z","title":"Generating Faithful and Salient Text from Multimodal Data","summary":" While large multimodal models (LMMs) have obtained strong performance on many\nmultimodal tasks, they may still hallucinate while generating text. Their\nperformance on detecting salient features from visual data is also unclear. In\nthis paper, we develop a framework to generate faithful and salient text from\nmixed-modal data, which includes images and structured data (represented in\nknowledge graphs or tables). Specifically, we train a small vision critic model\nto identify hallucinated and non-salient features from the image modality. 
The\ncritic model also generates a list of salient image features. This information\nis used in the post editing step to improve the generation quality. Experiments\non two datasets show that our framework improves LMMs' generation quality on\nboth faithfulness and saliency, outperforming recent techniques aimed at\nreducing hallucination.\n","authors":["Tahsina Hashem","Weiqing Wang","Derry Tanti Wijaya","Mohammed Eunus Ali","Yuan-Fang Li"],"pdf_url":"https://arxiv.org/pdf/2409.03961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03947v1","updated":"2024-09-06T00:04:35Z","published":"2024-09-06T00:04:35Z","title":"FODA-PG for Enhanced Medical Imaging Narrative Generation: Adaptive\n Differentiation of Normal and Abnormal Attributes","summary":" Automatic Medical Imaging Narrative generation aims to alleviate the workload\nof radiologists by producing accurate clinical descriptions directly from\nradiological images. However, the subtle visual nuances and domain-specific\nterminology in medical images pose significant challenges compared to generic\nimage captioning tasks. Existing approaches often neglect the vital distinction\nbetween normal and abnormal findings, leading to suboptimal performance. In\nthis work, we propose FODA-PG, a novel Fine-grained Organ-Disease Adaptive\nPartitioning Graph framework that addresses these limitations through\ndomain-adaptive learning. FODA-PG constructs a granular graphical\nrepresentation of radiological findings by separating disease-related\nattributes into distinct \"disease-specific\" and \"disease-free\" categories based\non their clinical significance and location. This adaptive partitioning enables\nour model to capture the nuanced differences between normal and pathological\nstates, mitigating the impact of data biases. By integrating this fine-grained\nsemantic knowledge into a powerful transformer-based architecture and providing\nrigorous mathematical justifications for its effectiveness, FODA-PG generates\nprecise and clinically coherent reports with enhanced generalization\ncapabilities. Extensive experiments on the IU-Xray and MIMIC-CXR benchmarks\ndemonstrate the superiority of our approach over state-of-the-art methods,\nhighlighting the importance of domain adaptation in medical report generation.\n","authors":["Kai Shu","Yuzhuo Jia","Ziyang Zhang","Jiechao Gao"],"pdf_url":"https://arxiv.org/pdf/2409.03947v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.04432v1","updated":"2024-09-06T17:54:43Z","published":"2024-09-06T17:54:43Z","title":"A Survey on Knowledge Organization Systems of Research Fields: Resources\n and Challenges","summary":" Knowledge Organization Systems (KOSs), such as term lists, thesauri,\ntaxonomies, and ontologies, play a fundamental role in categorising, managing,\nand retrieving information. In the academic domain, KOSs are often adopted for\nrepresenting research areas and their relationships, primarily aiming to\nclassify research articles, academic courses, patents, books, scientific\nvenues, domain experts, grants, software, experiment materials, and several\nother relevant products and agents. These structured representations of\nresearch areas, widely embraced by many academic fields, have proven effective\nin empowering AI-based systems to i) enhance retrievability of relevant\ndocuments, ii) enable advanced analytic solutions to quantify the impact of\nacademic research, and iii) analyse and forecast research dynamics. 
This paper\naims to present a comprehensive survey of the current KOS for academic\ndisciplines. We analysed and compared 45 KOSs according to five main\ndimensions: scope, structure, curation, usage, and links to other KOSs. Our\nresults reveal a very heterogeneous scenario in terms of scope, scale, quality,\nand usage, highlighting the need for more integrated solutions for representing\nresearch knowledge across academic fields. We conclude by discussing the main\nchallenges and the most promising future directions.\n","authors":["Angelo Salatino","Tanay Aggarwal","Andrea Mannocci","Francesco Osborne","Enrico Motta"],"pdf_url":"https://arxiv.org/pdf/2409.04432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04339v1","updated":"2024-09-06T15:17:40Z","published":"2024-09-06T15:17:40Z","title":"How Fair is Your Diffusion Recommender Model?","summary":" Diffusion-based recommender systems have recently proven to outperform\ntraditional generative recommendation approaches, such as variational\nautoencoders and generative adversarial networks. Nevertheless, the machine\nlearning literature has raised several concerns regarding the possibility that\ndiffusion models, while learning the distribution of data samples, may\ninadvertently carry information bias and lead to unfair outcomes. In light of\nthis aspect, and considering the relevance that fairness has held in\nrecommendations over the last few decades, we conduct one of the first fairness\ninvestigations in the literature on DiffRec, a pioneer approach in\ndiffusion-based recommendation. First, we propose an experimental setting\ninvolving DiffRec (and its variant L-DiffRec) along with nine state-of-the-art\nrecommendation models, two popular recommendation datasets from the\nfairness-aware literature, and six metrics accounting for accuracy and\nconsumer/provider fairness. Then, we perform a twofold analysis, one assessing\nmodels' performance under accuracy and recommendation fairness separately, and\nthe other identifying if and to what extent such metrics can strike a\nperformance trade-off. Experimental results from both studies confirm the\ninitial unfairness warnings but pave the way for how to address them in future\nresearch directions.\n","authors":["Daniele Malitesta","Giacomo Medda","Erasmo Purificato","Ludovico Boratto","Fragkiskos D. Malliaros","Mirko Marras","Ernesto William De Luca"],"pdf_url":"https://arxiv.org/pdf/2409.04339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04329v1","updated":"2024-09-06T15:05:12Z","published":"2024-09-06T15:05:12Z","title":"Enhancing Sequential Music Recommendation with Personalized Popularity\n Awareness","summary":" In the realm of music recommendation, sequential recommender systems have\nshown promise in capturing the dynamic nature of music consumption.\nNevertheless, traditional Transformer-based models, such as SASRec and\nBERT4Rec, while effective, encounter challenges due to the unique\ncharacteristics of music listening habits. In fact, existing models struggle to\ncreate a coherent listening experience due to rapidly evolving preferences.\nMoreover, music consumption is characterized by a prevalence of repeated\nlistening, i.e., users frequently return to their favourite tracks, an\nimportant signal that could be framed as individual or personalized popularity.\n This paper addresses these challenges by introducing a novel approach that\nincorporates personalized popularity information into sequential\nrecommendation. 
By combining user-item popularity scores with model-generated\nscores, our method effectively balances the exploration of new music with the\nsatisfaction of user preferences. Experimental results demonstrate that a\nPersonalized Most Popular recommender, a method solely based on user-specific\npopularity, outperforms existing state-of-the-art models. Furthermore,\naugmenting Transformer-based models with personalized popularity awareness\nyields superior performance, showing improvements ranging from 25.2% to 69.8%.\nThe code for this paper is available at\nhttps://github.com/sisinflab/personalized-popularity-awareness.\n","authors":["Davide Abbattista","Vito Walter Anelli","Tommaso Di Noia","Craig Macdonald","Aleksandr Vladimirovich Petrov"],"pdf_url":"https://arxiv.org/pdf/2409.04329v1.pdf","comment":"Accepted by RecSys'24 as an LBR paper"},{"id":"http://arxiv.org/abs/2403.00884v3","updated":"2024-09-06T14:49:21Z","published":"2024-03-01T10:01:36Z","title":"Zero-Shot Topic Classification of Column Headers: Leveraging LLMs for\n Metadata Enrichment","summary":" Traditional dataset retrieval systems rely on metadata for indexing, rather\nthan on the underlying data values. However, high-quality metadata creation and\nenrichment often require manual annotations, which is a labour-intensive and\nchallenging process to automate. In this study, we propose a method to support\nmetadata enrichment using topic annotations generated by three Large Language\nModels (LLMs): ChatGPT-3.5, GoogleBard, and GoogleGemini. Our analysis focuses\non classifying column headers based on domain-specific topics from the\nConsortium of European Social Science Data Archives (CESSDA), a Linked Data\ncontrolled vocabulary. Our approach operates in a zero-shot setting,\nintegrating the controlled topic vocabulary directly within the input prompt.\nThis integration serves as a Large Context Windows approach, with the aim of\nimproving the results of the topic classification task.\n We evaluated the performance of the LLMs in terms of internal consistency,\ninter-machine alignment, and agreement with human classification. Additionally,\nwe investigate the impact of contextual information (i.e., dataset description)\non the classification outcomes. Our findings suggest that ChatGPT and\nGoogleGemini outperform GoogleBard in terms of internal consistency as well as\nLLM-human-agreement. Interestingly, we found that contextual information had no\nsignificant impact on LLM performance.\n This work proposes a novel approach that leverages LLMs for topic\nclassification of column headers using a controlled vocabulary, presenting a\npractical application of LLMs and Large Context Windows within the Semantic Web\ndomain. This approach has the potential to facilitate automated metadata\nenrichment, thereby enhancing dataset retrieval and the Findability,\nAccessibility, Interoperability, and Reusability (FAIR) of research data on the\nWeb.\n","authors":["Margherita Martorana","Tobias Kuhn","Lise Stork","Jacco van Ossenbruggen"],"pdf_url":"https://arxiv.org/pdf/2403.00884v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03708v2","updated":"2024-09-06T14:18:20Z","published":"2024-09-05T17:14:23Z","title":"RAG based Question-Answering for Contextual Response Prediction System","summary":" Large Language Models (LLMs) have shown versatility in various Natural\nLanguage Processing (NLP) tasks, including their potential as effective\nquestion-answering systems. 
However, to provide precise and relevant\ninformation in response to specific customer queries in industry settings, LLMs\nrequire access to a comprehensive knowledge base to avoid hallucinations.\nRetrieval Augmented Generation (RAG) emerges as a promising technique to\naddress this challenge. Yet, developing an accurate question-answering\nframework for real-world applications using RAG entails several challenges: 1)\ndata availability issues, 2) evaluating the quality of generated content, and\n3) the costly nature of human evaluation. In this paper, we introduce an\nend-to-end framework that employs LLMs with RAG capabilities for industry use\ncases. Given a customer query, the proposed system retrieves relevant knowledge\ndocuments and leverages them, along with previous chat history, to generate\nresponse suggestions for customer service agents in the contact centers of a\nmajor retail company. Through comprehensive automated and human evaluations, we\nshow that this solution outperforms the current BERT-based algorithms in\naccuracy and relevance. Our findings suggest that RAG-based LLMs can be an\nexcellent support to human customer service representatives by lightening their\nworkload.\n","authors":["Sriram Veturi","Saurabh Vaichal","Reshma Lal Jagadheesh","Nafis Irtiza Tripto","Nian Yan"],"pdf_url":"https://arxiv.org/pdf/2409.03708v2.pdf","comment":"Accepted at the 1st Workshop on GenAI and RAG Systems for Enterprise,\n CIKM'24. 6 pages"},{"id":"http://arxiv.org/abs/2408.09236v3","updated":"2024-09-06T13:34:16Z","published":"2024-08-17T16:04:31Z","title":"Hybrid Semantic Search: Unveiling User Intent Beyond Keywords","summary":" This paper addresses the limitations of traditional keyword-based search in\nunderstanding user intent and introduces a novel hybrid search approach that\nleverages the strengths of non-semantic search engines, Large Language Models\n(LLMs), and embedding models. The proposed system integrates keyword matching,\nsemantic vector embeddings, and LLM-generated structured queries to deliver\nhighly relevant and contextually appropriate search results. By combining these\ncomplementary methods, the hybrid approach effectively captures both explicit\nand implicit user intent.The paper further explores techniques to optimize\nquery execution for faster response times and demonstrates the effectiveness of\nthis hybrid search model in producing comprehensive and accurate search\noutcomes.\n","authors":["Aman Ahluwalia","Bishwajit Sutradhar","Karishma Ghosh","Indrapal Yadav","Arpan Sheetal","Prashant Patil"],"pdf_url":"https://arxiv.org/pdf/2408.09236v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15675v2","updated":"2024-09-06T13:20:40Z","published":"2024-04-24T06:05:35Z","title":"Hi-Gen: Generative Retrieval For Large-Scale Personalized E-commerce\n Search","summary":" Leveraging generative retrieval (GR) techniques to enhance search systems is\nan emerging methodology that has shown promising results in recent years. In\nGR, a text-to-text model maps string queries directly to relevant document\nidentifiers (docIDs), dramatically simplifying the retrieval process. However,\nwhen applying most GR models in large-scale E-commerce for personalized item\nsearch, we must face two key problems in encoding and decoding. (1) Existing\ndocID generation methods ignore the encoding of efficiency information, which\nis critical in E-commerce. 
(2) The positional information is important in\ndecoding docIDs, while prior studies have not adequately discriminated the\nsignificance of positional information or well exploited the inherent\ninterrelation among these positions. To overcome these problems, we introduce\nan efficient Hierarchical encoding-decoding Generative retrieval method\n(Hi-Gen) for large-scale personalized E-commerce search systems. Specifically,\nwe first design a representation learning model using metric learning to learn\ndiscriminative feature representations of items to capture semantic relevance\nand efficiency information. Then, we propose a category-guided hierarchical\nclustering scheme that makes full use of the semantic and efficiency\ninformation of items to facilitate docID generation. Finally, we design a\nposition-aware loss to discriminate the importance of positions and mine the\ninherent interrelation between different tokens at the same position. This loss\nboosts the performance of the language model used in the decoding stage.\nBesides, we propose two variants of Hi-Gen (Hi-Gen-I2I and Hi-Gen-Cluster) to\nsupport online real-time large-scale recall in the online serving process.\nHi-Gen gets 3.30% and 4.62% improvements over SOTA for Recall@1 on the public\nand industry datasets, respectively.\n","authors":["Yanjing Wu","Yinfu Feng","Jian Wang","Wenji Zhou","Yunan Ye","Rong Xiao","Jun Xiao"],"pdf_url":"https://arxiv.org/pdf/2404.15675v2.pdf","comment":"Accepted by ICDM 2024"},{"id":"http://arxiv.org/abs/2409.04244v1","updated":"2024-09-06T12:51:10Z","published":"2024-09-06T12:51:10Z","title":"WarpAdam: A new Adam optimizer based on Meta-Learning approach","summary":" Optimal selection of optimization algorithms is crucial for training deep\nlearning models. The Adam optimizer has gained significant attention due to its\nefficiency and wide applicability. However, to enhance the adaptability of\noptimizers across diverse datasets, we propose an innovative optimization\nstrategy by integrating the 'warped gradient descent' concept from Meta Learning\ninto the Adam optimizer. In the conventional Adam optimizer, gradients are\nutilized to compute estimates of gradient mean and variance, subsequently\nupdating model parameters. Our approach introduces a learnable distortion\nmatrix, denoted as P, which is employed for linearly transforming gradients.\nThis transformation slightly adjusts gradients during each iteration, enabling\nthe optimizer to better adapt to distinct dataset characteristics. By learning\nan appropriate distortion matrix P, our method aims to adaptively adjust\ngradient information across different data distributions, thereby enhancing\noptimization performance. Our research showcases the potential of this novel\napproach through theoretical insights and empirical evaluations. Experimental\nresults across various tasks and datasets validate the superiority of our\noptimizer that integrates the 'warped gradient descent' concept in terms of\nadaptability. Furthermore, we explore effective strategies for training the\nadaptation matrix P and identify scenarios where this method can yield optimal\nresults. In summary, this study introduces an innovative approach that merges\nthe 'warped gradient descent' concept from Meta Learning with the Adam\noptimizer. 
By introducing a learnable distortion matrix P within the optimizer,\nwe aim to enhance the model's generalization capability across diverse data\ndistributions, thus opening up new possibilities in the field of deep learning\noptimization.\n","authors":["Chengxi Pan","Junshang Chen","Jingrui Ye"],"pdf_url":"https://arxiv.org/pdf/2409.04244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02664v3","updated":"2024-09-06T11:38:00Z","published":"2024-05-04T13:25:06Z","title":"MedPromptExtract (Medical Data Extraction Tool): Anonymization and\n Hi-fidelity Automated data extraction using NLP and prompt engineering","summary":" Introduction: The labour-intensive nature of data extraction from sources\nlike discharge summaries (DS) poses significant obstacles to the digitisation\nof medical records, particularly for low- and middle-income countries (LMICs).\nIn this paper we present a completely automated method, MedPromptExtract, to\nefficiently extract data from DS while maintaining confidentiality. Methods:\nThe source of data was Discharge Summaries (DS) from Kokilaben Dhirubhai Ambani\nHospital (KDAH) of patients having Acute Kidney Injury (AKI). A pre-existing\ntool, EIGEN, which leverages semi-supervised learning techniques for\nhigh-fidelity information extraction, was used to anonymize the DS, and Natural\nLanguage Processing (NLP) was used to extract data from regular fields. We used\nPrompt Engineering and a Large Language Model (LLM) to extract custom clinical\ninformation from free-flowing text describing the patient's stay in the\nhospital. Twelve features associated with occurrence of AKI were extracted. The\nLLM responses were validated against clinicians' annotations. Results: The\nMedPromptExtract tool first subjected the DS to the anonymization pipeline, which\ntook three seconds per summary. Successful anonymization was verified by\nclinicians; thereafter, the NLP pipeline extracted structured text from the\nanonymized pdfs at the rate of 0.2 seconds per summary with 100%\naccuracy. Finally, the DS were analysed by the LLM pipeline using Gemini Pro for the\ntwelve features. Accuracy metrics were calculated by comparing model responses\nto clinicians' annotations, with seven features achieving AUCs above 0.9,\nindicating high fidelity of the extraction process. Conclusion:\nMedPromptExtract serves as an automated, adaptable tool for efficient data\nextraction from medical records with a dynamic user interface. Keywords:\nDigitizing Medical Records, Automated Anonymisation, Information Retrieval,\nLarge Language Models, Prompt Engineering\n","authors":["Roomani Srivastava","Suraj Prasad","Lipika Bhat","Sarvesh Deshpande","Barnali Das","Kshitij Jadhav"],"pdf_url":"https://arxiv.org/pdf/2405.02664v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04056v1","updated":"2024-09-06T06:53:45Z","published":"2024-09-06T06:53:45Z","title":"Refining Wikidata Taxonomy using Large Language Models","summary":" Due to its collaborative nature, Wikidata is known to have a complex\ntaxonomy, with recurrent issues like the ambiguity between instances and\nclasses, the inaccuracy of some taxonomic paths, the presence of cycles, and\nthe high level of redundancy across classes. Manual efforts to clean up this\ntaxonomy are time-consuming and prone to errors or subjective decisions. 
We\npresent WiKC, a new version of Wikidata taxonomy cleaned automatically using a\ncombination of Large Language Models (LLMs) and graph mining techniques.\nOperations on the taxonomy, such as cutting links or merging classes, are\nperformed with the help of zero-shot prompting on an open-source LLM. The\nquality of the refined taxonomy is evaluated from both intrinsic and extrinsic\nperspectives, on a task of entity typing for the latter, showing the practical\ninterest of WiKC.\n","authors":["Yiwen Peng","Thomas Bonald","Mehwish Alam"],"pdf_url":"https://arxiv.org/pdf/2409.04056v1.pdf","comment":"ACM International Conference on Information and Knowledge Management,\n Oct 2024, Boise, Idaho, United States"},{"id":"http://arxiv.org/abs/2409.04649v1","updated":"2024-09-06T23:16:06Z","published":"2024-09-06T23:16:06Z","title":"Preserving Individuality while Following the Crowd: Understanding the\n Role of User Taste and Crowd Wisdom in Online Product Rating Prediction","summary":" Numerous algorithms have been developed for online product rating prediction,\nbut the specific influence of user and product information in determining the\nfinal prediction score remains largely unexplored. Existing research often\nrelies on narrowly defined data settings, which overlooks real-world challenges\nsuch as the cold-start problem, cross-category information utilization, and\nscalability and deployment issues. To delve deeper into these aspects, and\nparticularly to uncover the roles of individual user taste and collective\nwisdom, we propose a unique and practical approach that emphasizes historical\nratings at both the user and product levels, encapsulated using a continuously\nupdated dynamic tree representation. This representation effectively captures\nthe temporal dynamics of users and products, leverages user information across\nproduct categories, and provides a natural solution to the cold-start problem.\nFurthermore, we have developed an efficient data processing strategy that makes\nthis approach highly scalable and easily deployable. Comprehensive experiments\nin real industry settings demonstrate the effectiveness of our approach.\nNotably, our findings reveal that individual taste dominates over collective\nwisdom in online product rating prediction, a perspective that contrasts with\nthe commonly observed wisdom of the crowd phenomenon in other domains. This\ndominance of individual user taste is consistent across various model types,\nincluding the boosting tree model, recurrent neural network (RNN), and\ntransformer-based architectures. 
This observation holds true across the overall\npopulation, within individual product categories, and in cold-start scenarios.\nOur findings underscore the significance of individual user tastes in the\ncontext of online product rating prediction and the robustness of our approach\nacross different model architectures.\n","authors":["Liang Wang","Shubham Jain","Yingtong Dou","Junpeng Wang","Chin-Chia Michael Yeh","Yujie Fan","Prince Aboagye","Yan Zheng","Xin Dai","Zhongfang Zhuang","Uday Singh Saini","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.04649v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2302.11157v2","updated":"2024-09-06T19:30:26Z","published":"2023-02-22T05:41:27Z","title":"FiNER-ORD: Financial Named Entity Recognition Open Research Dataset","summary":" Over the last two decades, the development of the CoNLL-2003 named entity\nrecognition (NER) dataset has helped enhance the capabilities of deep learning\nand natural language processing (NLP). The finance domain, characterized by its\nunique semantic and lexical variations for the same entities, presents specific\nchallenges to the NER task; thus, a domain-specific customized dataset is\ncrucial for advancing research in this field. In our work, we develop the first\nhigh-quality English Financial NER Open Research Dataset (FiNER-ORD). We\nbenchmark multiple pre-trained language models (PLMs) and large-language models\n(LLMs) on FiNER-ORD. We believe our proposed FiNER-ORD dataset will open future\nopportunities to use FiNER-ORD as a benchmark for financial domain-specific NER\nand NLP tasks. Our dataset, models, and code are publicly available on GitHub\nand Hugging Face under CC BY-NC 4.0 license.\n","authors":["Agam Shah","Abhinav Gullapalli","Ruchit Vithani","Michael Galarnyk","Sudheer Chava"],"pdf_url":"https://arxiv.org/pdf/2302.11157v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03140v2","updated":"2024-09-06T18:41:50Z","published":"2024-09-05T00:25:37Z","title":"GraphEx: A Graph-based Extraction Method for Advertiser Keyphrase\n Recommendation","summary":" Online sellers and advertisers are recommended keyphrases for their listed\nproducts, which they bid on to enhance their sales. One popular paradigm that\ngenerates such recommendations is Extreme Multi-Label Classification (XMC),\nwhich involves tagging/mapping keyphrases to items. We outline the limitations\nof using traditional item-query based tagging or mapping techniques for\nkeyphrase recommendations on E-Commerce platforms. We introduce GraphEx, an\ninnovative graph-based approach that recommends keyphrases to sellers using\nextraction of token permutations from item titles. Additionally, we demonstrate\nthat relying on traditional metrics such as precision/recall can be misleading\nin practical applications, thereby necessitating a combination of metrics to\nevaluate performance in real-world scenarios. These metrics are designed to\nassess the relevance of keyphrases to items and the potential for buyer\noutreach. GraphEx outperforms production models at eBay, achieving the\nobjectives mentioned above. 
It supports near real-time inferencing in\nresource-constrained production environments and scales effectively for\nbillions of items.\n","authors":["Ashirbad Mishra","Soumik Dey","Marshall Wu","Jinyu Zhao","He Yu","Kaichen Ni","Binbin Li","Kamesh Madduri"],"pdf_url":"https://arxiv.org/pdf/2409.03140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04540v1","updated":"2024-09-06T18:10:42Z","published":"2024-09-06T18:10:42Z","title":"A Unified Framework for Cross-Domain Recommendation","summary":" In addressing the persistent challenges of data-sparsity and cold-start\nissues in domain-expert recommender systems, Cross-Domain Recommendation (CDR)\nemerges as a promising methodology. CDR aims at enhancing prediction\nperformance in the target domain by leveraging interaction knowledge from\nrelated source domains, particularly through users or items that span across\nmultiple domains (e.g., Short-Video and Living-Room). For academic research\npurposes, there are a number of distinct aspects to guide CDR method designing,\nincluding the auxiliary domain number, domain-overlapped element, user-item\ninteraction types, and downstream tasks. With so many different CDR combination\nscenario settings, the proposed scenario-expert approaches are tailored to\naddress a specific vertical CDR scenario, and often lack the capacity to adapt\nto multiple horizontal scenarios. In an effect to coherently adapt to various\nscenarios, and drawing inspiration from the concept of domain-invariant\ntransfer learning, we extend the former SOTA model UniCDR in five different\naspects, named as UniCDR+. Our work was successfully deployed on the Kuaishou\nLiving-Room RecSys.\n","authors":["Jiangxia Cao","Shen Wang","Gaode Chen","Rui Huang","Shuang Yang","Zhaojie Liu","Guorui Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.04540v1.pdf","comment":"Work in progress"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2406.12002v2","updated":"2024-09-06T17:57:52Z","published":"2024-06-17T18:13:57Z","title":"Modeling, Inference, and Prediction in Mobility-Based Compartmental\n Models for Epidemiology","summary":" Classical compartmental models in epidemiology often assume a homogeneous\npopulation for simplicity, which neglects the inherent heterogeneity among\nindividuals. This assumption frequently leads to inaccurate predictions when\napplied to real-world data. For example, evidence has shown that classical\nmodels overestimate the final pandemic size in the H1N1-2009 and COVID-19\noutbreaks. To address this issue, we introduce individual mobility as a key\nfactor in disease transmission and control. We characterize disease dynamics\nusing mobility distribution functions for each compartment and propose a\nmobility-based compartmental model that incorporates population heterogeneity.\nOur results demonstrate that, for the same basic reproduction number, our\nmobility-based model predicts a smaller final pandemic size compared to the\nclassical models, effectively addressing the common overestimation problem.\nAdditionally, we infer mobility distributions from the time series of the\ninfected population. 
We provide sufficient conditions for uniquely identifying\nthe mobility distribution from a dataset and propose a machine-learning-based\napproach to learn mobility from both synthesized and real-world data.\n","authors":["Ning Jiang","Weiqi Chu","Yao Li"],"pdf_url":"https://arxiv.org/pdf/2406.12002v2.pdf","comment":"19 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.19796v3","updated":"2024-09-06T17:55:53Z","published":"2023-10-30T17:59:04Z","title":"Re-evaluating Retrosynthesis Algorithms with Syntheseus","summary":" Automated Synthesis Planning has recently re-emerged as a research area at\nthe intersection of chemistry and machine learning. Despite the appearance of\nsteady progress, we argue that imperfect benchmarks and inconsistent\ncomparisons mask systematic shortcomings of existing techniques, and\nunnecessarily hamper progress. To remedy this, we present a synthesis planning\nlibrary with an extensive benchmarking framework, called syntheseus, which\npromotes best practice by default, enabling consistent meaningful evaluation of\nsingle-step models and multi-step planning algorithms. We demonstrate the\ncapabilities of syntheseus by re-evaluating several previous retrosynthesis\nalgorithms, and find that the ranking of state-of-the-art models changes in\ncontrolled evaluation experiments. We end with guidance for future works in\nthis area, and call the community to engage in the discussion on how to improve\nbenchmarks for synthesis planning.\n","authors":["Krzysztof Maziarz","Austin Tripp","Guoqing Liu","Megan Stanley","Shufang Xie","Piotr Gaiński","Philipp Seidl","Marwin Segler"],"pdf_url":"https://arxiv.org/pdf/2310.19796v3.pdf","comment":"Accepted for publication in Faraday Discussions"},{"id":"http://arxiv.org/abs/2409.04434v1","updated":"2024-09-06T17:55:49Z","published":"2024-09-06T17:55:49Z","title":"Accelerating Training with Neuron Interaction and Nowcasting Networks","summary":" Neural network training can be accelerated when a learnable update rule is\nused in lieu of classic adaptive optimizers (e.g. Adam). However, learnable\nupdate rules can be costly and unstable to train and use. A simpler recently\nproposed approach to accelerate training is to use Adam for most of the\noptimization steps and periodically, only every few steps, nowcast (predict\nfuture) parameters. We improve this approach by Neuron interaction and\nNowcasting (NiNo) networks. NiNo leverages neuron connectivity and graph neural\nnetworks to more accurately nowcast parameters by learning in a supervised way\nfrom a set of training trajectories over multiple tasks. We show that in some\nnetworks, such as Transformers, neuron connectivity is non-trivial. By\naccurately modeling neuron connectivity, we allow NiNo to accelerate Adam\ntraining by up to 50\\% in vision and language tasks.\n","authors":["Boris Knyazev","Abhinav Moudgil","Guillaume Lajoie","Eugene Belilovsky","Simon Lacoste-Julien"],"pdf_url":"https://arxiv.org/pdf/2409.04434v1.pdf","comment":"code https://github.com/SamsungSAILMontreal/nino"},{"id":"http://arxiv.org/abs/2409.04431v1","updated":"2024-09-06T17:53:26Z","published":"2024-09-06T17:53:26Z","title":"Theory, Analysis, and Best Practices for Sigmoid Self-Attention","summary":" Attention is a key part of the transformer architecture. It is a\nsequence-to-sequence mapping that transforms each sequence element into a\nweighted sum of values. The weights are typically obtained as the softmax of\ndot products between keys and queries. 
Recent work has explored alternatives to\nsoftmax attention in transformers, such as ReLU and sigmoid activations. In\nthis work, we revisit sigmoid attention and conduct an in-depth theoretical and\nempirical analysis. Theoretically, we prove that transformers with sigmoid\nattention are universal function approximators and benefit from improved\nregularity compared to softmax attention. Through detailed empirical analysis,\nwe identify stabilization of large initial attention norms during the early\nstages of training as a crucial factor for the successful training of models\nwith sigmoid attention, outperforming prior attempts. We also introduce\nFLASHSIGMOID, a hardware-aware and memory-efficient implementation of sigmoid\nattention yielding a 17% inference kernel speed-up over FLASHATTENTION2 on H100\nGPUs. Experiments across language, vision, and speech show that properly\nnormalized sigmoid attention matches the strong performance of softmax\nattention on a wide range of domains and scales, which previous attempts at\nsigmoid attention were unable to fully achieve. Our work unifies prior art and\nestablishes best practices for sigmoid attention as a drop-in softmax\nreplacement in transformers.\n","authors":["Jason Ramapuram","Federico Danieli","Eeshan Dhekane","Floris Weers","Dan Busbridge","Pierre Ablin","Tatiana Likhomanenko","Jagrit Digani","Zijin Gu","Amitis Shidani","Russ Webb"],"pdf_url":"https://arxiv.org/pdf/2409.04431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04429v1","updated":"2024-09-06T17:49:56Z","published":"2024-09-06T17:49:56Z","title":"VILA-U: a Unified Foundation Model Integrating Visual Understanding and\n Generation","summary":" VILA-U is a Unified foundation model that integrates Video, Image, Language\nunderstanding and generation. Traditional visual language models (VLMs) use\nseparate modules for understanding and generating visual content, which can\nlead to misalignment and increased complexity. In contrast, VILA-U employs a\nsingle autoregressive next-token prediction framework for both tasks,\neliminating the need for additional components like diffusion models. This\napproach not only simplifies the model but also achieves near state-of-the-art\nperformance in visual language understanding and generation. The success of\nVILA-U is attributed to two main factors: the unified vision tower that aligns\ndiscrete visual tokens with textual inputs during pretraining, which enhances\nvisual perception, and autoregressive image generation can achieve similar\nquality as diffusion models with high-quality dataset. This allows VILA-U to\nperform comparably to more complex models using a fully token-based\nautoregressive framework.\n","authors":["Yecheng Wu","Zhuoyang Zhang","Junyu Chen","Haotian Tang","Dacheng Li","Yunhao Fang","Ligeng Zhu","Enze Xie","Hongxu Yin","Li Yi","Song Han","Yao Lu"],"pdf_url":"https://arxiv.org/pdf/2409.04429v1.pdf","comment":"11 pages, 7 figures, 8 tables"},{"id":"http://arxiv.org/abs/2409.04428v1","updated":"2024-09-06T17:48:44Z","published":"2024-09-06T17:48:44Z","title":"Hybrid Spiking Neural Networks for Low-Power Intra-Cortical\n Brain-Machine Interfaces","summary":" Intra-cortical brain-machine interfaces (iBMIs) have the potential to\ndramatically improve the lives of people with paraplegia by restoring their\nability to perform daily activities. However, current iBMIs suffer from\nscalability and mobility limitations due to bulky hardware and wiring. 
Wireless\niBMIs offer a solution but are constrained by a limited data rate. To overcome\nthis challenge, we are investigating hybrid spiking neural networks for\nembedded neural decoding in wireless iBMIs. The networks consist of a temporal\nconvolution-based compression followed by recurrent processing and a final\ninterpolation back to the original sequence length. As recurrent units, we\nexplore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons,\nand a combination of both - spiking GRUs (sGRUs) and analyze their differences\nin terms of accuracy, footprint, and activation sparsity. To that end, we train\ndecoders on the \"Nonhuman Primate Reaching with Multichannel Sensorimotor\nCortex Electrophysiology\" dataset and evaluate it using the NeuroBench\nframework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural\nDecoding. Our approach achieves high accuracy in predicting velocities of\nprimate reaching movements from multichannel primary motor cortex recordings\nwhile maintaining a low number of synaptic operations, surpassing the current\nbaseline models in the NeuroBench framework. This work highlights the potential\nof hybrid neural networks to facilitate wireless iBMIs with high decoding\nprecision and a substantial increase in the number of monitored neurons, paving\nthe way toward more advanced neuroprosthetic technologies.\n","authors":["Alexandru Vasilache","Jann Krausse","Klaus Knobloch","Juergen Becker"],"pdf_url":"https://arxiv.org/pdf/2409.04428v1.pdf","comment":"This work has been accepted at the 2024 IEEE Biomedical Circuits and\n Systems Conference"},{"id":"http://arxiv.org/abs/2409.04421v1","updated":"2024-09-06T17:30:45Z","published":"2024-09-06T17:30:45Z","title":"RLPF: Reinforcement Learning from Prediction Feedback for User\n Summarization with LLMs","summary":" LLM-powered personalization agent systems employ Large Language Models (LLMs)\nto predict users' behavior from their past activities. However, their\neffectiveness often hinges on the ability to effectively leverage extensive,\nlong user historical data due to its inherent noise and length of such data.\nExisting pretrained LLMs may generate summaries that are concise but lack the\nnecessary context for downstream tasks, hindering their utility in\npersonalization systems. To address these challenges, we introduce\nReinforcement Learning from Prediction Feedback (RLPF). RLPF fine-tunes LLMs to\ngenerate concise, human-readable user summaries that are optimized for\ndownstream task performance. By maximizing the usefulness of the generated\nsummaries, RLPF effectively distills extensive user history data while\npreserving essential information for downstream tasks. Our empirical evaluation\ndemonstrates significant improvements in both extrinsic downstream task utility\nand intrinsic summary quality, surpassing baseline methods by up to 22% on\ndownstream task performance and achieving an up to 84.59% win rate on\nFactuality, Abstractiveness, and Readability. RLPF also achieves a remarkable\n74% reduction in context length while improving performance on 16 out of 19\nunseen tasks and/or datasets, showcasing its generalizability. 
This approach\noffers a promising solution for enhancing LLM personalization by effectively\ntransforming long, noisy user histories into informative and human-readable\nrepresentations.\n","authors":["Jiaxing Wu","Lin Ning","Luyang Liu","Harrison Lee","Neo Wu","Chao Wang","Sushant Prakash","Shawn O'Banion","Bradley Green","Jun Xie"],"pdf_url":"https://arxiv.org/pdf/2409.04421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09532v2","updated":"2024-09-06T17:27:10Z","published":"2024-08-18T16:37:53Z","title":"Deep Limit Model-free Prediction in Regression","summary":" In this paper, we provide a novel Model-free approach based on Deep Neural\nNetwork (DNN) to accomplish point prediction and prediction interval under a\ngeneral regression setting. Usually, people rely on parametric or\nnon-parametric models to bridge dependent and independent variables (Y and X).\nHowever, this classical method relies heavily on the correct model\nspecification. Even for the non-parametric approach, some additive form is\noften assumed. A newly proposed Model-free prediction principle sheds light on\na prediction procedure without any model assumption. Previous work regarding\nthis principle has shown better performance than other standard alternatives.\nRecently, DNN, one of the machine learning methods, has received increasing\nattention due to its great performance in practice. Guided by the Model-free\nprediction idea, we attempt to apply a fully connected forward DNN to map X and\nsome appropriate reference random variable Z to Y. The targeted DNN is trained\nby minimizing a specially designed loss function so that the randomness of Y\nconditional on X is outsourced to Z through the trained DNN. Our method is more\nstable and accurate compared to other DNN-based counterparts, especially for\noptimal point predictions. With a specific prediction procedure, our prediction\ninterval can capture the estimation variability so that it can render a better\ncoverage rate for finite sample cases. The superior performance of our method\nis verified by simulation and empirical studies.\n","authors":["Kejin Wu","Dimitris N. Politis"],"pdf_url":"https://arxiv.org/pdf/2408.09532v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03223v6","updated":"2024-09-06T17:26:00Z","published":"2023-10-05T00:45:04Z","title":"TacoGFN: Target-conditioned GFlowNet for Structure-based Drug Design","summary":" Searching the vast chemical space for drug-like molecules that bind with a\nprotein pocket is a challenging task in drug discovery. Recently,\nstructure-based generative models have been introduced which promise to be more\nefficient by learning to generate molecules for any given protein structure.\nHowever, since they learn the distribution of a limited protein-ligand complex\ndataset, structure-based methods do not yet outperform optimization-based\nmethods that generate binding molecules for just one pocket. To overcome\nlimitations on data while leveraging learning across protein targets, we choose\nto model the reward distribution conditioned on pocket structure, instead of\nthe training data distribution. We design TacoGFN, a novel GFlowNet-based\napproach for structure-based drug design, which can generate molecules\nconditioned on any protein pocket structure with probabilities proportional to\nits affinity and property rewards. 
In the generative setting for\nCrossDocked2020 benchmark, TacoGFN attains a state-of-the-art success rate of\n$56.0\\%$ and $-8.44$ kcal/mol in median Vina Dock score while improving the\ngeneration time by multiple orders of magnitude. Fine-tuning TacoGFN further\nimproves the median Vina Dock score to $-10.93$ kcal/mol and the success rate\nto $88.8\\%$, outperforming all optimization-based methods.\n","authors":["Tony Shen","Seonghwan Seo","Grayson Lee","Mohit Pandey","Jason R Smith","Artem Cherkasov","Woo Youn Kim","Martin Ester"],"pdf_url":"https://arxiv.org/pdf/2310.03223v6.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR),\n 09/2024"},{"id":"http://arxiv.org/abs/2409.04411v1","updated":"2024-09-06T17:15:28Z","published":"2024-09-06T17:15:28Z","title":"Approximating Metric Magnitude of Point Sets","summary":" Metric magnitude is a measure of the \"size\" of point clouds with many\ndesirable geometric properties. It has been adapted to various mathematical\ncontexts and recent work suggests that it can enhance machine learning and\noptimization algorithms. But its usability is limited due to the computational\ncost when the dataset is large or when the computation must be carried out\nrepeatedly (e.g. in model training). In this paper, we study the magnitude\ncomputation problem, and show efficient ways of approximating it. We show that\nit can be cast as a convex optimization problem, but not as a submodular\noptimization. The paper describes two new algorithms - an iterative\napproximation algorithm that converges fast and is accurate, and a subset\nselection method that makes the computation even faster. It has been previously\nproposed that magnitude of model sequences generated during stochastic gradient\ndescent is correlated to generalization gap. Extension of this result using our\nmore scalable algorithms shows that longer sequences in fact bear higher\ncorrelations. We also describe new applications of magnitude in machine\nlearning - as an effective regularizer for neural network training, and as a\nnovel clustering criterion.\n","authors":["Rayna Andreeva","James Ward","Primoz Skraba","Jie Gao","Rik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2409.04411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04407v1","updated":"2024-09-06T17:10:28Z","published":"2024-09-06T17:10:28Z","title":"Exploiting the Data Gap: Utilizing Non-ignorable Missingness to\n Manipulate Model Learning","summary":" Missing data is commonly encountered in practice, and when the missingness is\nnon-ignorable, effective remediation depends on knowledge of the missingness\nmechanism. Learning the underlying missingness mechanism from the data is not\npossible in general, so adversaries can exploit this fact by maliciously\nengineering non-ignorable missingness mechanisms. Such Adversarial Missingness\n(AM) attacks have only recently been motivated and introduced, and then\nsuccessfully tailored to mislead causal structure learning algorithms into\nhiding specific cause-and-effect relationships. However, existing AM attacks\nassume the modeler (victim) uses full-information maximum likelihood methods to\nhandle the missing data, and are of limited applicability when the modeler uses\ndifferent remediation strategies. In this work we focus on associational\nlearning in the context of AM attacks. We consider (i) complete case analysis,\n(ii) mean imputation, and (iii) regression-based imputation as alternative\nstrategies used by the modeler. 
Instead of combinatorially searching for\nmissing entries, we propose a novel probabilistic approximation by deriving the\nasymptotic forms of these methods used for handling the missing entries. We\nthen formulate the learning of the adversarial missingness mechanism as a\nbi-level optimization problem. Experiments on generalized linear models show\nthat AM attacks can be used to change the p-values of features from significant\nto insignificant in real datasets, such as the California-housing dataset,\nwhile using relatively moderate amounts of missingness (<20%). Additionally, we\nassess the robustness of our attacks against defense strategies based on data\nvaluation.\n","authors":["Deniz Koyuncu","Alex Gittens","Bülent Yener","Moti Yung"],"pdf_url":"https://arxiv.org/pdf/2409.04407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04406v1","updated":"2024-09-06T16:56:06Z","published":"2024-09-06T16:56:06Z","title":"Quantum Kernel Methods under Scrutiny: A Benchmarking Study","summary":" Since the entry of kernel theory in the field of quantum machine learning,\nquantum kernel methods (QKMs) have gained increasing attention with regard to\nboth probing promising applications and delivering intriguing research\ninsights. Two common approaches for computing the underlying Gram matrix have\nemerged: fidelity quantum kernels (FQKs) and projected quantum kernels (PQKs).\nBenchmarking these methods is crucial to gain robust insights and to understand\ntheir practical utility. In this work, we present a comprehensive large-scale\nstudy examining QKMs based on FQKs and PQKs across a manifold of design\nchoices. Our investigation encompasses both classification and regression tasks\nfor five dataset families and 64 datasets, systematically comparing the use of\nFQKs and PQKs quantum support vector machines and kernel ridge regression. This\nresulted in over 20,000 models that were trained and optimized using a\nstate-of-the-art hyperparameter search to ensure robust and comprehensive\ninsights. We delve into the importance of hyperparameters on model performance\nscores and support our findings through rigorous correlation analyses. In this,\nwe also closely inspect two data encoding strategies. Moreover, we provide an\nin-depth analysis addressing the design freedom of PQKs and explore the\nunderlying principles responsible for learning. Our goal is not to identify the\nbest-performing model for a specific task but to uncover the mechanisms that\nlead to effective QKMs and reveal universal patterns.\n","authors":["Jan Schnabel","Marco Roth"],"pdf_url":"https://arxiv.org/pdf/2409.04406v1.pdf","comment":"19 pages main text including 12 figures, appendix 25 pages with 31\n figures"},{"id":"http://arxiv.org/abs/2409.03024v2","updated":"2024-09-06T16:55:26Z","published":"2024-09-04T18:31:24Z","title":"NUMOSIM: A Synthetic Mobility Dataset with Anomaly Detection Benchmarks","summary":" Collecting real-world mobility data is challenging. It is often fraught with\nprivacy concerns, logistical difficulties, and inherent biases. Moreover,\naccurately annotating anomalies in large-scale data is nearly impossible, as it\ndemands meticulous effort to distinguish subtle and complex patterns. These\nchallenges significantly impede progress in geospatial anomaly detection\nresearch by restricting access to reliable data and complicating the rigorous\nevaluation, comparison, and benchmarking of methodologies. 
To address these\nlimitations, we introduce a synthetic mobility dataset, NUMOSIM, that provides\na controlled, ethical, and diverse environment for benchmarking anomaly\ndetection techniques. NUMOSIM simulates a wide array of realistic mobility\nscenarios, encompassing both typical and anomalous behaviours, generated\nthrough advanced deep learning models trained on real mobility data. This\napproach allows NUMOSIM to accurately replicate the complexities of real-world\nmovement patterns while strategically injecting anomalies to challenge and\nevaluate detection algorithms based on how effectively they capture the\ninterplay between demographic, geospatial, and temporal factors. Our goal is to\nadvance geospatial mobility analysis by offering a realistic benchmark for\nimproving anomaly detection and mobility modeling techniques. To support this,\nwe provide open access to the NUMOSIM dataset, along with comprehensive\ndocumentation, evaluation metrics, and benchmark results.\n","authors":["Chris Stanford","Suman Adari","Xishun Liao","Yueshuai He","Qinhua Jiang","Chenchen Kuai","Jiaqi Ma","Emmanuel Tung","Yinlong Qian","Lingyi Zhao","Zihao Zhou","Zeeshan Rasheed","Khurram Shafique"],"pdf_url":"https://arxiv.org/pdf/2409.03024v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04374v1","updated":"2024-09-06T16:13:04Z","published":"2024-09-06T16:13:04Z","title":"Gaussian-Mixture-Model Q-Functions for Reinforcement Learning by\n Riemannian Optimization","summary":" This paper establishes a novel role for Gaussian-mixture models (GMMs) as\nfunctional approximators of Q-function losses in reinforcement learning (RL).\nUnlike the existing RL literature, where GMMs play their typical role as\nestimates of probability density functions, GMMs approximate here Q-function\nlosses. The new Q-function approximators, coined GMM-QFs, are incorporated in\nBellman residuals to promote a Riemannian-optimization task as a novel\npolicy-evaluation step in standard policy-iteration schemes. The paper\ndemonstrates how the hyperparameters (means and covariance matrices) of the\nGaussian kernels are learned from the data, opening thus the door of RL to the\npowerful toolbox of Riemannian optimization. Numerical tests show that with no\nuse of training data, the proposed design outperforms state-of-the-art methods,\neven deep Q-networks which use training data, on benchmark RL tasks.\n","authors":["Minh Vu","Konstantinos Slavakis"],"pdf_url":"https://arxiv.org/pdf/2409.04374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00077v2","updated":"2024-09-06T16:12:00Z","published":"2024-08-24T09:26:59Z","title":"Are LLM-based methods good enough for detecting unfair terms of service?","summary":" Countless terms of service (ToS) are being signed everyday by users all over\nthe world while interacting with all kinds of apps and websites. More often\nthan not, these online contracts spanning double-digit pages are signed blindly\nby users who simply want immediate access to the desired service. What would\nnormally require a consultation with a legal team, has now become a mundane\nactivity consisting of a few clicks where users potentially sign away their\nrights, for instance in terms of their data privacy, to countless online\nentities/companies. Large language models (LLMs) are good at parsing long\ntext-based documents, and could potentially be adopted to help users when\ndealing with dubious clauses in ToS and their underlying privacy policies. 
To\ninvestigate the utility of existing models for this task, we first build a\ndataset consisting of 12 questions applied individually to a set of privacy\npolicies crawled from popular websites. Thereafter, a series of open-source as\nwell as commercial chatbots such as ChatGPT, are queried over each question,\nwith the answers being compared to a given ground truth. Our results show that\nsome open-source models are able to provide a higher accuracy compared to some\ncommercial models. However, the best performance is recorded from a commercial\nchatbot (ChatGPT4). Overall, all models perform only slightly better than\nrandom at this task. Consequently, their performance needs to be significantly\nimproved before they can be adopted at large for this purpose.\n","authors":["Mirgita Frasheri","Arian Bakhtiarnia","Lukas Esterle","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2409.00077v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04373v1","updated":"2024-09-06T16:08:27Z","published":"2024-09-06T16:08:27Z","title":"Evaluating Fairness in Transaction Fraud Models: Fairness Metrics, Bias\n Audits, and Challenges","summary":" Ensuring fairness in transaction fraud detection models is vital due to the\npotential harms and legal implications of biased decision-making. Despite\nextensive research on algorithmic fairness, there is a notable gap in the study\nof bias in fraud detection models, mainly due to the field's unique challenges.\nThese challenges include the need for fairness metrics that account for fraud\ndata's imbalanced nature and the tradeoff between fraud protection and service\nquality. To address this gap, we present a comprehensive fairness evaluation of\ntransaction fraud models using public synthetic datasets, marking the first\nalgorithmic bias audit in this domain. Our findings reveal three critical\ninsights: (1) Certain fairness metrics expose significant bias only after\nnormalization, highlighting the impact of class imbalance. (2) Bias is\nsignificant in both service quality-related parity metrics and fraud\nprotection-related parity metrics. (3) The fairness through unawareness\napproach, which involved removing sensitive attributes such as gender, does not\nimprove bias mitigation within these datasets, likely due to the presence of\ncorrelated proxies. We also discuss socio-technical fairness-related challenges\nin transaction fraud models. These insights underscore the need for a nuanced\napproach to fairness in fraud detection, balancing protection and service\nquality, and moving beyond simple bias mitigation strategies. Future work must\nfocus on refining fairness metrics and developing methods tailored to the\nunique complexities of the transaction fraud domain.\n","authors":["Parameswaran Kamalaruban","Yulu Pi","Stuart Burrell","Eleanor Drage","Piotr Skalski","Jason Wong","David Sutton"],"pdf_url":"https://arxiv.org/pdf/2409.04373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04367v1","updated":"2024-09-06T15:58:20Z","published":"2024-09-06T15:58:20Z","title":"Provable Hyperparameter Tuning for Structured Pfaffian Settings","summary":" Data-driven algorithm design automatically adapts algorithms to specific\napplication domains, achieving better performance. In the context of\nparameterized algorithms, this approach involves tuning the algorithm\nparameters using problem instances drawn from the problem distribution of the\ntarget application domain. 
While empirical evidence supports the effectiveness\nof data-driven algorithm design, providing theoretical guarantees for several\nparameterized families remains challenging. This is due to the intricate\nbehaviors of their corresponding utility functions, which typically admit\npiece-wise and discontinuity structures. In this work, we present refined\nframeworks for providing learning guarantees for parameterized data-driven\nalgorithm design problems in both distributional and online learning settings.\nFor the distributional learning setting, we introduce the Pfaffian GJ\nframework, an extension of the classical GJ framework, capable of providing\nlearning guarantees for function classes for which the computation involves\nPfaffian functions. Unlike the GJ framework, which is limited to function\nclasses with computation characterized by rational functions, our proposed\nframework can deal with function classes involving Pfaffian functions, which\nare much more general and widely applicable. We then show that for many\nparameterized algorithms of interest, their utility function possesses a\nrefined piece-wise structure, which automatically translates to learning\nguarantees using our proposed framework. For the online learning setting, we\nprovide a new tool for verifying dispersion property of a sequence of loss\nfunctions. This sufficient condition allows no-regret learning for sequences of\npiece-wise structured loss functions where the piece-wise structure involves\nPfaffian transition boundaries.\n","authors":["Maria-Florina Balcan","Anh Tuan Nguyen","Dravyansh Sharma"],"pdf_url":"https://arxiv.org/pdf/2409.04367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04365v1","updated":"2024-09-06T15:57:25Z","published":"2024-09-06T15:57:25Z","title":"Leveraging Machine Learning for Official Statistics: A Statistical\n Manifesto","summary":" It is important for official statistics production to apply ML with\nstatistical rigor, as it presents both opportunities and challenges. Although\nmachine learning has enjoyed rapid technological advances in recent years, its\napplication does not possess the methodological robustness necessary to produce\nhigh quality statistical results. In order to account for all sources of error\nin machine learning models, the Total Machine Learning Error (TMLE) is\npresented as a framework analogous to the Total Survey Error Model used in\nsurvey methodology. As a means of ensuring that ML models are both internally\nvalid as well as externally valid, the TMLE model addresses issues such as\nrepresentativeness and measurement errors. There are several case studies\npresented, illustrating the importance of applying more rigor to the\napplication of machine learning in official statistics.\n","authors":["Marco Puts","David Salgado","Piet Daas"],"pdf_url":"https://arxiv.org/pdf/2409.04365v1.pdf","comment":"29 pages, 4 figures, 1 table. 
To appear in the proceedings of the\n conference on Foundations and Advances of Machine Learning in Official\n Statistics, which was held in Wiesbaden, from 3rd to 5th April, 2024"},{"id":"http://arxiv.org/abs/2409.04352v1","updated":"2024-09-06T15:34:17Z","published":"2024-09-06T15:34:17Z","title":"A naive aggregation algorithm for improving generalization in a class of\n learning problems","summary":" In this brief paper, we present a naive aggregation algorithm for a typical\nlearning problem with expert advice setting, in which the task of improving\ngeneralization, i.e., model validation, is embedded in the learning process as\na sequential decision-making problem. In particular, we consider a class of\nlearning problem of point estimations for modeling high-dimensional nonlinear\nfunctions, where a group of experts update their parameter estimates using the\ndiscrete-time version of gradient systems, with small additive noise term,\nguided by the corresponding subsample datasets obtained from the original\ndataset. Here, our main objective is to provide conditions under which such an\nalgorithm will sequentially determine a set of mixing distribution strategies\nused for aggregating the experts' estimates that ultimately leading to an\noptimal parameter estimate, i.e., as a consensus solution for all experts,\nwhich is better than any individual expert's estimate in terms of improved\ngeneralization or learning performances. Finally, as part of this work, we\npresent some numerical results for a typical case of nonlinear regression\nproblem.\n","authors":["Getachew K Befekadu"],"pdf_url":"https://arxiv.org/pdf/2409.04352v1.pdf","comment":"Brief paper, with 7 pages, 1 figure"},{"id":"http://arxiv.org/abs/2406.10214v2","updated":"2024-09-06T15:28:03Z","published":"2024-06-14T17:49:29Z","title":"Universal randomised signatures for generative time series modelling","summary":" Randomised signature has been proposed as a flexible and easily implementable\nalternative to the well-established path signature. In this article, we employ\nrandomised signature to introduce a generative model for financial time series\ndata in the spirit of reservoir computing. Specifically, we propose a novel\nWasserstein-type distance based on discrete-time randomised signatures. This\nmetric on the space of probability measures captures the distance between\n(conditional) distributions. Its use is justified by our novel universal\napproximation results for randomised signatures on the space of continuous\nfunctions taking the underlying path as an input. We then use our metric as the\nloss function in a non-adversarial generator model for synthetic time series\ndata based on a reservoir neural stochastic differential equation. We compare\nthe results of our model to benchmarks from the existing literature.\n","authors":["Francesca Biagini","Lukas Gonon","Niklas Walter"],"pdf_url":"https://arxiv.org/pdf/2406.10214v2.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2409.04340v1","updated":"2024-09-06T15:18:12Z","published":"2024-09-06T15:18:12Z","title":"AGR: Age Group fairness Reward for Bias Mitigation in LLMs","summary":" LLMs can exhibit age biases, resulting in unequal treatment of individuals\nacross age groups. While much research has addressed racial and gender biases,\nage bias remains little explored. The scarcity of instruction-tuning and\npreference datasets for age bias hampers its detection and measurement, and\nexisting fine-tuning methods seldom address age-related fairness. 
In this\npaper, we construct age bias preference datasets and instruction-tuning\ndatasets for RLHF. We introduce ARG, an age fairness reward to reduce\ndifferences in the response quality of LLMs across different age groups.\nExtensive experiments demonstrate that this reward significantly improves\nresponse accuracy and reduces performance disparities across age groups. Our\nsource code and datasets are available at the anonymous\n\\href{https://anonymous.4open.science/r/FairRLHF-D445/readme.md}{link}.\n","authors":["Shuirong Cao","Ruoxi Cheng","Zhiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04340v1.pdf","comment":"The first two authors contributed equally to this work. Corresponding\n to Zhiqiang Wang. ACKNOWLEDGMENT: we would like to thank the computing\n resources support from the State Key Laboratory of New Computer Software\n Technologies at Nanjing University"},{"id":"http://arxiv.org/abs/2409.04335v1","updated":"2024-09-06T15:11:49Z","published":"2024-09-06T15:11:49Z","title":"A high-accuracy multi-model mixing retrosynthetic method","summary":" The field of computer-aided synthesis planning (CASP) has seen rapid\nadvancements in recent years, achieving significant progress across various\nalgorithmic benchmarks. However, chemists often encounter numerous infeasible\nreactions when using CASP in practice. This article delves into common errors\nassociated with CASP and introduces a product prediction model aimed at\nenhancing the accuracy of single-step models. While the product prediction\nmodel reduces the number of single-step reactions, it integrates multiple\nsingle-step models to maintain the overall reaction count and increase reaction\ndiversity. Based on manual analysis and large-scale testing, the product\nprediction model, combined with the multi-model ensemble approach, has been\nproven to offer higher feasibility and greater diversity.\n","authors":["Shang Xiang","Lin Yao","Zhen Wang","Qifan Yu","Wentan Liu","Wentao Guo","Guolin Ke"],"pdf_url":"https://arxiv.org/pdf/2409.04335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04332v1","updated":"2024-09-06T15:09:04Z","published":"2024-09-06T15:09:04Z","title":"Amortized Bayesian Workflow (Extended Abstract)","summary":" Bayesian inference often faces a trade-off between computational speed and\nsampling accuracy. We propose an adaptive workflow that integrates rapid\namortized inference with gold-standard MCMC techniques to achieve both speed\nand accuracy when performing inference on many observed datasets. Our approach\nuses principled diagnostics to guide the choice of inference method for each\ndataset, moving along the Pareto front from fast amortized sampling to slower\nbut guaranteed-accurate MCMC when necessary. By reusing computations across\nsteps, our workflow creates synergies between amortized and MCMC-based\ninference. We demonstrate the effectiveness of this integrated approach on a\ngeneralized extreme value task with 1000 observed data sets, showing 90x time\nefficiency gains while maintaining high posterior quality.\n","authors":["Marvin Schmitt","Chengkun Li","Aki Vehtari","Luigi Acerbi","Paul-Christian Bürkner","Stefan T. 
Radev"],"pdf_url":"https://arxiv.org/pdf/2409.04332v1.pdf","comment":"Extended Abstract"},{"id":"http://arxiv.org/abs/2401.08281v2","updated":"2024-09-06T15:08:03Z","published":"2024-01-16T11:12:36Z","title":"The Faiss library","summary":" Vector databases typically manage large collections of embedding vectors.\nCurrently, AI applications are growing rapidly, and so is the number of\nembeddings that need to be stored and indexed. The Faiss library is dedicated\nto vector similarity search, a core functionality of vector databases. Faiss is\na toolkit of indexing methods and related primitives used to search, cluster,\ncompress and transform vectors. This paper describes the trade-off space of\nvector search and the design principles of Faiss in terms of structure,\napproach to optimization and interfacing. We benchmark key features of the\nlibrary and discuss a few selected applications to highlight its broad\napplicability.\n","authors":["Matthijs Douze","Alexandr Guzhva","Chengqi Deng","Jeff Johnson","Gergely Szilvasy","Pierre-Emmanuel Mazaré","Maria Lomeli","Lucas Hosseini","Hervé Jégou"],"pdf_url":"https://arxiv.org/pdf/2401.08281v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04328v1","updated":"2024-09-06T15:03:42Z","published":"2024-09-06T15:03:42Z","title":"Active learning for regression in engineering populations: A\n risk-informed approach","summary":" Regression is a fundamental prediction task common in data-centric\nengineering applications that involves learning mappings between continuous\nvariables. In many engineering applications (e.g.\\ structural health\nmonitoring), feature-label pairs used to learn such mappings are of limited\navailability which hinders the effectiveness of traditional supervised machine\nlearning approaches. The current paper proposes a methodology for overcoming\nthe issue of data scarcity by combining active learning with hierarchical\nBayesian modelling.\n Active learning is an approach for preferentially acquiring feature-label\npairs in a resource-efficient manner. In particular, the current work adopts a\nrisk-informed approach that leverages contextual information associated with\nregression-based engineering decision-making tasks (e.g.\\ inspection and\nmaintenance). Hierarchical Bayesian modelling allow multiple related regression\ntasks to be learned over a population, capturing local and global effects. The\ninformation sharing facilitated by this modelling approach means that\ninformation acquired for one engineering system can improve predictive\nperformance across the population.\n The proposed methodology is demonstrated using an experimental case study.\nSpecifically, multiple regressions are performed over a population of machining\ntools, where the quantity of interest is the surface roughness of the\nworkpieces. An inspection and maintenance decision process is defined using\nthese regression tasks which is in turn used to construct the active-learning\nalgorithm. The novel methodology proposed is benchmarked against an uninformed\napproach to label acquisition and independent modelling of the regression\ntasks. It is shown that the proposed approach has superior performance in terms\nof expected cost -- maintaining predictive performance while reducing the\nnumber of inspections required.\n","authors":["Daniel R. Clarkson","Lawrence A. Bull","Chandula T. Wickramarachchi","Elizabeth J. Cross","Timothy J. Rogers","Keith Worden","Nikolaos Dervilis","Aidan J. 
Hughes"],"pdf_url":"https://arxiv.org/pdf/2409.04328v1.pdf","comment":"19 pages, 12 figures, 3 tables, submitted to Data-Centric Engineering"},{"id":"http://arxiv.org/abs/2409.04320v1","updated":"2024-09-06T14:49:43Z","published":"2024-09-06T14:49:43Z","title":"Faster Sampling from Log-Concave Densities over Polytopes via Efficient\n Linear Solvers","summary":" We consider the problem of sampling from a log-concave distribution\n$\\pi(\\theta) \\propto e^{-f(\\theta)}$ constrained to a polytope $K:=\\{\\theta \\in\n\\mathbb{R}^d: A\\theta \\leq b\\}$, where $A\\in \\mathbb{R}^{m\\times d}$ and $b \\in\n\\mathbb{R}^m$.The fastest-known algorithm \\cite{mangoubi2022faster} for the\nsetting when $f$ is $O(1)$-Lipschitz or $O(1)$-smooth runs in roughly $O(md\n\\times md^{\\omega -1})$ arithmetic operations, where the $md^{\\omega -1}$ term\narises because each Markov chain step requires computing a matrix inversion and\ndeterminant (here $\\omega \\approx 2.37$ is the matrix multiplication constant).\nWe present a nearly-optimal implementation of this Markov chain with per-step\ncomplexity which is roughly the number of non-zero entries of $A$ while the\nnumber of Markov chain steps remains the same. The key technical ingredients\nare 1) to show that the matrices that arise in this Dikin walk change slowly,\n2) to deploy efficient linear solvers that can leverage this slow change to\nspeed up matrix inversion by using information computed in previous steps, and\n3) to speed up the computation of the determinantal term in the Metropolis\nfilter step via a randomized Taylor series-based estimator.\n","authors":["Oren Mangoubi","Nisheeth K. Vishnoi"],"pdf_url":"https://arxiv.org/pdf/2409.04320v1.pdf","comment":"The conference version of this paper appears in ICLR 2024"},{"id":"http://arxiv.org/abs/2409.04313v1","updated":"2024-09-06T14:38:47Z","published":"2024-09-06T14:38:47Z","title":"Enhancing Uncertainty Quantification in Drug Discovery with Censored\n Regression Labels","summary":" In the early stages of drug discovery, decisions regarding which experiments\nto pursue can be influenced by computational models. These decisions are\ncritical due to the time-consuming and expensive nature of the experiments.\nTherefore, it is becoming essential to accurately quantify the uncertainty in\nmachine learning predictions, such that resources can be used optimally and\ntrust in the models improves. While computational methods for drug discovery\noften suffer from limited data and sparse experimental observations, additional\ninformation can exist in the form of censored labels that provide thresholds\nrather than precise values of observations. However, the standard approaches\nthat quantify uncertainty in machine learning cannot fully utilize censored\nlabels. In this work, we adapt ensemble-based, Bayesian, and Gaussian models\nwith tools to learn from censored labels by using the Tobit model from survival\nanalysis. 
Our results demonstrate that despite the partial information\navailable in censored labels, they are essential to accurately and reliably\nmodel the real pharmaceutical setting.\n","authors":["Emma Svensson","Hannah Rosa Friesacher","Susanne Winiwarter","Lewis Mervin","Adam Arany","Ola Engkvist"],"pdf_url":"https://arxiv.org/pdf/2409.04313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04264v4","updated":"2024-09-06T14:35:22Z","published":"2023-10-06T14:11:21Z","title":"Deep learning modelling of manufacturing and build variations on\n multi-stage axial compressors aerodynamics","summary":" Applications of deep learning to physical simulations such as Computational\nFluid Dynamics have recently experienced a surge in interest, and their\nviability has been demonstrated in different domains. However, due to the\nhighly complex, turbulent and three-dimensional flows, they have not yet been\nproven usable for turbomachinery applications. Multi-stage axial compressors\nfor gas turbine applications represent a remarkably challenging case, due to\nthe high-dimensionality of the regression of the flow-field from geometrical\nand operational variables. This paper demonstrates the development and\napplication of a deep learning framework for predictions of the flow field and\naerodynamic performance of multi-stage axial compressors. A physics-based\ndimensionality reduction unlocks the potential for flow-field predictions, as\nit re-formulates the regression problem from an un-structured to a structured\none, as well as reducing the number of degrees of freedom. Compared to\ntraditional \"black-box\" surrogate models, it provides explainability to the\npredictions of overall performance by identifying the corresponding aerodynamic\ndrivers. This is applied to model the effect of manufacturing and build\nvariations, as the associated performance scatter is known to have a\nsignificant impact on $CO_2$ emissions, therefore posing a challenge of great\nindustrial and environmental relevance. The proposed architecture is proven to\nachieve an accuracy comparable to that of the CFD benchmark, in real-time, for\nan industrially relevant application. The deployed model, is readily integrated\nwithin the manufacturing and build process of gas turbines, thus providing the\nopportunity to analytically assess the impact on performance with actionable\nand explainable data.\n","authors":["Giuseppe Bruni","Sepehr Maleki","Senthil K. Krishnababu"],"pdf_url":"https://arxiv.org/pdf/2310.04264v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04301v1","updated":"2024-09-06T14:20:38Z","published":"2024-09-06T14:20:38Z","title":"A Unified Approach to Inferring Chemical Compounds with the Desired\n Aqueous Solubility","summary":" Aqueous solubility (AS) is a key physiochemical property that plays a crucial\nrole in drug discovery and material design. We report a novel unified approach\nto predict and infer chemical compounds with the desired AS based on simple\ndeterministic graph-theoretic descriptors, multiple linear regression (MLR) and\nmixed integer linear programming (MILP). Selected descriptors based on a\nforward stepwise procedure enabled the simplest regression model, MLR, to\nachieve significantly good prediction accuracy compared to the existing\napproaches, achieving the accuracy in the range [0.7191, 0.9377] for 29 diverse\ndatasets. 
By simulating these descriptors and learning models as MILPs, we\ninferred mathematically exact and optimal compounds with the desired AS,\nprescribed structures, and up to 50 non-hydrogen atoms in a reasonable time\nrange [6, 1204] seconds. These findings indicate a strong correlation between\nthe simple graph-theoretic descriptors and the AS of compounds, potentially\nleading to a deeper understanding of their AS without relying on widely used\ncomplicated chemical descriptors and complex machine learning models that are\ncomputationally expensive, and therefore difficult to use for inference. An\nimplementation of the proposed approach is available at\nhttps://github.com/ku-dml/mol-infer/tree/master/AqSol.\n","authors":["Muniba Batool","Naveed Ahmed Azam","Jianshen Zhu","Kazuya Haraguchi","Liang Zhao","Tatsuya Akutsu"],"pdf_url":"https://arxiv.org/pdf/2409.04301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03151v2","updated":"2024-09-06T14:04:43Z","published":"2024-09-05T00:58:07Z","title":"Standing on the shoulders of giants","summary":" Although fundamental to the advancement of Machine Learning, the classic\nevaluation metrics extracted from the confusion matrix, such as precision and\nF1, are limited. Such metrics only offer a quantitative view of the models'\nperformance, without considering the complexity of the data or the quality of\nthe hit. To overcome these limitations, recent research has introduced the use\nof psychometric metrics such as Item Response Theory (IRT), which allows an\nassessment at the level of latent characteristics of instances. This work\ninvestigates how IRT concepts can enrich a confusion matrix in order to\nidentify which model is the most appropriate among options with similar\nperformance. In the study carried out, IRT does not replace, but complements\nclassical metrics by offering a new layer of evaluation and observation of the\nfine behavior of models in specific instances. It was also observed that there\nis 97% confidence that the score from the IRT has different contributions from\n66% of the classical metrics analyzed.\n","authors":["Lucas Felipe Ferraro Cardoso","José de Sousa Ribeiro Filho","Vitor Cirilo Araujo Santos","Regiane Silva Kawasaki Frances","Ronnie Cley de Oliveira Alves"],"pdf_url":"https://arxiv.org/pdf/2409.03151v2.pdf","comment":"15 pages, 8 figures, 3 tables, submitted for the BRACIS'24 conference"},{"id":"http://arxiv.org/abs/2409.04290v1","updated":"2024-09-06T13:59:58Z","published":"2024-09-06T13:59:58Z","title":"CoxKAN: Kolmogorov-Arnold Networks for Interpretable, High-Performance\n Survival Analysis","summary":" Survival analysis is a branch of statistics used for modeling the time until\na specific event occurs and is widely used in medicine, engineering, finance,\nand many other fields. When choosing survival models, there is typically a\ntrade-off between performance and interpretability, where the highest\nperformance is achieved by black-box models based on deep learning. This is a\nmajor problem in fields such as medicine where practitioners are reluctant to\nblindly trust black-box models to make important patient decisions.\nKolmogorov-Arnold Networks (KANs) were recently proposed as an interpretable\nand accurate alternative to multi-layer perceptrons (MLPs). We introduce\nCoxKAN, a Cox proportional hazards Kolmogorov-Arnold Network for interpretable,\nhigh-performance survival analysis. We evaluate the proposed CoxKAN on 4\nsynthetic datasets and 9 real medical datasets. 
The synthetic experiments\ndemonstrate that CoxKAN accurately recovers interpretable symbolic formulae for\nthe hazard function, and effectively performs automatic feature selection.\nEvaluation on the 9 real datasets show that CoxKAN consistently outperforms the\nCox proportional hazards model and achieves performance that is superior or\ncomparable to that of tuned MLPs. Furthermore, we find that CoxKAN identifies\ncomplex interactions between predictor variables that would be extremely\ndifficult to recognise using existing survival methods, and automatically finds\nsymbolic formulae which uncover the precise effect of important biomarkers on\npatient risk.\n","authors":["William Knottenbelt","Zeyu Gao","Rebecca Wray","Woody Zhidong Zhang","Jiashuai Liu","Mireia Crispin-Ortuzar"],"pdf_url":"https://arxiv.org/pdf/2409.04290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14557v2","updated":"2024-09-06T13:49:48Z","published":"2024-01-25T22:54:39Z","title":"Extension of Recurrent Kernels to different Reservoir Computing\n topologies","summary":" Reservoir Computing (RC) has become popular in recent years due to its fast\nand efficient computational capabilities. Standard RC has been shown to be\nequivalent in the asymptotic limit to Recurrent Kernels, which helps in\nanalyzing its expressive power. However, many well-established RC paradigms,\nsuch as Leaky RC, Sparse RC, and Deep RC, are yet to be analyzed in such a way.\nThis study aims to fill this gap by providing an empirical analysis of the\nequivalence of specific RC architectures with their corresponding Recurrent\nKernel formulation. We conduct a convergence study by varying the activation\nfunction implemented in each architecture. Our study also sheds light on the\nrole of sparse connections in RC architectures and propose an optimal sparsity\nlevel that depends on the reservoir size. Furthermore, our systematic analysis\nshows that in Deep RC models, convergence is better achieved with successive\nreservoirs of decreasing sizes.\n","authors":["Giuseppe Alessio D'Inverno","Jonathan Dong"],"pdf_url":"https://arxiv.org/pdf/2401.14557v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2409.04275v1","updated":"2024-09-06T13:37:08Z","published":"2024-09-06T13:37:08Z","title":"AttentionX: Exploiting Consensus Discrepancy In Attention from A\n Distributed Optimization Perspective","summary":" In this paper, we extend the standard Attention in transformer by exploiting\nthe consensus discrepancy from a distributed optimization perspective, referred\nto as AttentionX. It is noted that %the popular distributed optimization\nalgorithm \\cite{Boyd11ADMM} and the primal-dual method of multipliers (PDMM)\n\\cite{Zhang16PDMM} is designed to iteratively solve a broad class of\ndistributed optimization problems over a pear-to-pear (P2P) network, where\nneighbouring nodes gradually reach consensus as specified by predefined linear\nedge-constraints in the optimization process. In particular, at each iteration\nof PDMM, each node in a network first performs information-gathering from\nneighbours and then performs local information-fusion. From a high-level point\nof view, the $KQ$-softmax-based weighted summation of $V$-representations in\nAttention corresponds information-gathering from neighbours while the\nfeature-processing via the feed-forward network (FFN) in transformer\ncorresponds to local information fusion. 
PDMM exploits the Lagrangian\nmultipliers to capture the historical consensus discrepancy in the form of\nresidual errors of the linear edge-constraints, which plays a crucial role for\nthe algorithm to converge. Inspired by PDMM, we propose AttentionX to\nincorporate the consensus discrepancy in the output update-expression of the\nstandard Attention. The consensus discrepancy in AttentionX refers to the\ndifference between the weighted summation of $V$-representations and scaled\n$V$-representions themselves. Experiments on ViT and nanoGPT show promising\nperformance.\n","authors":["Guoqiang Zhang","Richard Heusdens"],"pdf_url":"https://arxiv.org/pdf/2409.04275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11041v2","updated":"2024-09-06T13:33:34Z","published":"2024-07-06T15:03:40Z","title":"Integer-only Quantized Transformers for Embedded FPGA-based Time-series\n Forecasting in AIoT","summary":" This paper presents the design of a hardware accelerator for Transformers,\noptimized for on-device time-series forecasting in AIoT systems. It integrates\ninteger-only quantization and Quantization-Aware Training with optimized\nhardware designs to realize 6-bit and 4-bit quantized Transformer models, which\nachieved precision comparable to 8-bit quantized models from related research.\nUtilizing a complete implementation on an embedded FPGA (Xilinx Spartan-7\nXC7S15), we examine the feasibility of deploying Transformer models on embedded\nIoT devices. This includes a thorough analysis of achievable precision,\nresource utilization, timing, power, and energy consumption for on-device\ninference. Our results indicate that while sufficient performance can be\nattained, the optimization process is not trivial. For instance, reducing the\nquantization bitwidth does not consistently result in decreased latency or\nenergy consumption, underscoring the necessity of systematically exploring\nvarious optimization combinations. Compared to an 8-bit quantized Transformer\nmodel in related studies, our 4-bit quantized Transformer model increases test\nloss by only 0.63%, operates up to 132.33x faster, and consumes 48.19x less\nenergy.\n","authors":["Tianheng Ling","Chao Qian","Gregor Schiele"],"pdf_url":"https://arxiv.org/pdf/2407.11041v2.pdf","comment":"7 pages, 3 figures, 4 tables. The paper was accepted by 2024 IEEE\n Annual Congress on Artificial Intelligence of Things (IEEE AIoT) and got best\n paper award"},{"id":"http://arxiv.org/abs/2310.06585v2","updated":"2024-09-06T13:29:32Z","published":"2023-10-10T12:52:42Z","title":"A Black-Box Physics-Informed Estimator based on Gaussian Process\n Regression for Robot Inverse Dynamics Identification","summary":" Learning the inverse dynamics of robots directly from data, adopting a\nblack-box approach, is interesting for several real-world scenarios where\nlimited knowledge about the system is available. In this paper, we propose a\nblack-box model based on Gaussian Process (GP) Regression for the\nidentification of the inverse dynamics of robotic manipulators. The proposed\nmodel relies on a novel multidimensional kernel, called \\textit{Lagrangian\nInspired Polynomial} (\\kernelInitials{}) kernel. The \\kernelInitials{} kernel\nis based on two main ideas. First, instead of directly modeling the inverse\ndynamics components, we model as GPs the kinetic and potential energy of the\nsystem. 
The GP prior on the inverse dynamics components is derived from those\non the energies by applying the properties of GPs under linear operators.\nSecond, as regards the energy prior definition, we prove a polynomial structure\nof the kinetic and potential energy, and we derive a polynomial kernel that\nencodes this property. As a consequence, the proposed model allows also to\nestimate the kinetic and potential energy without requiring any label on these\nquantities. Results on simulation and on two real robotic manipulators, namely\na 7 DOF Franka Emika Panda, and a 6 DOF MELFA RV4FL, show that the proposed\nmodel outperforms state-of-the-art black-box estimators based both on Gaussian\nProcesses and Neural Networks in terms of accuracy, generality and data\nefficiency. The experiments on the MELFA robot also demonstrate that our\napproach achieves performance comparable to fine-tuned model-based estimators,\ndespite requiring less prior information.\n","authors":["Giulio Giacomuzzos","Ruggero Carli","Diego Romeres","Alberto Dalla Libera"],"pdf_url":"https://arxiv.org/pdf/2310.06585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15638v2","updated":"2024-09-06T13:21:12Z","published":"2023-09-27T13:14:57Z","title":"RSF-Conv: Rotation-and-Scale Equivariant Fourier Parameterized\n Convolution for Retinal Vessel Segmentation","summary":" Retinal vessel segmentation is of great clinical significance for the\ndiagnosis of many eye-related diseases, but it is still a formidable challenge\ndue to the intricate vascular morphology. With the skillful characterization of\nthe translation symmetry existing in retinal vessels, convolutional neural\nnetworks (CNNs) have achieved great success in retinal vessel segmentation.\nHowever, the rotation-and-scale symmetry, as a more widespread image prior in\nretinal vessels, fails to be characterized by CNNs. Therefore, we propose a\nrotation-and-scale equivariant Fourier parameterized convolution (RSF-Conv)\nspecifically for retinal vessel segmentation, and provide the corresponding\nequivariance analysis. As a general module, RSF-Conv can be integrated into\nexisting networks in a plug-and-play manner while significantly reducing the\nnumber of parameters. For instance, we replace the traditional convolution\nfilters in U-Net and Iter-Net with RSF-Convs, and faithfully conduct\ncomprehensive experiments. RSF-Conv+U-Net and RSF-Conv+Iter-Net not only have\nslight advantages under in-domain evaluation, but more importantly, outperform\nall comparison methods by a significant margin under out-of-domain evaluation.\nIt indicates the remarkable generalization of RSF-Conv, which holds greater\npractical clinical significance for the prevalent cross-device and\ncross-hospital challenges in clinical practice. To comprehensively demonstrate\nthe effectiveness of RSF-Conv, we also apply RSF-Conv+U-Net and\nRSF-Conv+Iter-Net to retinal artery/vein classification and achieve promising\nperformance as well, indicating its clinical application potential.\n","authors":["Zihong Sun","Hong Wang","Qi Xie","Yefeng Zheng","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2309.15638v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04249v1","updated":"2024-09-06T12:55:49Z","published":"2024-09-06T12:55:49Z","title":"Hermes: Memory-Efficient Pipeline Inference for Large Models on Edge\n Devices","summary":" The application of Transformer-based large models has achieved numerous\nsuccess in recent years. 
However, the exponential growth in the parameters of\nlarge models introduces formidable memory challenge for edge deployment. Prior\nworks to address this challenge mainly focus on optimizing the model structure\nand adopting memory swapping methods. However, the former reduces the inference\naccuracy, and the latter raises the inference latency. This paper introduces\nPIPELOAD, a novel memory-efficient pipeline execution mechanism. It reduces\nmemory usage by incorporating dynamic memory management and minimizes inference\nlatency by employing parallel model loading. Based on PIPELOAD mechanism, we\npresent Hermes, a framework optimized for large model inference on edge\ndevices. We evaluate Hermes on Transformer-based models of different sizes. Our\nexperiments illustrate that Hermes achieves up to 4.24 X increase in inference\nspeed and 86.7% lower memory consumption than the state-of-the-art pipeline\nmechanism for BERT and ViT models, 2.58 X increase in inference speed and 90.3%\nlower memory consumption for GPT-style models.\n","authors":["Xueyuan Han","Zinuo Cai","Yichu Zhang","Chongxin Fan","Junhan Liu","Ruhui Ma","Rajkumar Buyya"],"pdf_url":"https://arxiv.org/pdf/2409.04249v1.pdf","comment":"Accepted by the 42nd IEEE International Conference on Computer Design\n (ICCD 2024)"},{"id":"http://arxiv.org/abs/2409.04244v1","updated":"2024-09-06T12:51:10Z","published":"2024-09-06T12:51:10Z","title":"WarpAdam: A new Adam optimizer based on Meta-Learning approach","summary":" Optimal selection of optimization algorithms is crucial for training deep\nlearning models. The Adam optimizer has gained significant attention due to its\nefficiency and wide applicability. However, to enhance the adaptability of\noptimizers across diverse datasets, we propose an innovative optimization\nstrategy by integrating the 'warped gradient descend'concept from Meta Learning\ninto the Adam optimizer. In the conventional Adam optimizer, gradients are\nutilized to compute estimates of gradient mean and variance, subsequently\nupdating model parameters. Our approach introduces a learnable distortion\nmatrix, denoted as P, which is employed for linearly transforming gradients.\nThis transformation slightly adjusts gradients during each iteration, enabling\nthe optimizer to better adapt to distinct dataset characteristics. By learning\nan appropriate distortion matrix P, our method aims to adaptively adjust\ngradient information across different data distributions, thereby enhancing\noptimization performance. Our research showcases the potential of this novel\napproach through theoretical insights and empirical evaluations. Experimental\nresults across various tasks and datasets validate the superiority of our\noptimizer that integrates the 'warped gradient descend' concept in terms of\nadaptability. Furthermore, we explore effective strategies for training the\nadaptation matrix P and identify scenarios where this method can yield optimal\nresults. In summary, this study introduces an innovative approach that merges\nthe 'warped gradient descend' concept from Meta Learning with the Adam\noptimizer. 
By introducing a learnable distortion matrix P within the optimizer,\nwe aim to enhance the model's generalization capability across diverse data\ndistributions, thus opening up new possibilities in the field of deep learning\noptimization.\n","authors":["Chengxi Pan","Junshang Chen","Jingrui Ye"],"pdf_url":"https://arxiv.org/pdf/2409.04244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04242v1","updated":"2024-09-06T12:47:15Z","published":"2024-09-06T12:47:15Z","title":"Unmasking Covert Intrusions: Detection of Fault-Masking Cyberattacks on\n Differential Protection Systems","summary":" Line Current Differential Relays (LCDRs) are high-speed relays progressively\nused to protect critical transmission lines. However, LCDRs are vulnerable to\ncyberattacks. Fault-Masking Attacks (FMAs) are stealthy cyberattacks performed\nby manipulating the remote measurements of the targeted LCDR to disguise faults\non the protected line. Hence, they remain undetected by this LCDR. In this\npaper, we propose a two-module framework to detect FMAs. The first module is a\nMismatch Index (MI) developed from the protected transmission line's equivalent\nphysical model. The MI is triggered only if there is a significant mismatch in\nthe LCDR's local and remote measurements while the LCDR itself is untriggered,\nwhich indicates an FMA. After the MI is triggered, the second module, a neural\nnetwork-based classifier, promptly confirms that the triggering event is a\nphysical fault that lies on the line protected by the LCDR before declaring the\noccurrence of an FMA. The proposed framework is tested using the IEEE 39-bus\nbenchmark system. Our simulation results confirm that the proposed framework\ncan accurately detect FMAs on LCDRs and is not affected by normal system\ndisturbances, variations, or measurement noise. Our experimental results using\nOPAL-RT's real-time simulator confirm the proposed solution's real-time\nperformance capability.\n","authors":["Ahmad Mohammad Saber","Amr Youssef","Davor Svetinovic","Hatem Zeineldin","Ehab F. El-Saadany"],"pdf_url":"https://arxiv.org/pdf/2409.04242v1.pdf","comment":"Accepted to IEEE Transactions on Systems, Man, and Cybernetics:\n Systems. \\c{opyright} 2024 IEEE"},{"id":"http://arxiv.org/abs/2409.04241v1","updated":"2024-09-06T12:46:43Z","published":"2024-09-06T12:46:43Z","title":"Calibration of Network Confidence for Unsupervised Domain Adaptation\n Using Estimated Accuracy","summary":" This study addresses the problem of calibrating network confidence while\nadapting a model that was originally trained on a source domain to a target\ndomain using unlabeled samples from the target domain. The absence of labels\nfrom the target domain makes it impossible to directly calibrate the adapted\nnetwork on the target domain. To tackle this challenge, we introduce a\ncalibration procedure that relies on estimating the network's accuracy on the\ntarget domain. The network accuracy is first computed on the labeled source\ndata and then is modified to represent the actual accuracy of the model on the\ntarget domain. The proposed algorithm calibrates the prediction confidence\ndirectly in the target domain by minimizing the disparity between the estimated\naccuracy and the computed confidence. 
The experimental results show that our\nmethod significantly outperforms existing methods, which rely on importance\nweighting, across several standard datasets.\n","authors":["Coby Penso","Jacob Goldberger"],"pdf_url":"https://arxiv.org/pdf/2409.04241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10943v3","updated":"2024-09-06T12:30:03Z","published":"2023-06-19T14:03:27Z","title":"Probabilistic Matching of Real and Generated Data Statistics in\n Generative Adversarial Networks","summary":" Generative adversarial networks constitute a powerful approach to generative\nmodeling. While generated samples often are indistinguishable from real data,\nthere is no guarantee that they will follow the true data distribution. For\nscientific applications in particular, it is essential that the true\ndistribution is well captured by the generated distribution. In this work, we\npropose a method to ensure that the distributions of certain generated data\nstatistics coincide with the respective distributions of the real data. In\norder to achieve this, we add a new loss term to the generator loss function,\nwhich quantifies the difference between these distributions via suitable\nf-divergences. Kernel density estimation is employed to obtain representations\nof the true distributions, and to estimate the corresponding generated\ndistributions from minibatch values at each iteration. When compared to other\nmethods, our approach has the advantage that the complete shapes of the\ndistributions are taken into account. We evaluate the method on a synthetic\ndataset and a real-world dataset and demonstrate improved performance of our\napproach.\n","authors":["Philipp Pilar","Niklas Wahlström"],"pdf_url":"https://arxiv.org/pdf/2306.10943v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04224v1","updated":"2024-09-06T12:26:47Z","published":"2024-09-06T12:26:47Z","title":"Advancing Multi-Organ Disease Care: A Hierarchical Multi-Agent\n Reinforcement Learning Framework","summary":" Multi-organ diseases present significant challenges due to their simultaneous\nimpact on multiple organ systems, necessitating complex and adaptive treatment\nstrategies. Despite recent advancements in AI-powered healthcare decision\nsupport systems, existing solutions are limited to individual organ systems.\nThey often ignore the intricate dependencies between organ systems and thereby\nfail to provide holistic treatment recommendations that are useful in\npractice. We propose a novel hierarchical multi-agent reinforcement learning\n(HMARL) framework to address these challenges. This framework uses dedicated\nagents for each organ system and models dynamics through explicit inter-agent\ncommunication channels, enabling coordinated treatment strategies across\norgans. Furthermore, we introduce a dual-layer state representation technique\nto contextualize patient conditions at various hierarchical levels, enhancing\nthe treatment accuracy and relevance. Through extensive qualitative and\nquantitative evaluations in managing sepsis (a complex multi-organ disease),\nour approach demonstrates its ability to learn effective treatment policies\nthat significantly improve patient survival rates. This framework marks a\nsubstantial advancement in clinical decision support systems, pioneering a\ncomprehensive approach for multi-organ treatment recommendations.\n","authors":["Daniel J. 
Tan","Qianyi Xu","Kay Choong See","Dilruk Perera","Mengling Feng"],"pdf_url":"https://arxiv.org/pdf/2409.04224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09048v2","updated":"2024-09-06T12:13:40Z","published":"2023-12-14T15:47:26Z","title":"On The Expressivity of Recurrent Neural Cascades","summary":" Recurrent Neural Cascades (RNCs) are the recurrent neural networks with no\ncyclic dependencies among recurrent neurons. This class of recurrent networks\nhas received a lot of attention in practice. Besides training methods for a\nfixed architecture such as backpropagation, the cascade architecture naturally\nallows for constructive learning methods, where recurrent nodes are added\nincrementally one at a time, often yielding smaller networks. Furthermore,\nacyclicity amounts to a structural prior that even for the same number of\nneurons yields a more favourable sample complexity compared to a\nfully-connected architecture. A central question is whether the advantages of\nthe cascade architecture come at the cost of a reduced expressivity. We provide\nnew insights into this question. We show that the regular languages captured by\nRNCs with sign and tanh activation with positive recurrent weights are the\nstar-free regular languages. In order to establish our results we developed a\nnovel framework where capabilities of RNCs are accessed by analysing which\nsemigroups and groups a single neuron is able to implement. A notable\nimplication of our framework is that RNCs can achieve the expressivity of all\nregular languages by introducing neurons that can implement groups.\n","authors":["Nadezda Alexandrovna Knorozova","Alessandro Ronca"],"pdf_url":"https://arxiv.org/pdf/2312.09048v2.pdf","comment":"Full version with appendix of a paper with the same title that\n appears in the proceedings of AAAI 2024"},{"id":"http://arxiv.org/abs/2304.06841v3","updated":"2024-09-06T12:09:03Z","published":"2023-04-13T22:20:54Z","title":"Video alignment using unsupervised learning of local and global features","summary":" In this paper, we tackle the problem of video alignment, the process of\nmatching the frames of a pair of videos containing similar actions. The main\nchallenge in video alignment is that accurate correspondence should be\nestablished despite the differences in the execution processes and appearances\nbetween the two videos. We introduce an unsupervised method for alignment that\nuses global and local features of the frames. In particular, we introduce\neffective features for each video frame by means of three machine vision tools:\nperson detection, pose estimation, and VGG network. Then the features are\nprocessed and combined to construct a multidimensional time series that\nrepresent the video. The resulting time series are used to align videos of the\nsame actions using a novel version of dynamic time warping named Diagonalized\nDynamic Time Warping(DDTW). The main advantage of our approach is that no\ntraining is required, which makes it applicable for any new type of action\nwithout any need to collect training samples for it. Additionally, our approach\ncan be used for framewise labeling of action phases in a dataset with only a\nfew labeled videos. For evaluation, we considered video synchronization and\nphase classification tasks on the Penn action and subset of UCF101 datasets.\nAlso, for an effective evaluation of the video synchronization task, we present\na new metric called Enclosed Area Error(EAE). 
The results show that our method\noutperforms previous state-of-the-art methods, such as TCC, and other\nself-supervised and weakly supervised methods.\n","authors":["Niloufar Fakhfour","Mohammad ShahverdiKondori","Sajjad Hashembeiki","Mohammadjavad Norouzi","Hoda Mohammadzade"],"pdf_url":"https://arxiv.org/pdf/2304.06841v3.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.03250v3","updated":"2024-09-06T11:56:16Z","published":"2024-07-03T16:29:47Z","title":"When big data actually are low-rank, or entrywise approximation of\n certain function-generated matrices","summary":" The article concerns low-rank approximation of matrices generated by sampling\na smooth function of two $m$-dimensional variables. We refute an argument made\nin the literature to prove that, for a specific class of analytic functions,\nsuch matrices admit accurate entrywise approximation of rank that is\nindependent of $m$ -- a claim known as \"big-data matrices are approximately\nlow-rank\". We provide a theoretical explanation of the numerical results\npresented in support of this claim, describing three narrower classes of\nfunctions for which $n \\times n$ function-generated matrices can be\napproximated within an entrywise error of order $\\varepsilon$ with rank\n$\\mathcal{O}(\\log(n) \\varepsilon^{-2} \\mathrm{polylog}(\\varepsilon^{-1}))$ that\nis independent of the dimension $m$: (i) functions of the inner product of the\ntwo variables, (ii) functions of the Euclidean distance between the variables,\nand (iii) shift-invariant positive-definite kernels. We extend our argument to\ntensor-train approximation of tensors generated with functions of the\nmulti-linear product of their $m$-dimensional variables. We discuss our results\nin the context of low-rank approximation of (a) growing datasets and (b)\nattention in transformer neural networks.\n","authors":["Stanislav Budzinskiy"],"pdf_url":"https://arxiv.org/pdf/2407.03250v3.pdf","comment":"Extended Sections 1 and 2"},{"id":"http://arxiv.org/abs/2409.04206v1","updated":"2024-09-06T11:53:37Z","published":"2024-09-06T11:53:37Z","title":"Fast Forwarding Low-Rank Training","summary":" Parameter efficient finetuning methods like low-rank adaptation (LoRA) aim to\nreduce the computational costs of finetuning pretrained Language Models (LMs).\nEnabled by these low-rank settings, we propose an even more efficient\noptimization strategy: Fast Forward, a simple and effective approach to\naccelerate large segments of training. In a Fast Forward stage, we repeat the\nmost recent optimizer step until the loss stops improving on a tiny validation\nset. By alternating between regular optimization steps and Fast Forward stages,\nFast Forward provides up to an 87\\% reduction in FLOPs and up to an 81\\%\nreduction in train time over standard SGD with Adam. We validate Fast Forward\nby finetuning various models on different tasks and demonstrate that it speeds\nup training without compromising model performance. Additionally, we analyze\nwhen and how to apply Fast Forward.\n","authors":["Adir Rahamim","Naomi Saphra","Sara Kangaslahti","Yonatan Belinkov"],"pdf_url":"https://arxiv.org/pdf/2409.04206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16115v3","updated":"2024-09-06T11:50:36Z","published":"2024-08-28T19:59:58Z","title":"Uncertainty Modeling in Graph Neural Networks via Stochastic\n Differential Equations","summary":" We address the problem of learning uncertainty-aware representations for\ngraph-structured data. 
While Graph Neural Ordinary Differential Equations\n(GNODE) are effective in learning node representations, they fail to quantify\nuncertainty. To address this, we introduce Latent Graph Neural Stochastic\nDifferential Equations (LGNSDE), which enhance GNODE by embedding randomness\nthrough Brownian motion to quantify uncertainty. We provide theoretical\nguarantees for LGNSDE and empirically show better performance in uncertainty\nquantification.\n","authors":["Richard Bergna","Sergio Calvo-Ordoñez","Felix L. Opolka","Pietro Liò","Jose Miguel Hernandez-Lobato"],"pdf_url":"https://arxiv.org/pdf/2408.16115v3.pdf","comment":"9 pages including appendix"},{"id":"http://arxiv.org/abs/2409.04194v1","updated":"2024-09-06T11:24:25Z","published":"2024-09-06T11:24:25Z","title":"Towards Privacy-Preserving Relational Data Synthesis via Probabilistic\n Relational Models","summary":" Probabilistic relational models provide a well-established formalism to\ncombine first-order logic and probabilistic models, thereby allowing to\nrepresent relationships between objects in a relational domain. At the same\ntime, the field of artificial intelligence requires increasingly large amounts\nof relational training data for various machine learning tasks. Collecting\nreal-world data, however, is often challenging due to privacy concerns, data\nprotection regulations, high costs, and so on. To mitigate these challenges,\nthe generation of synthetic data is a promising approach. In this paper, we\nsolve the problem of generating synthetic relational data via probabilistic\nrelational models. In particular, we propose a fully-fledged pipeline to go\nfrom relational database to probabilistic relational model, which can then be\nused to sample new synthetic relational data points from its underlying\nprobability distribution. As part of our proposed pipeline, we introduce a\nlearning algorithm to construct a probabilistic relational model from a given\nrelational database.\n","authors":["Malte Luttermann","Ralf Möller","Mattis Hartwig"],"pdf_url":"https://arxiv.org/pdf/2409.04194v1.pdf","comment":"Accepted to the Proceedings of the 47th German Conference on\n Artificial Intelligence (KI 2024)"},{"id":"http://arxiv.org/abs/2409.04188v1","updated":"2024-09-06T11:05:26Z","published":"2024-09-06T11:05:26Z","title":"Reassessing the Validity of Spurious Correlations Benchmarks","summary":" Neural networks can fail when the data contains spurious correlations. To\nunderstand this phenomenon, researchers have proposed numerous spurious\ncorrelations benchmarks upon which to evaluate mitigation methods. However, we\nobserve that these benchmarks exhibit substantial disagreement, with the best\nmethods on one benchmark performing poorly on another. We explore this\ndisagreement, and examine benchmark validity by defining three desiderata that\na benchmark should satisfy in order to meaningfully evaluate methods. Our\nresults have implications for both benchmarks and mitigations: we find that\ncertain benchmarks are not meaningful measures of method performance, and that\nseveral methods are not sufficiently robust for widespread use. We present a\nsimple recipe for practitioners to choose methods using the most similar\nbenchmark to their given problem.\n","authors":["Samuel J. 
Bell","Diane Bouchacourt","Levent Sagun"],"pdf_url":"https://arxiv.org/pdf/2409.04188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04185v1","updated":"2024-09-06T11:01:55Z","published":"2024-09-06T11:01:55Z","title":"Residual Stream Analysis with Multi-Layer SAEs","summary":" Sparse autoencoders (SAEs) are a promising approach to interpreting the\ninternal representations of transformer language models. However, standard SAEs\nare trained separately on each transformer layer, making it difficult to use\nthem to study how information flows across layers. To solve this problem, we\nintroduce the multi-layer SAE (MLSAE): a single SAE trained on the residual\nstream activation vectors from every transformer layer simultaneously. The\nresidual stream is usually understood as preserving information across layers,\nso we expected to, and did, find individual SAE features that are active at\nmultiple layers. Interestingly, while a single SAE feature is active at\ndifferent layers for different prompts, for a single prompt, we find that a\nsingle feature is far more likely to be active at a single layer. For larger\nunderlying models, we find that the cosine similarities between adjacent layers\nin the residual stream are higher, so we expect more features to be active at\nmultiple layers. These results show that MLSAEs are a promising method to study\ninformation flow in transformers. We release our code to train and analyze\nMLSAEs at https://github.com/tim-lawson/mlsae.\n","authors":["Tim Lawson","Lucy Farnik","Conor Houghton","Laurence Aitchison"],"pdf_url":"https://arxiv.org/pdf/2409.04185v1.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.04180v1","updated":"2024-09-06T10:45:58Z","published":"2024-09-06T10:45:58Z","title":"The Prevalence of Neural Collapse in Neural Multivariate Regression","summary":" Recently it has been observed that neural networks exhibit Neural Collapse\n(NC) during the final stage of training for the classification problem. We\nempirically show that multivariate regression, as employed in imitation\nlearning and other applications, exhibits Neural Regression Collapse (NRC), a\nnew form of neural collapse: (NRC1) The last-layer feature vectors collapse to\nthe subspace spanned by the $n$ principal components of the feature vectors,\nwhere $n$ is the dimension of the targets (for univariate regression, $n=1$);\n(NRC2) The last-layer feature vectors also collapse to the subspace spanned by\nthe last-layer weight vectors; (NRC3) The Gram matrix for the weight vectors\nconverges to a specific functional form that depends on the covariance matrix\nof the targets. After empirically establishing the prevalence of (NRC1)-(NRC3)\nfor a variety of datasets and network architectures, we provide an explanation\nof these phenomena by modeling the regression task in the context of the\nUnconstrained Feature Model (UFM), in which the last layer feature vectors are\ntreated as free variables when minimizing the loss function. We show that when\nthe regularization parameters in the UFM model are strictly positive, then\n(NRC1)-(NRC3) also emerge as solutions in the UFM optimization problem. We also\nshow that if the regularization parameters are equal to zero, then there is no\ncollapse. To our knowledge, this is the first empirical and theoretical study\nof neural collapse in the context of regression. 
This extension is significant\nnot only because it broadens the applicability of neural collapse to a new\ncategory of problems but also because it suggests that the phenomena of neural\ncollapse could be a universal behavior in deep learning.\n","authors":["George Andriopoulos","Zixuan Dong","Li Guo","Zifan Zhao","Keith Ross"],"pdf_url":"https://arxiv.org/pdf/2409.04180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04175v1","updated":"2024-09-06T10:34:06Z","published":"2024-09-06T10:34:06Z","title":"CISCA and CytoDArk0: a Cell Instance Segmentation and Classification\n method for histo(patho)logical image Analyses and a new, open, Nissl-stained\n dataset for brain cytoarchitecture studies","summary":" Delineating and classifying individual cells in microscopy tissue images is a\ncomplex task, yet it is a pivotal endeavor in various medical and biological\ninvestigations. We propose a new deep learning framework (CISCA) for automatic\ncell instance segmentation and classification in histological slices to support\ndetailed morphological and structural analysis or straightforward cell counting\nin digital pathology workflows and brain cytoarchitecture studies. At the core\nof CISCA lies a network architecture featuring a lightweight U-Net with three\nheads in the decoder. The first head classifies pixels into boundaries between\nneighboring cells, cell bodies, and background, while the second head regresses\nfour distance maps along four directions. The network outputs from the first\nand second heads are integrated through a tailored post-processing step, which\nultimately yields the segmentation of individual cells. A third head enables\nsimultaneous classification of cells into relevant classes, if required. We\nshowcase the effectiveness of our method using four datasets, including CoNIC,\nPanNuke, and MoNuSeg, which are publicly available H\\&E datasets. Additionally,\nwe introduce CytoDArk0, a novel dataset consisting of Nissl-stained images of\nthe cortex, cerebellum, and hippocampus from mammals belonging to the orders\nCetartiodactyla and Primates. We evaluate CISCA in comparison to other\nstate-of-the-art methods, demonstrating CISCA's robustness and accuracy in\nsegmenting and classifying cells across diverse tissue types, magnifications,\nand staining techniques.\n","authors":["Valentina Vadori","Jean-Marie Graïc","Antonella Peruffo","Giulia Vadori","Livio Finos","Enrico Grisan"],"pdf_url":"https://arxiv.org/pdf/2409.04175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04174v1","updated":"2024-09-06T10:33:42Z","published":"2024-09-06T10:33:42Z","title":"Towards Measuring Sell Side Outcomes in Buy Side Marketplace Experiments\n using In-Experiment Bipartite Graph","summary":" In this study, we evaluate causal inference estimators for online controlled\nbipartite graph experiments in a real marketplace setting. Our novel\ncontribution is constructing a bipartite graph using in-experiment data, rather\nthan relying on prior knowledge or historical data, the common approach in the\nliterature published to date. We build the bipartite graph from various\ninteractions between buyers and sellers in the marketplace, establishing a\nnovel research direction at the intersection of bipartite experiments and\nmediation analysis. 
This approach is crucial for modern marketplaces aiming to\nevaluate seller-side causal effects in buyer-side experiments, or vice versa.\nWe demonstrate our method using historical buyer-side experiments conducted at\nVinted, the largest second-hand marketplace in Europe with over 80M users.\n","authors":["Vaiva Pilkauskaitė","Jevgenij Gamper","Rasa Giniūnaitė","Agne Reklaitė"],"pdf_url":"https://arxiv.org/pdf/2409.04174v1.pdf","comment":"5 pages, 3 figures, this work was presented at the KDD 2024\n Conference Undergraduate Consortium"},{"id":"http://arxiv.org/abs/2409.04164v1","updated":"2024-09-06T10:03:49Z","published":"2024-09-06T10:03:49Z","title":"Can OpenSource beat ChatGPT? -- A Comparative Study of Large Language\n Models for Text-to-Code Generation","summary":" In recent years, large language models (LLMs) have emerged as powerful tools\nwith potential applications in various fields, including software engineering.\nWithin the scope of this research, we evaluate five different state-of-the-art\nLLMs - Bard, BingChat, ChatGPT, Llama2, and Code Llama - concerning their\ncapabilities for text-to-code generation. In an empirical study, we feed\nprompts with textual descriptions of coding problems sourced from the\nprogramming website LeetCode to the models with the task of creating solutions\nin Python. Subsequently, the quality of the generated outputs is assessed using\nthe testing functionalities of LeetCode. The results indicate large differences\nin performance between the investigated models. ChatGPT can handle these\ntypical programming challenges by far the most effectively, surpassing even\ncode-specialized models like Code Llama. To gain further insights, we measure\nthe runtime as well as the memory usage of the generated outputs and compared\nthem to the other code submissions on Leetcode. A detailed error analysis,\nencompassing a comparison of the differences concerning correct indentation and\nform of the generated code as well as an assignment of the incorrectly solved\ntasks to certain error categories allows us to obtain a more nuanced picture of\nthe results and potential for improvement. The results also show a clear\npattern of increasingly incorrect produced code when the models are facing a\nlot of context in the form of longer prompts.\n","authors":["Luis Mayer","Christian Heumann","Matthias Aßenmacher"],"pdf_url":"https://arxiv.org/pdf/2409.04164v1.pdf","comment":"Conference Paper accepted at the 9th SwissText Conference (2024)"},{"id":"http://arxiv.org/abs/2310.00505v2","updated":"2024-09-06T09:58:55Z","published":"2023-09-30T22:02:51Z","title":"Unveiling the Unborn: Advancing Fetal Health Classification through\n Machine Learning","summary":" Fetal health classification is a critical task in obstetrics, enabling early\nidentification and management of potential health problems. However, it remains\nchallenging due to data complexity and limited labeled samples. This research\npaper presents a novel machine-learning approach for fetal health\nclassification, leveraging a LightGBM classifier trained on a comprehensive\ndataset. The proposed model achieves an impressive accuracy of 98.31% on a test\nset. Our findings demonstrate the potential of machine learning in enhancing\nfetal health classification, offering a more objective and accurate assessment.\nNotably, our approach combines various features, such as fetal heart rate,\nuterine contractions, and maternal blood pressure, to provide a comprehensive\nevaluation. 
This methodology holds promise for improving early detection and\ntreatment of fetal health issues, ensuring better outcomes for both mothers and\nbabies. Beyond the high accuracy achieved, the novelty of our approach lies in\nits comprehensive feature selection and assessment methodology. By\nincorporating multiple data points, our model offers a more holistic and\nreliable evaluation compared to traditional methods. This research has\nsignificant implications in the field of obstetrics, paving the way for\nadvancements in early detection and intervention of fetal health concerns.\nFuture work involves validating the model on a larger dataset and developing a\nclinical application. Ultimately, we anticipate that our research will\nrevolutionize the assessment and management of fetal health, contributing to\nimproved healthcare outcomes for expectant mothers and their babies.\n","authors":["Sujith K Mandala"],"pdf_url":"https://arxiv.org/pdf/2310.00505v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04159v1","updated":"2024-09-06T09:43:09Z","published":"2024-09-06T09:43:09Z","title":"CUQ-GNN: Committee-based Graph Uncertainty Quantification using\n Posterior Networks","summary":" In this work, we study the influence of domain-specific characteristics when\ndefining a meaningful notion of predictive uncertainty on graph data.\nPreviously, the so-called Graph Posterior Network (GPN) model has been proposed\nto quantify uncertainty in node classification tasks. Given a graph, it uses\nNormalizing Flows (NFs) to estimate class densities for each node independently\nand converts those densities into Dirichlet pseudo-counts, which are then\ndispersed through the graph using the personalized Page-Rank algorithm. The\narchitecture of GPNs is motivated by a set of three axioms on the properties of\nits uncertainty estimates. We show that those axioms are not always satisfied\nin practice and therefore propose the family of Committe-based Uncertainty\nQuantification Graph Neural Networks (CUQ-GNNs), which combine standard Graph\nNeural Networks with the NF-based uncertainty estimation of Posterior Networks\n(PostNets). This approach adapts more flexibly to domain-specific demands on\nthe properties of uncertainty estimates. We compare CUQ-GNN against GPN and\nother uncertainty quantification approaches on common node classification\nbenchmarks and show that it is effective at producing useful uncertainty\nestimates.\n","authors":["Clemens Damke","Eyke Hüllermeier"],"pdf_url":"https://arxiv.org/pdf/2409.04159v1.pdf","comment":"17 pages, 4 figures, 1 table. Accepted at ECML PKDD 2024. arXiv admin\n note: substantial text overlap with arXiv:2406.04041"},{"id":"http://arxiv.org/abs/2409.04143v1","updated":"2024-09-06T09:17:41Z","published":"2024-09-06T09:17:41Z","title":"An efficient hp-Variational PINNs framework for incompressible\n Navier-Stokes equations","summary":" Physics-informed neural networks (PINNs) are able to solve partial\ndifferential equations (PDEs) by incorporating the residuals of the PDEs into\ntheir loss functions. Variational Physics-Informed Neural Networks (VPINNs) and\nhp-VPINNs use the variational form of the PDE residuals in their loss function.\nAlthough hp-VPINNs have shown promise over traditional PINNs, they suffer from\nhigher training times and lack a framework capable of handling complex\ngeometries, which limits their application to more complex PDEs. 
As such,\nhp-VPINNs have not been applied in solving the Navier-Stokes equations, amongst\nother problems in CFD, thus far. FastVPINNs was introduced to address these\nchallenges by incorporating tensor-based loss computations, significantly\nimproving the training efficiency. Moreover, by using the bilinear\ntransformation, the FastVPINNs framework was able to solve PDEs on complex\ngeometries. In the present work, we extend the FastVPINNs framework to\nvector-valued problems, with a particular focus on solving the incompressible\nNavier-Stokes equations for two-dimensional forward and inverse problems,\nincluding problems such as the lid-driven cavity flow, the Kovasznay flow, and\nflow past a backward-facing step for Reynolds numbers up to 200. Our results\ndemonstrate a 2x improvement in training time while maintaining the same order\nof accuracy compared to PINNs algorithms documented in the literature. We\nfurther showcase the framework's efficiency in solving inverse problems for the\nincompressible Navier-Stokes equations by accurately identifying the Reynolds\nnumber of the underlying flow. Additionally, the framework's ability to handle\ncomplex geometries highlights its potential for broader applications in\ncomputational fluid dynamics. This implementation opens new avenues for\nresearch on hp-VPINNs, potentially extending their applicability to more\ncomplex problems.\n","authors":["Thivin Anandh","Divij Ghose","Ankit Tyagi","Abhineet Gupta","Suranjan Sarkar","Sashikumaar Ganesan"],"pdf_url":"https://arxiv.org/pdf/2409.04143v1.pdf","comment":"18 pages, 13 tables and 20 figures"},{"id":"http://arxiv.org/abs/2409.04140v1","updated":"2024-09-06T09:11:15Z","published":"2024-09-06T09:11:15Z","title":"Half-VAE: An Encoder-Free VAE to Bypass Explicit Inverse Mapping","summary":" Inference and inverse problems are closely related concepts, both\nfundamentally involving the deduction of unknown causes or parameters from\nobserved data. Bayesian inference, a powerful class of methods, is often\nemployed to solve a variety of problems, including those related to causal\ninference. Variational inference, a subset of Bayesian inference, is primarily\nused to efficiently approximate complex posterior distributions. Variational\nAutoencoders (VAEs), which combine variational inference with deep learning,\nhave become widely applied across various domains. This study explores the\npotential of VAEs for solving inverse problems, such as Independent Component\nAnalysis (ICA), without relying on an explicit inverse mapping process. Unlike\nother VAE-based ICA methods, this approach discards the encoder in the VAE\narchitecture, directly setting the latent variables as trainable parameters. In\nother words, the latent variables are no longer outputs of the encoder but are\ninstead optimized directly through the objective function to converge to\nappropriate values. We find that, with a suitable prior setup, the latent\nvariables, represented by trainable parameters, can exhibit mutually\nindependent properties as the parameters converge, all without the need for an\nencoding process. This approach, referred to as the Half-VAE, bypasses the\ninverse mapping process by eliminating the encoder. 
This study demonstrates the\nfeasibility of using the Half-VAE to solve ICA without the need for an explicit\ninverse mapping process.\n","authors":["Yuan-Hao Wei","Yan-Jie Sun","Chen Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.04140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.02019v2","updated":"2024-09-06T09:06:25Z","published":"2021-11-03T04:47:37Z","title":"Scalable mixed-domain Gaussian process modeling and model reduction for\n longitudinal data","summary":" Gaussian process (GP) models that combine both categorical and continuous\ninput variables have found use in longitudinal data analysis and computer\nexperiments. However, standard inference for these models has the typical cubic\nscaling, and common scalable approximation schemes for GPs cannot be applied\nsince the covariance function is non-continuous. In this work, we derive a\nbasis function approximation scheme for mixed-domain covariance functions,\nwhich scales linearly with respect to the number of observations and total\nnumber of basis functions. The proposed approach is naturally applicable also\nto Bayesian GP regression with discrete observation models. We demonstrate\nthe scalability of the approach and compare model reduction techniques for\nadditive GP models in a longitudinal data context. We confirm that we can\napproximate the exact GP model accurately in a fraction of the runtime compared\nto fitting the corresponding exact model. In addition, we demonstrate a\nscalable model reduction workflow for obtaining smaller and more interpretable\nmodels when dealing with a large number of candidate predictors.\n","authors":["Juho Timonen","Harri Lähdesmäki"],"pdf_url":"https://arxiv.org/pdf/2111.02019v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16565v3","updated":"2024-09-06T09:05:31Z","published":"2024-02-26T13:43:25Z","title":"Partial Rankings of Optimizers","summary":" We introduce a framework for benchmarking optimizers according to multiple\ncriteria over various test functions. Based on a recently introduced union-free\ngeneric depth function for partial orders/rankings, it fully exploits the\nordinal information and allows for incomparability. Our method describes the\ndistribution of all partial orders/rankings, avoiding the notorious\nshortcomings of aggregation. This makes it possible to identify test functions that\nproduce central or outlying rankings of optimizers and to assess the quality of\nbenchmarking suites.\n","authors":["Julian Rodemann","Hannah Blocher"],"pdf_url":"https://arxiv.org/pdf/2402.16565v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02850v2","updated":"2024-09-06T08:30:57Z","published":"2024-09-04T16:20:57Z","title":"Oops, I Sampled it Again: Reinterpreting Confidence Intervals in\n Few-Shot Learning","summary":" The predominant method for computing confidence intervals (CI) in few-shot\nlearning (FSL) is based on sampling the tasks with replacement, i.e.\\ allowing\nthe same samples to appear in multiple tasks. This makes the CI misleading in\nthat it takes into account the randomness of the sampler but not the data\nitself. To quantify the extent of this problem, we conduct a comparative\nanalysis between CIs computed with and without replacement. These reveal a\nnotable underestimation by the predominant method. This observation calls for a\nreevaluation of how we interpret confidence intervals and the resulting\nconclusions in FSL comparative studies. Our research demonstrates that the use\nof paired tests can partially address this issue. 
Additionally, we explore\nmethods to further reduce the (size of the) CI by strategically sampling tasks\nof a specific size. We also introduce a new optimized benchmark, which can be\naccessed at https://github.com/RafLaf/FSL-benchmark-again\n","authors":["Raphael Lafargue","Luke Smith","Franck Vermet","Mathias Löwe","Ian Reid","Vincent Gripon","Jack Valmadre"],"pdf_url":"https://arxiv.org/pdf/2409.02850v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04111v1","updated":"2024-09-06T08:28:35Z","published":"2024-09-06T08:28:35Z","title":"Active-Passive Federated Learning for Vertically Partitioned Multi-view\n Data","summary":" Vertical federated learning is a natural and elegant approach to integrating\nmulti-view data vertically partitioned across devices (clients) while\npreserving their privacy. Apart from the model training, existing methods\nrequire the collaboration of all clients in the model inference. However, the\nmodel inference is typically maintained as a service for a long time, while the\ncollaboration, especially when the clients belong to different organizations,\nis unpredictable in real-world scenarios, such as cancellation of contracts,\nnetwork unavailability, etc., resulting in their failure. To address this\nissue, we, as a first attempt, propose a flexible Active-Passive Federated\nlearning (APFed) framework. Specifically, the active client is the initiator of\na learning task and responsible for building the complete model, while the passive\nclients only serve as assistants. Once the model is built, the active client can\nmake inference independently. In addition, we instantiate the APFed framework into\ntwo classification methods by employing the reconstruction loss and the\ncontrastive loss on passive clients, respectively. The two methods\nare tested in a set of experiments and achieve the desired results, validating\ntheir effectiveness.\n","authors":["Jiyuan Liu","Xinwang Liu","Siqi Wang","Xingchen Hu","Qing Liao","Xinhang Wan","Yi Zhang","Xin Lv","Kunlun He"],"pdf_url":"https://arxiv.org/pdf/2409.04111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03637v4","updated":"2024-09-06T08:28:01Z","published":"2024-07-04T05:13:58Z","title":"QET: Enhancing Quantized LLM Parameters and KV cache Compression through\n Element Substitution and Residual Clustering","summary":" Matrix quantization entails representing matrix elements in a more\nspace-efficient form to reduce storage usage, with dequantization restoring the\noriginal matrix for use. We formulate the Quantization Error Minimization (QEM)\nproblem as minimizing the distance between a matrix before and after\nquantization, under the condition that the quantized matrix occupies the same\nmemory space. Matrix quantization is crucial in various applications, including\nLarge Language Models (LLMs) weight quantization, vector databases, KV cache\nquantization, graph compression, and image compression. Recent advancements in\nLLMs, such as GPT-4 and BERT, have highlighted the importance of matrix\ncompression due to the large size of parameters and KV cache, which are stored\nas matrices.\n We propose Quantum Entanglement Trees (QET) to address the QEM problem by\nleveraging the local orderliness of matrix elements, involving iterative\nelement swapping to form a locally ordered matrix. This matrix is then grouped\nand quantized by columns. 
To enhance QET, we introduce two optimizations:\nfurther quantizing residuals to reduce MSE, and using masking and batch\nprocessing to accelerate the algorithm.\n Experimental results demonstrate that QET can effectively reduce MSE to\n5.05%, 13.33%, and 11.89% of the current best method on the LLM dataset, K\ncache, and V cache, respectively. Our contributions include the abstraction of\nthe QEM problem, the design of the QET algorithm, and the proposal of two\noptimizations to improve accuracy and speed.\n","authors":["Yanshu Wang","Wang Li","Zhaoqian Yao","Tong Yang"],"pdf_url":"https://arxiv.org/pdf/2407.03637v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04109v1","updated":"2024-09-06T08:25:03Z","published":"2024-09-06T08:25:03Z","title":"Can LLMs Generate Novel Research Ideas? A Large-Scale Human Study with\n 100+ NLP Researchers","summary":" Recent advancements in large language models (LLMs) have sparked optimism\nabout their potential to accelerate scientific discovery, with a growing number\nof works proposing research agents that autonomously generate and validate new\nideas. Despite this, no evaluations have shown that LLM systems can take the\nvery first step of producing novel, expert-level ideas, let alone perform the\nentire research process. We address this by establishing an experimental design\nthat evaluates research idea generation while controlling for confounders and\nperforms the first head-to-head comparison between expert NLP researchers and\nan LLM ideation agent. By recruiting over 100 NLP researchers to write novel\nideas and blind reviews of both LLM and human ideas, we obtain the first\nstatistically significant conclusion on current LLM capabilities for research\nideation: we find LLM-generated ideas are judged as more novel (p < 0.05) than\nhuman expert ideas while being judged slightly weaker on feasibility. Studying\nour agent baselines closely, we identify open problems in building and\nevaluating research agents, including failures of LLM self-evaluation and their\nlack of diversity in generation. Finally, we acknowledge that human judgements\nof novelty can be difficult, even by experts, and propose an end-to-end study\ndesign which recruits researchers to execute these ideas into full projects,\nenabling us to study whether these novelty and feasibility judgements result in\nmeaningful differences in research outcome.\n","authors":["Chenglei Si","Diyi Yang","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2409.04109v1.pdf","comment":"main paper is 20 pages"},{"id":"http://arxiv.org/abs/2409.01588v2","updated":"2024-09-06T08:16:02Z","published":"2024-09-03T04:04:40Z","title":"Large-scale Urban Facility Location Selection with Knowledge-informed\n Reinforcement Learning","summary":" The facility location problem (FLP) is a classical combinatorial optimization\nchallenge aimed at strategically laying out facilities to maximize their\naccessibility. In this paper, we propose a reinforcement learning method\ntailored to solve large-scale urban FLP, capable of producing near-optimal\nsolutions at superfast inference speed. We distill the essential swap operation\nfrom local search, and simulate it by intelligently selecting edges on a graph\nof urban regions, guided by a knowledge-informed graph neural network, thus\nsidestepping the need for heavy computation of local search. 
Extensive\nexperiments on four US cities with different geospatial conditions demonstrate\nthat our approach can achieve comparable performance to commercial solvers with\nless than 5\\% accessibility loss, while displaying up to 1000 times speedup. We\ndeploy our model as an online geospatial application at\nhttps://huggingface.co/spaces/randommmm/MFLP.\n","authors":["Hongyuan Su","Yu Zheng","Jingtao Ding","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2409.01588v2.pdf","comment":"Sigspatial2024"},{"id":"http://arxiv.org/abs/2409.04104v1","updated":"2024-09-06T08:14:58Z","published":"2024-09-06T08:14:58Z","title":"MixNet: Joining Force of Classical and Modern Approaches Toward the\n Comprehensive Pipeline in Motor Imagery EEG Classification","summary":" Recent advances in deep learning (DL) have significantly impacted motor\nimagery (MI)-based brain-computer interface (BCI) systems, enhancing the\ndecoding of electroencephalography (EEG) signals. However, most studies\nstruggle to identify discriminative patterns across subjects during MI tasks,\nlimiting MI classification performance. In this article, we propose MixNet, a\nnovel classification framework designed to overcome this limitation by\nutilizing spectral-spatial signals from MI data, along with a multitask\nlearning architecture named MIN2Net, for classification. Here, the\nspectral-spatial signals are generated using the filter-bank common spatial\npatterns (FBCSPs) method on MI data. Since the multitask learning architecture\nis used for the classification task, the learning in each task may exhibit\ndifferent generalization rates and potential overfitting across tasks. To\naddress this issue, we implement adaptive gradient blending, simultaneously\nregulating multiple loss weights and adjusting the learning pace for each task\nbased on its generalization/overfitting tendencies. Experimental results on six\nbenchmark data sets of different data sizes demonstrate that MixNet\nconsistently outperforms all state-of-the-art algorithms in subject-dependent\nand -independent settings. Finally, the low-density EEG MI classification\nresults show that MixNet outperforms all state-of-the-art algorithms, offering\npromising implications for Internet of Thing (IoT) applications, such as\nlightweight and portable EEG wearable devices based on low-density montages.\n","authors":["Phairot Autthasan","Rattanaphon Chaisaen","Huy Phan","Maarten De Vos","Theerawit Wilaiprasitporn"],"pdf_url":"https://arxiv.org/pdf/2409.04104v1.pdf","comment":"Supplementary materials and source codes are available on-line at\n https://github.com/Max-Phairot-A/MixNet"},{"id":"http://arxiv.org/abs/2408.14042v2","updated":"2024-09-06T08:13:09Z","published":"2024-08-26T06:39:49Z","title":"PAGE: Parametric Generative Explainer for Graph Neural Network","summary":" This article introduces PAGE, a parameterized generative interpretive\nframework. PAGE is capable of providing faithful explanations for any graph\nneural network without necessitating prior knowledge or internal details.\nSpecifically, we train the auto-encoder to generate explanatory substructures\nby designing appropriate training strategy. Due to the dimensionality reduction\nof features in the latent space of the auto-encoder, it becomes easier to\nextract causal features leading to the model's output, which can be easily\nemployed to generate explanations. 
To accomplish this, we introduce an\nadditional discriminator to capture the causality between latent causal\nfeatures and the model's output. By designing appropriate optimization\nobjectives, the well-trained discriminator can be employed to constrain the\nencoder in generating enhanced causal features. Finally, these features are\nmapped to substructures of the input graph through the decoder to serve as\nexplanations. Compared to existing methods, PAGE operates at the sample scale\nrather than nodes or edges, eliminating the need for perturbation or encoding\nprocesses as seen in previous methods. Experimental results on both\nartificially synthesized and real-world datasets demonstrate that our approach\nnot only exhibits the highest faithfulness and accuracy but also significantly\noutperforms baseline models in terms of efficiency.\n","authors":["Yang Qiu","Wei Liu","Jun Wang","Ruixuan Li"],"pdf_url":"https://arxiv.org/pdf/2408.14042v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04103v1","updated":"2024-09-06T08:09:15Z","published":"2024-09-06T08:09:15Z","title":"The Role of Graph Topology in the Performance of Biomedical Knowledge\n Graph Completion Models","summary":" Knowledge Graph Completion has been increasingly adopted as a useful method\nfor several tasks in biomedical research, like drug repurposing or drug-target\nidentification. To that end, a variety of datasets and Knowledge Graph\nEmbedding models has been proposed over the years. However, little is known\nabout the properties that render a dataset useful for a given task and, even\nthough theoretical properties of Knowledge Graph Embedding models are well\nunderstood, their practical utility in this field remains controversial. We\nconduct a comprehensive investigation into the topological properties of\npublicly available biomedical Knowledge Graphs and establish links to the\naccuracy observed in real-world applications. By releasing all model\npredictions and a new suite of analysis tools we invite the community to build\nupon our work and continue improving the understanding of these crucial\napplications.\n","authors":["Alberto Cattaneo","Stephen Bonner","Thomas Martynec","Carlo Luschi","Ian P Barrett","Daniel Justus"],"pdf_url":"https://arxiv.org/pdf/2409.04103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04101v1","updated":"2024-09-06T08:07:09Z","published":"2024-09-06T08:07:09Z","title":"Ultra-imbalanced classification guided by statistical information","summary":" Imbalanced data are frequently encountered in real-world classification\ntasks. Previous works on imbalanced learning mostly focused on learning with a\nminority class of few samples. However, the notion of imbalance also applies to\ncases where the minority class contains abundant samples, which is usually the\ncase for industrial applications like fraud detection in the area of financial\nrisk management. In this paper, we take a population-level approach to\nimbalanced learning by proposing a new formulation called\n\\emph{ultra-imbalanced classification} (UIC). Under UIC, loss functions behave\ndifferently even if infinite amount of training samples are available. To\nunderstand the intrinsic difficulty of UIC problems, we borrow ideas from\ninformation theory and establish a framework to compare different loss\nfunctions through the lens of statistical information. 
A novel learning\nobjective termed Tunable Boosting Loss is developed which is provably resistant\nagainst data imbalance under UIC, as well as being empirically efficient\nverified by extensive experimental studies on both public and industrial\ndatasets.\n","authors":["Yin Jin","Ningtao Wang","Ruofan Wu","Pengfei Shi","Xing Fu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04101v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04081v1","updated":"2024-09-06T07:44:44Z","published":"2024-09-06T07:44:44Z","title":"UI-JEPA: Towards Active Perception of User Intent through Onscreen User\n Activity","summary":" Generating user intent from a sequence of user interface (UI) actions is a\ncore challenge in comprehensive UI understanding. Recent advancements in\nmultimodal large language models (MLLMs) have led to substantial progress in\nthis area, but their demands for extensive model parameters, computing power,\nand high latency makes them impractical for scenarios requiring lightweight,\non-device solutions with low latency or heightened privacy. Additionally, the\nlack of high-quality datasets has hindered the development of such lightweight\nmodels. To address these challenges, we propose UI-JEPA, a novel framework that\nemploys masking strategies to learn abstract UI embeddings from unlabeled data\nthrough self-supervised learning, combined with an LLM decoder fine-tuned for\nuser intent prediction. We also introduce two new UI-grounded multimodal\ndatasets, \"Intent in the Wild\" (IIW) and \"Intent in the Tame\" (IIT), designed\nfor few-shot and zero-shot UI understanding tasks. IIW consists of 1.7K videos\nacross 219 intent categories, while IIT contains 914 videos across 10\ncategories. We establish the first baselines for these datasets, showing that\nrepresentations learned using a JEPA-style objective, combined with an LLM\ndecoder, can achieve user intent predictions that match the performance of\nstate-of-the-art large MLLMs, but with significantly reduced annotation and\ndeployment resources. Measured by intent similarity scores, UI-JEPA outperforms\nGPT-4 Turbo and Claude 3.5 Sonnet by 10.0% and 7.2% respectively, averaged\nacross two datasets. Notably, UI-JEPA accomplishes the performance with a 50.5x\nreduction in computational cost and a 6.6x improvement in latency in the IIW\ndataset. These results underscore the effectiveness of UI-JEPA, highlighting\nits potential for lightweight, high-performance UI understanding.\n","authors":["Yicheng Fu","Raviteja Anantha","Prabal Vashisht","Jianpeng Cheng","Etai Littwin"],"pdf_url":"https://arxiv.org/pdf/2409.04081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13110v4","updated":"2024-09-06T07:40:40Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. 
From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v4.pdf","comment":"Accepted at Journal of Machine Learning Research. This paper\n integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete\n story. In this paper, we improve the writing and organization, and also add\n conceptual, empirical, and theoretical improvements over the previous work.\n V2: small typo fixes/formatting improvements. V3: improvements from journal\n revisions. V4: fix figures"},{"id":"http://arxiv.org/abs/2409.04072v1","updated":"2024-09-06T07:26:14Z","published":"2024-09-06T07:26:14Z","title":"Study of Brain Network in Alzheimers Disease Using Wavelet-Based Graph\n Theory Method","summary":" Alzheimer's disease (AD) is a neurodegenerative disorder marked by memory\nloss and cognitive decline, making early detection vital for timely\nintervention. However, early diagnosis is challenging due to the heterogeneous\npresentation of symptoms. Resting-state fMRI (rs-fMRI) captures spontaneous\nbrain activity and functional connectivity, which are known to be disrupted in\nAD and mild cognitive impairment (MCI). Traditional methods, such as Pearson's\ncorrelation, have been used to calculate association matrices, but these\napproaches often overlook the dynamic and non-stationary nature of brain\nactivity. In this study, we introduce a novel method that integrates discrete\nwavelet transform (DWT) and graph theory to model the dynamic behavior of brain\nnetworks. By decomposing rs-fMRI signals using DWT, our approach captures the\ntime-frequency representation of brain activity, allowing for a more nuanced\nanalysis of the underlying network dynamics. Graph theory provides a robust\nmathematical framework to analyze these complex networks, while machine\nlearning is employed to automate the discrimination of different stages of AD\nbased on learned patterns from different frequency bands. 
We applied our method\nto a dataset of rs-fMRI images from the Alzheimer's Disease Neuroimaging\nInitiative (ADNI) database, demonstrating its potential as an early diagnostic\ntool for AD and for monitoring disease progression. Our statistical analysis\nidentifies specific brain regions and connections that are affected in AD and\nMCI, at different frequency bands, offering deeper insights into the disease's\nimpact on brain function.\n","authors":["Ali Khazaee","Abdolreza Mohammadi","Ruairi Oreally"],"pdf_url":"https://arxiv.org/pdf/2409.04072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16639v3","updated":"2024-09-06T07:24:18Z","published":"2024-05-26T17:30:44Z","title":"A unified law of robustness for Bregman divergence losses","summary":" In contemporary deep learning practice, models are often trained to near zero\nloss, i.e. to nearly interpolate the training data. However, the number of\nparameters in the model is usually far more than the number of data points $n$,\nthe theoretical minimum needed for interpolation: a phenomenon referred to as\noverparameterization. In an interesting piece of work that contributes to the\nconsiderable research that has been devoted to understanding overparameterization,\nBubeck and Sellke showed that for a broad class of covariate distributions\n(specifically those satisfying a natural notion of concentration of measure),\noverparameterization is necessary for robust interpolation, i.e. if the\ninterpolating function is required to be Lipschitz. However, their robustness\nresults were proved only in the setting of regression with square loss. In\npractice, however, many other kinds of losses are used, e.g. cross entropy loss\nfor classification. In this work, we generalize Bubeck and Sellke's result to\nBregman divergence losses, which form a common generalization of square loss\nand cross-entropy loss. Our generalization relies on identifying a\nbias-variance type decomposition that lies at the heart of the proof of Bubeck\nand Sellke.\n","authors":["Santanu Das","Jatin Batra","Piyush Srivastava"],"pdf_url":"https://arxiv.org/pdf/2405.16639v3.pdf","comment":"18 pages; fixed a typo in a citation"},{"id":"http://arxiv.org/abs/2409.04069v1","updated":"2024-09-06T07:20:45Z","published":"2024-09-06T07:20:45Z","title":"Online Residual Learning from Offline Experts for Pedestrian Tracking","summary":" In this paper, we consider the problem of predicting unknown targets from\ndata. We propose Online Residual Learning (ORL), a method that combines online\nadaptation with offline-trained predictions. At a lower level, we employ\nmultiple offline predictions generated before or at the beginning of the\nprediction horizon. We augment every offline prediction by learning its\nrespective residual error concerning the true target state online, using the\nrecursive least squares algorithm. At a higher level, we treat the augmented\nlower-level predictors as experts, adopting the Prediction with Expert Advice\nframework. We utilize an adaptive softmax weighting scheme to form an aggregate\nprediction and provide guarantees for ORL in terms of regret. We employ ORL to\nboost performance in the setting of online pedestrian trajectory prediction.\nBased on data from the Stanford Drone Dataset, we show that ORL can demonstrate\nbest-of-both-worlds performance.\n","authors":["Anastasios Vlachos","Anastasios Tsiamis","Aren Karapetyan","Efe C. 
Balta","John Lygeros"],"pdf_url":"https://arxiv.org/pdf/2409.04069v1.pdf","comment":"Accepted to CDC 2024"},{"id":"http://arxiv.org/abs/2409.04067v1","updated":"2024-09-06T07:17:01Z","published":"2024-09-06T07:17:01Z","title":"FEM-based Neural Networks for Solving Incompressible Fluid Flows and\n Related Inverse Problems","summary":" The numerical simulation and optimization of technical systems described by\npartial differential equations is expensive, especially in multi-query\nscenarios in which the underlying equations have to be solved for different\nparameters. A comparatively new approach in this context is to combine the good\napproximation properties of neural networks (for parameter dependence) with the\nclassical finite element method (for discretization). However, instead of\nconsidering the solution mapping of the PDE from the parameter space into the\nFEM-discretized solution space as a purely data-driven regression problem,\nso-called physically informed regression problems have proven to be useful. In\nthese, the equation residual is minimized during the training of the neural\nnetwork, i.e. the neural network \"learns\" the physics underlying the problem.\nIn this paper, we extend this approach to saddle-point and non-linear fluid\ndynamics problems, respectively, namely stationary Stokes and stationary\nNavier-Stokes equations. In particular, we propose a modification of the\nexisting approach: Instead of minimizing the plain vanilla equation residual\nduring training, we minimize the equation residual modified by a\npreconditioner. By analogy with the linear case, this also improves the\ncondition in the present non-linear case. Our numerical examples demonstrate\nthat this approach significantly reduces the training effort and greatly\nincreases accuracy and generalizability. Finally, we show the application of\nthe resulting parameterized model to a related inverse problem.\n","authors":["Franziska Griese","Fabian Hoppe","Alexander Rüttgers","Philipp Knechtges"],"pdf_url":"https://arxiv.org/pdf/2409.04067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04343v3","updated":"2024-09-06T07:14:32Z","published":"2023-04-10T01:12:09Z","title":"Certifiable Black-Box Attacks with Randomized Adversarial Examples:\n Breaking Defenses with Provable Confidence","summary":" Black-box adversarial attacks have demonstrated strong potential to\ncompromise machine learning models by iteratively querying the target model or\nleveraging transferability from a local surrogate model. Recently, such attacks\ncan be effectively mitigated by state-of-the-art (SOTA) defenses, e.g.,\ndetection via the pattern of sequential queries, or injecting noise into the\nmodel. To our best knowledge, we take the first step to study a new paradigm of\nblack-box attacks with provable guarantees -- certifiable black-box attacks\nthat can guarantee the attack success probability (ASP) of adversarial examples\nbefore querying over the target model. This new black-box attack unveils\nsignificant vulnerabilities of machine learning models, compared to traditional\nempirical black-box attacks, e.g., breaking strong SOTA defenses with provable\nconfidence, constructing a space of (infinite) adversarial examples with high\nASP, and the ASP of the generated adversarial examples is theoretically\nguaranteed without verification/queries over the target model. Specifically, we\nestablish a novel theoretical foundation for ensuring the ASP of the black-box\nattack with randomized adversarial examples (AEs). 
Then, we propose several\nnovel techniques to craft the randomized AEs while reducing the perturbation\nsize for better imperceptibility. Finally, we have comprehensively evaluated\nthe certifiable black-box attacks on the CIFAR10/100, ImageNet, and LibriSpeech\ndatasets, while benchmarking with 16 SOTA black-box attacks, against various\nSOTA defenses in the domains of computer vision and speech recognition. Both\ntheoretical and experimental results have validated the significance of the\nproposed attack. The code and all the benchmarks are available at\n\\url{https://github.com/datasec-lab/CertifiedAttack}.\n","authors":["Hanbin Hong","Xinyu Zhang","Binghui Wang","Zhongjie Ba","Yuan Hong"],"pdf_url":"https://arxiv.org/pdf/2304.04343v3.pdf","comment":"accepted by ACM CCS 2024"},{"id":"http://arxiv.org/abs/2409.04060v1","updated":"2024-09-06T07:04:27Z","published":"2024-09-06T07:04:27Z","title":"D4: Text-guided diffusion model-based domain adaptive data augmentation\n for vineyard shoot detection","summary":" In an agricultural field, plant phenotyping using object detection models is\ngaining attention. However, collecting the training data necessary to create\ngeneric and high-precision models is extremely challenging due to the\ndifficulty of annotation and the diversity of domains. Furthermore, it is\ndifficult to transfer training data across different crops, and although\nmachine learning models effective for specific environments, conditions, or\ncrops have been developed, they cannot be widely applied in actual fields. In\nthis study, we propose a generative data augmentation method (D4) for vineyard\nshoot detection. D4 uses a pre-trained text-guided diffusion model based on a\nlarge number of original images culled from video data collected by unmanned\nground vehicles or other means, and a small number of annotated datasets. The\nproposed method generates new annotated images with background information\nadapted to the target domain while retaining annotation information necessary\nfor object detection. In addition, D4 overcomes the lack of training data in\nagriculture, including the difficulty of annotation and diversity of domains.\nWe confirmed that this generative data augmentation method improved the mean\naverage precision by up to 28.65% for the BBox detection task and the average\nprecision by up to 13.73% for the keypoint detection task for vineyard shoot\ndetection. Our generative data augmentation method D4 is expected to\nsimultaneously solve the cost and domain diversity issues of training data\ngeneration in agriculture and improve the generalization performance of\ndetection models.\n","authors":["Kentaro Hirahara","Chikahito Nakane","Hajime Ebisawa","Tsuyoshi Kuroda","Yohei Iwaki","Tomoyoshi Utsumi","Yuichiro Nomura","Makoto Koike","Hiroshi Mineno"],"pdf_url":"https://arxiv.org/pdf/2409.04060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00046v3","updated":"2024-09-06T06:59:53Z","published":"2024-08-19T11:50:23Z","title":"Rethinking Molecular Design: Integrating Latent Variable and\n Auto-Regressive Models for Goal Directed Generation","summary":" De novo molecule design has become a highly active research area, advanced\nsignificantly through the use of state-of-the-art generative models. Despite\nthese advances, several fundamental questions remain unanswered as the field\nincreasingly focuses on more complex generative models and sophisticated\nmolecular representations as an answer to the challenges of drug design. 
In\nthis paper, we return to the simplest representation of molecules, and\ninvestigate overlooked limitations of classical generative approaches,\nparticularly Variational Autoencoders (VAEs) and auto-regressive models. We\npropose a hybrid model in the form of a novel regularizer that leverages the\nstrengths of both to improve validity, conditional generation, and style\ntransfer of molecular sequences. Additionally, we provide an in depth\ndiscussion of overlooked assumptions of these models' behaviour.\n","authors":["Heath Arthur-Loui","Amina Mollaysa","Michael Krauthammer"],"pdf_url":"https://arxiv.org/pdf/2409.00046v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11133v3","updated":"2024-09-06T06:58:02Z","published":"2023-07-07T11:49:55Z","title":"Contrastive Graph Pooling for Explainable Classification of Brain\n Networks","summary":" Functional magnetic resonance imaging (fMRI) is a commonly used technique to\nmeasure neural activation. Its application has been particularly important in\nidentifying underlying neurodegenerative conditions such as Parkinson's,\nAlzheimer's, and Autism. Recent analysis of fMRI data models the brain as a\ngraph and extracts features by graph neural networks (GNNs). However, the\nunique characteristics of fMRI data require a special design of GNN. Tailoring\nGNN to generate effective and domain-explainable features remains challenging.\nIn this paper, we propose a contrastive dual-attention block and a\ndifferentiable graph pooling method called ContrastPool to better utilize GNN\nfor brain networks, meeting fMRI-specific requirements. We apply our method to\n5 resting-state fMRI brain network datasets of 3 diseases and demonstrate its\nsuperiority over state-of-the-art baselines. Our case study confirms that the\npatterns extracted by our method match the domain knowledge in neuroscience\nliterature, and disclose direct and interesting insights. Our contributions\nunderscore the potential of ContrastPool for advancing the understanding of\nbrain networks and neurodegenerative conditions. The source code is available\nat https://github.com/AngusMonroe/ContrastPool.\n","authors":["Jiaxing Xu","Qingtian Bian","Xinhang Li","Aihu Zhang","Yiping Ke","Miao Qiao","Wei Zhang","Wei Khang Jeremy Sim","Balázs Gulyás"],"pdf_url":"https://arxiv.org/pdf/2307.11133v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.00892v3","updated":"2024-09-06T06:52:52Z","published":"2024-03-01T13:47:39Z","title":"PowerFlowMultiNet: Multigraph Neural Networks for Unbalanced Three-Phase\n Distribution Systems","summary":" Efficiently solving unbalanced three-phase power flow in distribution grids\nis pivotal for grid analysis and simulation. There is a pressing need for\nscalable algorithms capable of handling large-scale unbalanced power grids that\ncan provide accurate and fast solutions. To address this, deep learning\ntechniques, especially Graph Neural Networks (GNNs), have emerged. However,\nexisting literature primarily focuses on balanced networks, leaving a critical\ngap in supporting unbalanced three-phase power grids. This letter introduces\nPowerFlowMultiNet, a novel multigraph GNN framework explicitly designed for\nunbalanced three-phase power grids. The proposed approach models each phase\nseparately in a multigraph representation, effectively capturing the inherent\nasymmetry in unbalanced grids. A graph embedding mechanism utilizing message\npassing is introduced to capture spatial dependencies within the power system\nnetwork. 
PowerFlowMultiNet outperforms traditional methods and other deep\nlearning approaches in terms of accuracy and computational speed. Rigorous\ntesting reveals significantly lower error rates and a notable hundredfold\nincrease in computational speed for large power networks compared to\nmodel-based methods.\n","authors":["Salah Ghamizi","Jun Cao","Aoxiang Ma","Pedro Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2403.00892v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19270v2","updated":"2024-09-06T05:45:42Z","published":"2023-10-30T05:06:52Z","title":"Invariant kernels on Riemannian symmetric spaces: a harmonic-analytic\n approach","summary":" This work aims to prove that the classical Gaussian kernel, when defined on a\nnon-Euclidean symmetric space, is never positive-definite for any choice of\nparameter. To achieve this goal, the paper develops new geometric and\nanalytical arguments. These provide a rigorous characterization of the\npositive-definiteness of the Gaussian kernel, which is complete but for a\nlimited number of scenarios in low dimensions that are treated by numerical\ncomputations. Chief among these results are the L$^{\\!\\scriptscriptstyle\np}$-$\\hspace{0.02cm}$Godement theorems (where $p = 1,2$), which provide\nverifiable necessary and sufficient conditions for a kernel defined on a\nsymmetric space of non-compact type to be positive-definite. A celebrated\ntheorem, sometimes called the Bochner-Godement theorem, already gives such\nconditions and is far more general in its scope, but is especially hard to\napply. Beyond the connection with the Gaussian kernel, the new results in this\nwork lay out a blueprint for the study of invariant kernels on symmetric\nspaces, bringing forth specific harmonic analysis tools that suggest many\nfuture applications.\n","authors":["Nathael Da Costa","Cyrus Mostajeran","Juan-Pablo Ortega","Salem Said"],"pdf_url":"https://arxiv.org/pdf/2310.19270v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01875v2","updated":"2024-09-06T05:37:22Z","published":"2024-03-04T09:31:56Z","title":"Locally Convex Global Loss Network for Decision-Focused Learning","summary":" In decision-making problems under uncertainty, predicting unknown parameters\nis often considered independent of the optimization part. Decision-focused\nLearning (DFL) is a task-oriented framework that integrates prediction and\noptimization by adapting the predictive model to yield better decisions for the\ncorresponding task. Here, an inevitable challenge arises when computing\ngradients of the optimal decision with respect to the parameters. Existing\nresearch copes with this issue by smoothly reformulating the surrogate\noptimization or by constructing surrogate loss functions that mimic the task\nloss. However, these approaches apply only to restricted optimization domains.\nIn this paper, we propose Locally\nConvex Global Loss Network (LCGLN), a global surrogate loss model that can be\nimplemented in a general DFL paradigm. LCGLN learns the task loss via a\npartially input-convex neural network, which is guaranteed to be convex for\nchosen inputs while keeping the non-convex global structure for the other\ninputs. This enables LCGLN to admit general DFL through only a single surrogate\nloss without the need to choose appropriate parametric forms. 
We confirm effectiveness and\nflexibility of LCGLN by evaluating our proposed model with three stochastic\ndecision-making problems.\n","authors":["Haeun Jeon","Hyunglip Bae","Minsu Park","Chanyeong Kim","Woo Chang Kim"],"pdf_url":"https://arxiv.org/pdf/2403.01875v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.02464v3","updated":"2024-09-06T05:33:55Z","published":"2022-02-05T01:59:09Z","title":"Minimax Optimal Algorithms with Fixed-$k$-Nearest Neighbors","summary":" This paper presents how to perform minimax optimal classification,\nregression, and density estimation based on fixed-$k$ nearest neighbor (NN)\nsearches. We consider a distributed learning scenario, in which a massive\ndataset is split into smaller groups, where the $k$-NNs are found for a query\npoint with respect to each subset of data. We propose \\emph{optimal} rules to\naggregate the fixed-$k$-NN information for classification, regression, and\ndensity estimation that achieve minimax optimal rates for the respective\nproblems. We show that the distributed algorithm with a fixed $k$ over a\nsufficiently large number of groups attains a minimax optimal error rate up to\na multiplicative logarithmic factor under some regularity conditions. Roughly\nspeaking, distributed $k$-NN rules with $M$ groups has a performance comparable\nto the standard $\\Theta(kM)$-NN rules even for fixed $k$.\n","authors":["J. Jon Ryu","Young-Han Kim"],"pdf_url":"https://arxiv.org/pdf/2202.02464v3.pdf","comment":"65 pages, 5 figures. The manuscript has been revised from scratch\n compared to the previous version. Notable differences include (1) updated\n statements and corrected proofs for classification and regression, (2)\n explicit statements and proofs for distance-selective rules, and (3) new\n analogous estimators for density estimation"},{"id":"http://arxiv.org/abs/2407.17844v3","updated":"2024-09-06T05:29:33Z","published":"2024-07-25T07:58:19Z","title":"Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease\n Classification: A Systematic Review","summary":" Parkinson's disease (PD), the second most prevalent neurodegenerative\ndisorder worldwide, frequently presents with early-stage speech impairments.\nRecent advancements in Artificial Intelligence (AI), particularly deep learning\n(DL), have significantly enhanced PD diagnosis through the analysis of speech\ndata. Nevertheless, the progress of research is restricted by the limited\navailability of publicly accessible speech-based PD datasets, primarily due to\nprivacy concerns. The goal of this systematic review is to explore the current\nlandscape of speech-based DL approaches for PD classification, based on 33\nscientific works published between January 2020 and March 2024. We discuss\ntheir available resources, capabilities, and potential limitations, and issues\nrelated to bias, explainability, and privacy. Furthermore, this review provides\nan overview of publicly accessible speech-based datasets and open-source\nmaterial for PD. The DL approaches identified are categorized into end-to-end\n(E2E) learning, transfer learning (TL), and deep acoustic feature extraction\n(DAFE). Among E2E approaches, Convolutional Neural Networks (CNNs) are\nprevalent, though Transformers are increasingly popular. E2E approaches face\nchallenges such as limited data and computational resources, especially with\nTransformers. TL addresses these issues by providing more robust PD diagnosis\nand better generalizability across languages. 
DAFE aims to improve the\nexplainability and interpretability of results by examining the specific\neffects of deep features on both other DL approaches and more traditional\nmachine learning (ML) methods. However, it often underperforms compared to E2E\nand TL approaches.\n","authors":["Lisanne van Gelderen","Cristian Tejedor-García"],"pdf_url":"https://arxiv.org/pdf/2407.17844v3.pdf","comment":"van Gelderen, L., & Tejedor-Garc\\'ia, C. (2024). Innovative\n Speech-Based Deep Learning Approaches for Parkinson's Disease Classification:\n A Systematic Review. Applied Sciences, 14(17). doi:10.3390/app14177873 This\n research was funded by the NWO research programme NGF AiNed Fellowship Grants\n under the project Responsible AI for Voice Diagnostics (RAIVD) - grant number\n NGF.1607.22.013"},{"id":"http://arxiv.org/abs/2312.03690v4","updated":"2024-09-06T05:02:37Z","published":"2023-12-06T18:53:45Z","title":"AI-guided inverse design and discovery of recyclable vitrimeric polymers","summary":" Vitrimer is a new, exciting class of sustainable polymers with the ability to\nheal due to their dynamic covalent adaptive network that can go through\nassociative rearrangement reactions. However, a limited choice of constituent\nmolecules restricts their property space, prohibiting full realization of their\npotential applications. To overcome this challenge, we couple molecular\ndynamics (MD) simulations and a novel graph variational autoencoder (VAE)\nmachine learning model for inverse design of vitrimer chemistries with desired\nglass transition temperature (Tg) and synthesize a novel vitrimer polymer. We\nbuild the first vitrimer dataset of one million chemistries and calculate Tg on\n8,424 of them by high-throughput MD simulations calibrated by a Gaussian\nprocess model. The proposed novel VAE employs dual graph encoders and a latent\ndimension overlapping scheme which allows for individual representation of\nmulti-component vitrimers. By constructing a continuous latent space containing\nnecessary information of vitrimers, we demonstrate high accuracy and efficiency\nof our framework in discovering novel vitrimers with desirable Tg beyond the\ntraining regime. To validate the effectiveness of our framework in experiments,\nwe generate novel vitrimer chemistries with a target Tg = 323 K. By\nincorporating chemical intuition, we synthesize a vitrimer with Tg of 311-317\nK, and experimentally demonstrate healability and flowability. The proposed\nframework offers an exciting tool for polymer chemists to design and synthesize\nnovel, sustainable vitrimer polymers for a facet of applications.\n","authors":["Yiwen Zheng","Prakash Thakolkaran","Agni K. Biswal","Jake A. Smith","Ziheng Lu","Shuxin Zheng","Bichlien H. Nguyen","Siddhant Kumar","Aniruddh Vashisth"],"pdf_url":"https://arxiv.org/pdf/2312.03690v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00125v3","updated":"2024-09-06T04:39:17Z","published":"2024-08-28T22:02:42Z","title":"A Hybrid Framework for Spatial Interpolation: Merging Data-driven with\n Domain Knowledge","summary":" Estimating spatially distributed information through the interpolation of\nscattered observation datasets often overlooks the critical role of domain\nknowledge in understanding spatial dependencies. Additionally, the features of\nthese data sets are typically limited to the spatial coordinates of the\nscattered observation locations. 
In this paper, we propose a hybrid framework\nthat integrates data-driven spatial dependency feature extraction with\nrule-assisted spatial dependency function mapping to augment domain knowledge.\nWe demonstrate the superior performance of our framework in two comparative\napplication scenarios, highlighting its ability to capture more localized\nspatial features in the reconstructed distribution fields. Furthermore, we\nunderscore its potential to enhance nonlinear estimation capabilities through\nthe application of transformed fuzzy rules and to quantify the inherent\nuncertainties associated with the observation data sets. Our framework\nintroduces an innovative approach to spatial information estimation by\nsynergistically combining observational data with rule-assisted domain\nknowledge.\n","authors":["Cong Zhang","Shuyi Du","Hongqing Song","Yuhe Wang"],"pdf_url":"https://arxiv.org/pdf/2409.00125v3.pdf","comment":"21 pages, 13 figures; typos corrected, references updated; few typos\n in few equations corrected, changed to Tex source"},{"id":"http://arxiv.org/abs/2409.04022v1","updated":"2024-09-06T04:26:57Z","published":"2024-09-06T04:26:57Z","title":"Heterogeneity-Aware Cooperative Federated Edge Learning with Adaptive\n Computation and Communication Compression","summary":" Motivated by the drawbacks of cloud-based federated learning (FL),\ncooperative federated edge learning (CFEL) has been proposed to improve\nefficiency for FL over mobile edge networks, where multiple edge servers\ncollaboratively coordinate the distributed model training across a large number\nof edge devices. However, CFEL faces critical challenges arising from dynamic\nand heterogeneous device properties, which slow down the convergence and\nincrease resource consumption. This paper proposes a heterogeneity-aware CFEL\nscheme called \\textit{Heterogeneity-Aware Cooperative Edge-based Federated\nAveraging} (HCEF) that aims to maximize the model accuracy while minimizing the\ntraining time and energy consumption via adaptive computation and communication\ncompression in CFEL. By theoretically analyzing how local update frequency and\ngradient compression affect the convergence error bound in CFEL, we develop an\nefficient online control algorithm for HCEF to dynamically determine local\nupdate frequencies and compression ratios for heterogeneous devices.\nExperimental results show that compared with prior schemes, the proposed HCEF\nscheme can maintain higher model accuracy while reducing training latency and\nimproving energy efficiency simultaneously.\n","authors":["Zhenxiao Zhang","Zhidong Gao","Yuanxiong Guo","Yanmin Gong"],"pdf_url":"https://arxiv.org/pdf/2409.04022v1.pdf","comment":"20 pages, 7 figures"},{"id":"http://arxiv.org/abs/2302.01538v8","updated":"2024-09-06T03:23:40Z","published":"2023-02-03T04:24:49Z","title":"DCEM: A deep complementary energy method for solid mechanics","summary":" In recent years, the rapid advancement of deep learning has significantly\nimpacted various fields, particularly in solving partial differential equations\n(PDEs) in the realm of solid mechanics, benefiting greatly from the remarkable\napproximation capabilities of neural networks. In solving PDEs,\nPhysics-Informed Neural Networks (PINNs) and the Deep Energy Method (DEM) have\ngarnered substantial attention. The principle of minimum potential energy and\ncomplementary energy are two important variational principles in solid\nmechanics. 
However, the well-known Deep Energy Method (DEM) is based on the\nprinciple of minimum potential energy, but there lacks the important form of\nminimum complementary energy. To bridge this gap, we propose the deep\ncomplementary energy method (DCEM) based on the principle of minimum\ncomplementary energy. The output function of DCEM is the stress function, which\ninherently satisfies the equilibrium equation. We present numerical results\nusing the Prandtl and Airy stress functions, and compare DCEM with existing\nPINNs and DEM algorithms when modeling representative mechanical problems. The\nresults demonstrate that DCEM outperforms DEM in terms of stress accuracy and\nefficiency and has an advantage in dealing with complex displacement boundary\nconditions, which is supported by theoretical analyses and numerical\nsimulations. We extend DCEM to DCEM-Plus (DCEM-P), adding terms that satisfy\npartial differential equations. Furthermore, we propose a deep complementary\nenergy operator method (DCEM-O) by combining operator learning with physical\nequations. Initially, we train DCEM-O using high-fidelity numerical results and\nthen incorporate complementary energy. DCEM-P and DCEM-O further enhance the\naccuracy and efficiency of DCEM.\n","authors":["Yizheng Wang","Jia Sun","Timon Rabczuk","Yinghua Liu"],"pdf_url":"https://arxiv.org/pdf/2302.01538v8.pdf","comment":"50 pages, 32 figures"},{"id":"http://arxiv.org/abs/2409.01832v2","updated":"2024-09-06T03:13:22Z","published":"2024-09-03T12:30:21Z","title":"Beyond Unconstrained Features: Neural Collapse for Shallow Neural\n Networks with General Data","summary":" Neural collapse (NC) is a phenomenon that emerges at the terminal phase of\nthe training (TPT) of deep neural networks (DNNs). The features of the data in\nthe same class collapse to their respective sample means and the sample means\nexhibit a simplex equiangular tight frame (ETF). In the past few years, there\nhas been a surge of works that focus on explaining why the NC occurs and how it\naffects generalization. Since the DNNs are notoriously difficult to analyze,\nmost works mainly focus on the unconstrained feature model (UFM). While the UFM\nexplains the NC to some extent, it fails to provide a complete picture of how\nthe network architecture and the dataset affect NC. In this work, we focus on\nshallow ReLU neural networks and try to understand how the width, depth, data\ndimension, and statistical property of the training dataset influence the\nneural collapse. We provide a complete characterization of when the NC occurs\nfor two or three-layer neural networks. For two-layer ReLU neural networks, a\nsufficient condition on when the global minimizer of the regularized empirical\nrisk function exhibits the NC configuration depends on the data dimension,\nsample size, and the signal-to-noise ratio in the data instead of the network\nwidth. For three-layer neural networks, we show that the NC occurs as long as\nthe first layer is sufficiently wide. Regarding the connection between NC and\ngeneralization, we show the generalization heavily depends on the SNR\n(signal-to-noise ratio) in the data: even if the NC occurs, the generalization\ncan still be bad provided that the SNR in the data is too low. 
Our results\nsignificantly extend the state-of-the-art theoretical analysis of the NC under\nthe UFM by characterizing the emergence of the NC under shallow nonlinear\nnetworks and showing how it depends on data properties and network\narchitecture.\n","authors":["Wanli Hong","Shuyang Ling"],"pdf_url":"https://arxiv.org/pdf/2409.01832v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04001v1","updated":"2024-09-06T03:05:35Z","published":"2024-09-06T03:05:35Z","title":"Over-parameterized regression methods and their application to\n semi-supervised learning","summary":" Minimum norm least squares is an estimation strategy for the\nover-parameterized case and, in machine learning, is known as a helpful tool\nfor understanding the nature of deep learning. In this paper, to apply it in the\ncontext of non-parametric regression problems, we established several methods\nbased on thresholding of SVD (singular value decomposition)\ncomponents, which are referred to as SVD regression methods. We considered\nseveral such methods: singular-value-based thresholding, hard-thresholding\nwith cross validation, universal thresholding, and bridge thresholding.\nInformation on output samples is not utilized in the first method while it is\nutilized in the other methods. We then applied them to semi-supervised\nlearning, in which unlabeled input samples are incorporated into kernel\nfunctions in a regressor. The experimental results for real data showed that,\ndepending on the datasets, the SVD regression methods are superior to a naive\nridge regression method. Unfortunately, there was no clear advantage of the\nmethods utilizing information on output samples. Furthermore, depending on the\ndataset, incorporation of unlabeled input samples into kernels was found to\nhave certain advantages.\n","authors":["Katsuyuki Hagiwara"],"pdf_url":"https://arxiv.org/pdf/2409.04001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.12277v4","updated":"2024-09-06T03:04:51Z","published":"2022-10-21T22:07:28Z","title":"The Stochastic Proximal Distance Algorithm","summary":" Stochastic versions of proximal methods have gained much attention in\nstatistics and machine learning. These algorithms tend to admit simple,\nscalable forms, and enjoy numerical stability via implicit updates. In this\nwork, we propose and analyze a stochastic version of the recently proposed\nproximal distance algorithm, a class of iterative optimization methods that\nrecover a desired constrained estimation problem as a penalty parameter $\\rho\n\\rightarrow \\infty$. By uncovering connections to related stochastic proximal\nmethods and interpreting the penalty parameter as the learning rate, we justify\nheuristics used in practical manifestations of the proximal distance method,\nestablishing their convergence guarantees for the first time. Moreover, we\nextend recent theoretical devices to establish finite error bounds and a\ncomplete characterization of convergence rate regimes. 
We validate our\nanalysis via a thorough empirical study, also showing that unsurprisingly, the\nproposed method outpaces batch versions on popular learning tasks.\n","authors":["Haoyu Jiang","Jason Xu"],"pdf_url":"https://arxiv.org/pdf/2210.12277v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03996v1","updated":"2024-09-06T02:49:12Z","published":"2024-09-06T02:49:12Z","title":"Goal-Reaching Policy Learning from Non-Expert Observations via Effective\n Subgoal Guidance","summary":" In this work, we address the challenging problem of long-horizon\ngoal-reaching policy learning from non-expert, action-free observation data.\nUnlike fully labeled expert data, our data is more accessible and avoids the\ncostly process of action labeling. Additionally, compared to online learning,\nwhich often involves aimless exploration, our data provides useful guidance for\nmore efficient exploration. To achieve our goal, we propose a novel subgoal\nguidance learning strategy. The motivation behind this strategy is that\nlong-horizon goals offer limited guidance for efficient exploration and\naccurate state transition. We develop a diffusion strategy-based high-level\npolicy to generate reasonable subgoals as waypoints, preferring states that\nmore easily lead to the final goal. Additionally, we learn state-goal value\nfunctions to encourage efficient subgoal reaching. These two components\nnaturally integrate into the off-policy actor-critic framework, enabling\nefficient goal attainment through informative exploration. We evaluate our\nmethod on complex robotic navigation and manipulation tasks, demonstrating a\nsignificant performance advantage over existing methods. Our ablation study\nfurther shows that our method is robust to observation data with various\ncorruptions.\n","authors":["RenMing Huang","Shaochong Liu","Yunqiang Pei","Peng Wang","Guoqing Wang","Yang Yang","Hengtao Shen"],"pdf_url":"https://arxiv.org/pdf/2409.03996v1.pdf","comment":"Accepted to CoRL 2024"},{"id":"http://arxiv.org/abs/2409.03986v1","updated":"2024-09-06T02:20:13Z","published":"2024-09-06T02:20:13Z","title":"An Efficient and Generalizable Symbolic Regression Method for Time\n Series Analysis","summary":" Time series analysis and prediction methods currently excel in quantitative\nanalysis, offering accurate future predictions and diverse statistical\nindicators, but generally falling short in elucidating the underlying evolution\npatterns of time series. To gain a more comprehensive understanding and provide\ninsightful explanations, we utilize symbolic regression techniques to derive\nexplicit expressions for the non-linear dynamics in the evolution of time\nseries variables. However, these techniques face challenges in computational\nefficiency and generalizability across diverse real-world time series data. To\novercome these challenges, we propose \\textbf{N}eural-\\textbf{E}nhanced\n\\textbf{Mo}nte-Carlo \\textbf{T}ree \\textbf{S}earch (NEMoTS) for time series.\nNEMoTS leverages the exploration-exploitation balance of Monte-Carlo Tree\nSearch (MCTS), significantly reducing the search space in symbolic regression\nand improving expression quality. Furthermore, by integrating neural networks\nwith MCTS, NEMoTS not only capitalizes on their superior fitting capabilities\nto concentrate on more pertinent operations post-search space reduction, but\nalso replaces the complex and time-consuming simulation process, thereby\nsubstantially improving computational efficiency and generalizability in time\nseries analysis. 
NEMoTS offers an efficient and comprehensive approach to time\nseries analysis. Experiments with three real-world datasets demonstrate\nNEMoTS's significant superiority in performance, efficiency, reliability, and\ninterpretability, making it well-suited for large-scale real-world time series\ndata.\n","authors":["Yi Xie","Tianyu Qiu","Yun Xiong","Xiuqi Huang","Xiaofeng Gao","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2409.03986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.15408v6","updated":"2024-09-06T02:15:07Z","published":"2020-12-31T02:42:27Z","title":"Gated Ensemble of Spatio-temporal Mixture of Experts for Multi-task\n Learning in Ride-hailing System","summary":" Ride-hailing system requires efficient management of dynamic demand and\nsupply to ensure optimal service delivery, pricing strategies, and operational\nefficiency. Designing spatio-temporal forecasting models separately in a\ntask-wise and city-wise manner to forecast demand and supply-demand gap in a\nride-hailing system poses a burden for the expanding transportation network\ncompanies. Therefore, a multi-task learning architecture is proposed in this\nstudy by developing gated ensemble of spatio-temporal mixture of experts\nnetwork (GESME-Net) with convolutional recurrent neural network (CRNN),\nconvolutional neural network (CNN), and recurrent neural network (RNN) for\nsimultaneously forecasting these spatio-temporal tasks in a city as well as\nacross different cities. Furthermore, a task adaptation layer is integrated\nwith the architecture for learning joint representation in multi-task learning\nand revealing the contribution of the input features utilized in prediction.\nThe proposed architecture is tested with data from Didi Chuxing for: (i)\nsimultaneously forecasting demand and supply-demand gap in Beijing, and (ii)\nsimultaneously forecasting demand across Chengdu and Xian. In both scenarios,\nmodels from our proposed architecture outperformed the single-task and\nmulti-task deep learning benchmarks and ensemble-based machine learning\nalgorithms.\n","authors":["M. H. Rahman","S. M. Rifaat","S. N. Sadeek","M. Abrar","D. Wang"],"pdf_url":"https://arxiv.org/pdf/2012.15408v6.pdf","comment":"arXiv admin note: text overlap with arXiv:2012.08868"},{"id":"http://arxiv.org/abs/2409.03980v1","updated":"2024-09-06T02:01:03Z","published":"2024-09-06T02:01:03Z","title":"Entry-Specific Matrix Estimation under Arbitrary Sampling Patterns\n through the Lens of Network Flows","summary":" Matrix completion tackles the task of predicting missing values in a low-rank\nmatrix based on a sparse set of observed entries. It is often assumed that the\nobservation pattern is generated uniformly at random or has a very specific\nstructure tuned to a given algorithm. There is still a gap in our understanding\nwhen it comes to arbitrary sampling patterns. Given an arbitrary sampling\npattern, we introduce a matrix completion algorithm based on network flows in\nthe bipartite graph induced by the observation pattern. For additive matrices,\nthe particular flow we used is the electrical flow and we establish error upper\nbounds customized to each entry as a function of the observation set, along\nwith matching minimax lower bounds. Our results show that the minimax squared\nerror for recovery of a particular entry in the matrix is proportional to the\neffective resistance of the corresponding edge in the graph. Furthermore, we\nshow that our estimator is equivalent to the least squares estimator. 
We apply\nour estimator to the two-way fixed effects model and show that it enables us to\naccurately infer individual causal effects and the unit-specific and\ntime-specific confounders. For rank-$1$ matrices, we use edge-disjoint paths to\nform an estimator that achieves minimax optimal estimation when the sampling is\nsufficiently dense. Our discovery introduces a new family of estimators\nparametrized by network flows, which provide a fine-grained and intuitive\nunderstanding of the impact of the given sampling pattern on the relative\ndifficulty of estimation at an entry-specific level. This graph-based approach\nallows us to quantify the inherent complexity of matrix completion for\nindividual entries, rather than relying solely on global measures of\nperformance.\n","authors":["Yudong Chen","Xumei Xi","Christina Lee Yu"],"pdf_url":"https://arxiv.org/pdf/2409.03980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13004v5","updated":"2024-09-06T01:55:41Z","published":"2023-06-22T16:04:16Z","title":"Can Differentiable Decision Trees Enable Interpretable Reward Learning\n from Human Feedback?","summary":" Reinforcement Learning from Human Feedback (RLHF) has emerged as a popular\nparadigm for capturing human intent to alleviate the challenges of\nhand-crafting the reward values. Despite the increasing interest in RLHF, most\nworks learn black box reward functions that while expressive are difficult to\ninterpret and often require running the whole costly process of RL before we\ncan even decipher if these frameworks are actually aligned with human\npreferences. We propose and evaluate a novel approach for learning expressive\nand interpretable reward functions from preferences using Differentiable\nDecision Trees (DDTs). Our experiments across several domains, including\nCartPole, Visual Gridworld environments and Atari games, provide evidence that\nthe tree structure of our learned reward function is useful in determining the\nextent to which the reward function is aligned with human preferences. We also\nprovide experimental evidence that not only shows that reward DDTs can often\nachieve competitive RL performance when compared with larger capacity deep\nneural network reward functions but also demonstrates the diagnostic utility of\nour framework in checking alignment of learned reward functions. We also\nobserve that the choice between soft and hard (argmax) output of reward DDT\nreveals a tension between wanting highly shaped rewards to ensure good RL\nperformance, while also wanting simpler, more interpretable rewards. Videos and\ncode, are available at: https://sites.google.com/view/ddt-rlhf\n","authors":["Akansha Kalra","Daniel S. Brown"],"pdf_url":"https://arxiv.org/pdf/2306.13004v5.pdf","comment":"Accepted at RLC 2024"},{"id":"http://arxiv.org/abs/2409.03977v1","updated":"2024-09-06T01:54:35Z","published":"2024-09-06T01:54:35Z","title":"Bi-modality Images Transfer with a Discrete Process Matching Method","summary":" Recently, medical image synthesis gains more and more popularity, along with\nthe rapid development of generative models. Medical image synthesis aims to\ngenerate an unacquired image modality, often from other observed data\nmodalities. Synthesized images can be used for clinical diagnostic assistance,\ndata augmentation for model training and validation or image quality improving.\nIn the meanwhile, the flow-based models are among the successful generative\nmodels for the ability of generating realistic and high-quality synthetic\nimages. 
However, most flow-based models must compute flow ordinary\ndifferential equation (ODE) evolution steps in the transfer process, so their\nperformance is significantly limited by heavy computation time due to a large\nnumber of time iterations. In this paper, we propose a novel flow-based model,\nnamely Discrete Process Matching (DPM), to accomplish bi-modality image\ntransfer tasks. Unlike other flow-matching-based models, we propose to\nutilize both forward and backward ODE flows and to enhance consistency on the\nintermediate images of a few discrete time steps, resulting in a transfer process\nwith far fewer iteration steps while maintaining high-quality generation for\nboth modalities. Our experiments on three datasets of MRI T1/T2 and CT/MRI\ndemonstrate that DPM outperforms other state-of-the-art flow-based methods for\nbi-modality image synthesis, achieving higher image quality with lower\ncomputation time cost.\n","authors":["Zhe Xiong","Qiaoqiao Ding","Xiaoqun Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04298v3","updated":"2024-09-06T01:14:26Z","published":"2024-04-04T20:27:37Z","title":"SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated\n Responses","summary":" Can LLMs consistently improve their previous outputs for better results? For\nthis to be true, LLMs would need to be better at discriminating among\npreviously-generated alternatives than at generating initial responses. We\nexplore the validity of this hypothesis in practice. We first formulate a\nunified framework that allows us to compare the generative and discriminative\ncapability of any model on any task. In our resulting experimental analysis of\nseveral open-source and industrial LLMs, we observe that models are not\nreliably better at discriminating among previously-generated alternatives than\ngenerating initial responses. This finding challenges the notion that LLMs may\nbe able to enhance their performance only through their own judgment.\n","authors":["Dongwei Jiang","Jingyu Zhang","Orion Weller","Nathaniel Weir","Benjamin Van Durme","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2404.04298v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03962v1","updated":"2024-09-06T01:07:29Z","published":"2024-09-06T01:07:29Z","title":"Average Causal Effect Estimation in DAGs with Hidden Variables:\n Extensions of Back-Door and Front-Door Criteria","summary":" The identification theory for causal effects in directed acyclic graphs\n(DAGs) with hidden variables is well-developed, but methods for estimating and\ninferring functionals beyond the g-formula remain limited. Previous studies\nhave proposed semiparametric estimators for identifiable functionals in a broad\nclass of DAGs with hidden variables. While demonstrating double robustness in\nsome models, existing estimators face challenges, particularly with density\nestimation and numerical integration for continuous variables, and their\nestimates may fall outside the parameter space of the target estimand. Their\nasymptotic properties are also underexplored, especially when using flexible\nstatistical and machine learning models for nuisance estimation. This study\naddresses these challenges by introducing novel one-step corrected plug-in and\ntargeted minimum loss-based estimators of causal effects for a class of DAGs\nthat extend classical back-door and front-door criteria (known as the treatment\nprimal fixability criterion in prior literature). 
These estimators leverage\nmachine learning to minimize modeling assumptions while ensuring key\nstatistical properties such as asymptotic linearity, double robustness,\nefficiency, and staying within the bounds of the target parameter space. We\nestablish conditions for nuisance functional estimates in terms of L2(P)-norms\nto achieve root-n consistent causal effect estimates. To facilitate practical\napplication, we have developed the flexCausal package in R.\n","authors":["Anna Guo","Razieh Nabi"],"pdf_url":"https://arxiv.org/pdf/2409.03962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09932v2","updated":"2024-09-06T00:46:40Z","published":"2024-04-15T16:58:28Z","title":"Foundational Challenges in Assuring Alignment and Safety of Large\n Language Models","summary":" This work identifies 18 foundational challenges in assuring the alignment and\nsafety of large language models (LLMs). These challenges are organized into\nthree different categories: scientific understanding of LLMs, development and\ndeployment methods, and sociotechnical challenges. Based on the identified\nchallenges, we pose $200+$ concrete research questions.\n","authors":["Usman Anwar","Abulhair Saparov","Javier Rando","Daniel Paleka","Miles Turpin","Peter Hase","Ekdeep Singh Lubana","Erik Jenner","Stephen Casper","Oliver Sourbut","Benjamin L. Edelman","Zhaowei Zhang","Mario Günther","Anton Korinek","Jose Hernandez-Orallo","Lewis Hammond","Eric Bigelow","Alexander Pan","Lauro Langosco","Tomasz Korbak","Heidi Zhang","Ruiqi Zhong","Seán Ó hÉigeartaigh","Gabriel Recchia","Giulio Corsi","Alan Chan","Markus Anderljung","Lilian Edwards","Aleksandar Petrov","Christian Schroeder de Witt","Sumeet Ramesh Motwan","Yoshua Bengio","Danqi Chen","Philip H. S. Torr","Samuel Albanie","Tegan Maharaj","Jakob Foerster","Florian Tramer","He He","Atoosa Kasirzadeh","Yejin Choi","David Krueger"],"pdf_url":"https://arxiv.org/pdf/2404.09932v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03956v1","updated":"2024-09-06T00:43:21Z","published":"2024-09-06T00:43:21Z","title":"Algorithmic Collusion Without Threats","summary":" There has been substantial recent concern that pricing algorithms might learn\nto ``collude.'' Supra-competitive prices can emerge as a Nash equilibrium of\nrepeated pricing games, in which sellers play strategies which threaten to\npunish their competitors who refuse to support high prices, and these\nstrategies can be automatically learned. In fact, a standard economic intuition\nis that supra-competitive prices emerge from either the use of threats, or a\nfailure of one party to optimize their payoff. Is this intuition correct? Would\npreventing threats in algorithmic decision-making prevent supra-competitive\nprices when sellers are optimizing for their own revenue? No. We show that\nsupra-competitive prices can emerge even when both players are using algorithms\nwhich do not encode threats, and which optimize for their own revenue. We study\nsequential pricing games in which a first mover deploys an algorithm and then a\nsecond mover optimizes within the resulting environment. We show that if the\nfirst mover deploys any algorithm with a no-regret guarantee, and then the\nsecond mover even approximately optimizes within this now static environment,\nmonopoly-like prices arise. 
The result holds for any no-regret learning\nalgorithm deployed by the first mover and for any pricing policy of the second\nmover that obtains them profit at least as high as a random pricing would --\nand hence the result applies even when the second mover is optimizing only\nwithin a space of non-responsive pricing distributions which are incapable of\nencoding threats. In fact, there exists a set of strategies, neither of which\nexplicitly encode threats that form a Nash equilibrium of the simultaneous\npricing game in algorithm space, and lead to near monopoly prices. This\nsuggests that the definition of ``algorithmic collusion'' may need to be\nexpanded, to include strategies without explicitly encoded threats.\n","authors":["Eshwar Ram Arunachaleswaran","Natalie Collina","Sampath Kannan","Aaron Roth","Juba Ziani"],"pdf_url":"https://arxiv.org/pdf/2409.03956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03953v1","updated":"2024-09-06T00:34:44Z","published":"2024-09-06T00:34:44Z","title":"Epistemic Uncertainty and Observation Noise with the Neural Tangent\n Kernel","summary":" Recent work has shown that training wide neural networks with gradient\ndescent is formally equivalent to computing the mean of the posterior\ndistribution in a Gaussian Process (GP) with the Neural Tangent Kernel (NTK) as\nthe prior covariance and zero aleatoric noise \\parencite{jacot2018neural}. In\nthis paper, we extend this framework in two ways. First, we show how to deal\nwith non-zero aleatoric noise. Second, we derive an estimator for the posterior\ncovariance, giving us a handle on epistemic uncertainty. Our proposed approach\nintegrates seamlessly with standard training pipelines, as it involves training\na small number of additional predictors using gradient descent on a mean\nsquared error loss. We demonstrate the proof-of-concept of our method through\nempirical evaluation on synthetic regression.\n","authors":["Sergio Calvo-Ordoñez","Konstantina Palla","Kamil Ciosek"],"pdf_url":"https://arxiv.org/pdf/2409.03953v1.pdf","comment":"11 pages including appendix"},{"id":"http://arxiv.org/abs/2409.03948v1","updated":"2024-09-06T00:13:05Z","published":"2024-09-06T00:13:05Z","title":"The Veracity Problem: Detecting False Information and its Propagation on\n Online Social Media Networks","summary":" Detecting false information on social media is critical in mitigating its\nnegative societal impacts. To reduce the propagation of false information,\nautomated detection provide scalable, unbiased, and cost-effective methods.\nHowever, there are three potential research areas identified which once solved\nimprove detection. First, current AI-based solutions often provide a\nuni-dimensional analysis on a complex, multi-dimensional issue, with solutions\ndiffering based on the features used. Furthermore, these methods do not account\nfor the temporal and dynamic changes observed within the document's life cycle.\nSecond, there has been little research on the detection of coordinated\ninformation campaigns and in understanding the intent of the actors and the\ncampaign. Thirdly, there is a lack of consideration of cross-platform analysis,\nwith existing datasets focusing on a single platform, such as X, and detection\nmodels designed for specific platform.\n This work aims to develop methods for effective detection of false\ninformation and its propagation. To this end, firstly we aim to propose the\ncreation of an ensemble multi-faceted framework that leverages multiple aspects\nof false information. 
Secondly, we propose a method to identify actors and\ntheir intent when working in coordination to manipulate a narrative. Thirdly,\nwe aim to analyse the impact of cross-platform interactions on the propagation\nof false information via the creation of a new dataset.\n","authors":["Sarah Condran"],"pdf_url":"https://arxiv.org/pdf/2409.03948v1.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.18148v2","updated":"2024-09-06T00:02:52Z","published":"2024-07-25T15:58:56Z","title":"StraightLine: An End-to-End Resource-Aware Scheduler for Machine\n Learning Application Requests","summary":" The life cycle of machine learning (ML) applications consists of two stages:\nmodel development and model deployment. However, traditional ML systems (e.g.,\ntraining-specific or inference-specific systems) focus on one particular stage\nor phase of the life cycle of ML applications. These systems often aim at\noptimizing model training or accelerating model inference, and they frequently\nassume homogeneous infrastructure, which may not always reflect real-world\nscenarios that include cloud data centers, local servers, containers, and\nserverless platforms. We present StraightLine, an end-to-end resource-aware\nscheduler that schedules the optimal resources (e.g., container, virtual\nmachine, or serverless) for different ML application requests in a hybrid\ninfrastructure. The key innovation is an empirical dynamic placing algorithm\nthat intelligently places requests based on their unique characteristics (e.g.,\nrequest frequency, input data size, and data distribution). In contrast to\nexisting ML systems, StraightLine offers end-to-end resource-aware placement,\nthereby it can significantly reduce response time and failure rate for model\ndeployment when facing different computing resources in the hybrid\ninfrastructure.\n","authors":["Cheng-Wei Ching","Boyuan Guan","Hailu Xu","Liting Hu"],"pdf_url":"https://arxiv.org/pdf/2407.18148v2.pdf","comment":"6 pages, 8 figures, to appear in AIoTC'24"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.16879v2","updated":"2024-09-06T17:17:16Z","published":"2024-08-29T20:05:02Z","title":"MSLIQA: Enhancing Learning Representations for Image Quality Assessment\n through Multi-Scale Learning","summary":" No-Reference Image Quality Assessment (NR-IQA) remains a challenging task due\nto the diversity of distortions and the lack of large annotated datasets. Many\nstudies have attempted to tackle these challenges by developing more accurate\nNR-IQA models, often employing complex and computationally expensive networks,\nor by bridging the domain gap between various distortions to enhance\nperformance on test datasets. In our work, we improve the performance of a\ngeneric lightweight NR-IQA model by introducing a novel augmentation strategy\nthat boosts its performance by almost 28\\%. This augmentation strategy enables\nthe network to better discriminate between different distortions in various\nparts of the image by zooming in and out. 
Additionally, the inclusion of\ntest-time augmentation further enhances performance, making our lightweight\nnetwork's results comparable to the current state-of-the-art models, simply\nthrough the use of augmentations.\n","authors":["Nasim Jamshidi Avanaki","Abhijay Ghildyal","Nabajeet Barman","Saman Zadtootaghaj"],"pdf_url":"https://arxiv.org/pdf/2408.16879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17057v2","updated":"2024-09-06T17:15:49Z","published":"2024-08-30T07:32:19Z","title":"LAR-IQA: A Lightweight, Accurate, and Robust No-Reference Image Quality\n Assessment Model","summary":" Recent advancements in the field of No-Reference Image Quality Assessment\n(NR-IQA) using deep learning techniques demonstrate high performance across\nmultiple open-source datasets. However, such models are typically very large\nand complex, making them less suitable for real-world deployment, especially\non resource- and battery-constrained mobile devices. To address this\nlimitation, we propose a compact, lightweight NR-IQA model that achieves\nstate-of-the-art (SOTA) performance on ECCV AIM UHD-IQA challenge validation\nand test datasets while also being nearly 5.7 times faster than the fastest\nSOTA model. Our model features a dual-branch architecture, with each branch\nseparately trained on synthetically and authentically distorted images, which\nenhances the model's generalizability across different distortion types. To\nimprove robustness under diverse real-world visual conditions, we additionally\nincorporate multiple color spaces during the training process. We also\ndemonstrate the higher accuracy of recently proposed Kolmogorov-Arnold Networks\n(KANs) for final quality regression as compared to the conventional Multi-Layer\nPerceptrons (MLPs). Our evaluation considering various open-source datasets\nhighlights the practical, high-accuracy, and robust performance of our proposed\nlightweight model. Code: https://github.com/nasimjamshidi/LAR-IQA.\n","authors":["Nasim Jamshidi Avanaki","Abhijay Ghildyal","Nabajeet Barman","Saman Zadtootaghaj"],"pdf_url":"https://arxiv.org/pdf/2408.17057v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04398v1","updated":"2024-09-06T16:43:04Z","published":"2024-09-06T16:43:04Z","title":"HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale\n Space Using Wearable IMUs and LiDAR","summary":" We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture\nmethod, aimed at accurately and efficiently creating a dynamic digital world,\ncontaining large-scale indoor-outdoor scenes, diverse human motions, rich\nhuman-human interactions, and human-environment interactions. By utilizing\nbody-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human\nmotions in unconstrained space without the need for external devices and\npre-built maps. This affords great flexibility and accessibility for\nhuman-centered interaction and 4D scene capturing in various environments.\nTaking into account that IMUs can capture spatially unrestricted human poses\nbut are prone to drift during long-term use, while LiDAR is stable for\nglobal localization but rough for local positions and orientations, HiSC4D\nemploys a joint optimization method, harmonizing all sensors and utilizing\nenvironment cues, yielding promising results for long-term capture in large\nscenes. 
To promote research on egocentric human interaction in large scenes and\nfacilitate downstream tasks, we also present a dataset, containing 8 sequences\nin 4 large scenes (200 to 5,000 $m^2$), providing 36k frames of accurate 4D\nhuman motions with SMPL annotations and dynamic scenes, 31k frames of cropped\nhuman point clouds, and scene mesh of the environment. A variety of scenarios,\nsuch as the basketball gym and commercial street, alongside challenging human\nmotions, such as daily greeting, one-on-one basketball playing, and tour\nguiding, demonstrate the effectiveness and the generalization ability of\nHiSC4D. The dataset and code will be published at\nwww.lidarhumanmotion.net/hisc4d for research purposes.\n","authors":["Yudi Dai","Zhiyong Wang","Xiping Lin","Chenglu Wen","Lan Xu","Siqi Shen","Yuexin Ma","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04398v1.pdf","comment":"17 pages, 10 figures, Journal"},{"id":"http://arxiv.org/abs/2409.04388v1","updated":"2024-09-06T16:27:52Z","published":"2024-09-06T16:27:52Z","title":"Question-Answering Dense Video Events","summary":" Multimodal Large Language Models (MLLMs) have shown excellent performance in\nquestion-answering of single-event videos. In this paper, we present\nquestion-answering dense video events, a novel task that requires answering and\ngrounding the dense-event questions in long videos, thus challenging MLLMs to\nfaithfully comprehend and reason about multiple events occurring over extended\ntime periods. To facilitate the study, we construct DeVE-QA - a dataset\nfeaturing 78K questions about 26K events on 10.6K long videos. We then\nbenchmark and show that existing MLLMs excelling at single-event QA struggle to\nperform well in DeVE-QA. For improvement, we propose DeVi, a novel\ntraining-free MLLM approach that highlights a hierarchical captioning module, a\ntemporal event memory module, and a self-consistency checking module to\nrespectively detect, contextualize and memorize, and ground dense events in\nlong videos for question answering. Extensive experiments show that DeVi is\nsuperior at answering dense-event questions and grounding relevant video\nmoments. Compared with existing MLLMs, it achieves a remarkable increase of 4.1\npercent and 3.7 percent for G(round)QA accuracy on DeVE-QA and NExT-GQA\nrespectively.\n","authors":["Hangyu Qin","Junbin Xiao","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2409.04388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04013v1","updated":"2024-09-06T03:53:59Z","published":"2024-09-06T03:53:59Z","title":"3D-GP-LMVIC: Learning-based Multi-View Image Coding with 3D Gaussian\n Geometric Priors","summary":" Multi-view image compression is vital for 3D-related applications. To\neffectively model correlations between views, existing methods typically\npredict disparity between two views on a 2D plane, which works well for small\ndisparities, such as in stereo images, but struggles with larger disparities\ncaused by significant view changes. To address this, we propose a novel\napproach: learning-based multi-view image coding with 3D Gaussian geometric\npriors (3D-GP-LMVIC). Our method leverages 3D Gaussian Splatting to derive\ngeometric priors of the 3D scene, enabling more accurate disparity estimation\nacross views within the compression model. Additionally, we introduce a depth\nmap compression model to reduce redundancy in geometric information between\nviews. 
A multi-view sequence ordering method is also proposed to enhance\ncorrelations between adjacent views. Experimental results demonstrate that\n3D-GP-LMVIC surpasses both traditional and learning-based methods in\nperformance, while maintaining fast encoding and decoding speed.\n","authors":["Yujun Huang","Bin Chen","Niu Lian","Baoyi An","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2409.04013v1.pdf","comment":"19 pages, 8 figures, conference"}]},"2024-09-09T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.05840v1","updated":"2024-09-09T17:44:00Z","published":"2024-09-09T17:44:00Z","title":"MMEvol: Empowering Multimodal Large Language Models with Evol-Instruct","summary":" The development of Multimodal Large Language Models (MLLMs) has seen\nsignificant advancements. However, the quantity and quality of multimodal\ninstruction data have emerged as significant bottlenecks in their progress.\nManually creating multimodal instruction data is both time-consuming and\ninefficient, posing challenges in producing instructions of high complexity.\nMoreover, distilling instruction data from black-box commercial models (e.g.,\nGPT-4o, GPT-4V) often results in simplistic instruction data, which constrains\nperformance to that of these models. The challenge of curating diverse and\ncomplex instruction data remains substantial. We propose MMEvol, a novel\nmultimodal instruction data evolution framework that combines fine-grained\nperception evolution, cognitive reasoning evolution, and interaction evolution.\nThis iterative approach breaks through data quality bottlenecks to generate a\ncomplex and diverse image-text instruction dataset, thereby empowering MLLMs\nwith enhanced capabilities. Beginning with an initial set of instructions,\nSEED-163K, we utilize MMEvol to systematically broaden the diversity of\ninstruction types, integrate reasoning steps to enhance cognitive\ncapabilities, and extract detailed information from images to improve visual\nunderstanding and robustness. To comprehensively evaluate the effectiveness of\nour data, we train LLaVA-NeXT using the evolved data and conduct experiments\nacross 13 vision-language tasks. Compared to the baseline trained with seed\ndata, our approach achieves an average accuracy improvement of 3.1 points and\nreaches state-of-the-art (SOTA) performance on 9 of these tasks.\n","authors":["Run Luo","Haonan Zhang","Longze Chen","Ting-En Lin","Xiong Liu","Yuchuan Wu","Min Yang","Minzheng Wang","Pengpeng Zeng","Lianli Gao","Heng Tao Shen","Yunshui Li","Xiaobo Xia","Fei Huang","Jingkuan Song","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2409.05840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14770v5","updated":"2024-09-09T17:37:16Z","published":"2023-05-24T06:19:14Z","title":"Using Natural Language Explanations to Rescale Human Judgments","summary":" The rise of large language models (LLMs) has brought a critical need for\nhigh-quality human-labeled data, particularly for processes like human feedback\nand evaluation. A common practice is to label data via consensus annotation\nover human judgments. However, annotators' judgments for subjective tasks can\ndiffer in many ways: they may reflect different qualitative judgments about an\nexample, and they may be mapped to a labeling scheme in different ways. 
We show\nthat these nuances can be captured by natural language explanations, and\npropose a method to rescale ordinal annotations and explanations using LLMs.\nSpecifically, we feed annotators' Likert ratings and corresponding explanations\ninto an LLM and prompt it to produce a numeric score anchored in a scoring\nrubric. These scores should reflect the annotators' underlying assessments of\nthe example. The rubric can be designed or modified after annotation, and\ninclude distinctions that may not have been known when the original error\ntaxonomy was devised. We explore our technique in the context of rating system\noutputs for a document-grounded question answering task, where LLMs achieve\nnear-human performance. Our method rescales the raw judgments without impacting\nagreement and brings the scores closer to human judgments grounded in the same\nscoring rubric.\n","authors":["Manya Wadhwa","Jifan Chen","Junyi Jessy Li","Greg Durrett"],"pdf_url":"https://arxiv.org/pdf/2305.14770v5.pdf","comment":"Data available at\n https://github.com/ManyaWadhwa/explanation_based_rescaling"},{"id":"http://arxiv.org/abs/2405.20611v2","updated":"2024-09-09T17:35:58Z","published":"2024-05-31T03:57:19Z","title":"Bi-Directional Transformers vs. word2vec: Discovering Vulnerabilities in\n Lifted Compiled Code","summary":" Detecting vulnerabilities within compiled binaries is challenging due to lost\nhigh-level code structures and other factors such as architectural\ndependencies, compilers, and optimization options. To address these obstacles,\nthis research explores vulnerability detection using natural language\nprocessing (NLP) embedding techniques with word2vec, BERT, and RoBERTa to learn\nsemantics from intermediate representation (LLVM IR) code. Long short-term\nmemory (LSTM) neural networks were trained on embeddings from encoders created\nusing approximately 48k LLVM functions from the Juliet dataset. This study is\npioneering in its comparison of word2vec models with multiple bidirectional\ntransformer (BERT, RoBERTa) embeddings built using LLVM code to train neural\nnetworks to detect vulnerabilities in compiled binaries. word2vec Skip-Gram\nmodels achieved 92% validation accuracy in detecting vulnerabilities,\noutperforming word2vec Continuous Bag of Words (CBOW), BERT, and RoBERTa. This\nsuggests that complex contextual embeddings may not provide advantages over\nsimpler word2vec models for this task when a limited number (e.g. 48K) of data\nsamples are used to train the bidirectional transformer-based models. The\ncomparative results provide novel insights into selecting optimal embeddings\nfor learning compiler-independent semantic code representations to advance\nmachine learning detection of vulnerabilities in compiled binaries.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2405.20611v2.pdf","comment":"Updated with improvements\""},{"id":"http://arxiv.org/abs/2409.05816v1","updated":"2024-09-09T17:23:29Z","published":"2024-09-09T17:23:29Z","title":"Improving Pretraining Data Using Perplexity Correlations","summary":" Quality pretraining data is often seen as the key to high-performance\nlanguage models. However, progress in understanding pretraining data has been\nslow due to the costly pretraining runs required for data selection\nexperiments. We present a framework that avoids these costs and selects\nhigh-quality pretraining data without any LLM training of our own. 
Our work is\nbased on a simple observation: LLM losses on many pretraining texts are\ncorrelated with downstream benchmark performance, and selecting\nhigh-correlation documents is an effective pretraining data selection method.\nWe build a new statistical framework for data selection centered around\nestimates of perplexity-benchmark correlations and perform data selection using\na sample of 90 LLMs taken from the Open LLM Leaderboard on texts from tens of\nthousands of web domains. In controlled pretraining experiments at the 160M\nparameter scale on 8 benchmarks, our approach outperforms DSIR on every\nbenchmark, while matching the best data selector found in DataComp-LM, a\nhand-engineered bigram classifier.\n","authors":["Tristan Thrush","Christopher Potts","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2409.05816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05806v1","updated":"2024-09-09T17:11:51Z","published":"2024-09-09T17:11:51Z","title":"Benchmarking Chinese Knowledge Rectification in Large Language Models","summary":" While Large Language Models (LLMs) exhibit remarkable generative\ncapabilities, they are not without flaws, particularly in the form of\nhallucinations. This issue is even more pronounced when LLMs are applied to\nspecific languages and domains. For example, LLMs may generate nonsense\ninformation when handling Chinese ancient poetry, proverbs, or idioms, owing to\nthe lack of specific knowledge. To this end, this paper introduces a benchmark\nfor rectifying Chinese knowledge in LLMs via knowledge editing. Specifically,\nwe introduce a new Chinese dataset, CKnowEdit, by collecting seven types of\nknowledge from various sources, including classical texts, idioms, and content\nfrom Baidu Tieba Ruozhiba, thereby accounting for the unique polyphony,\nantithesis, and logical constructs inherent in the Chinese language. Through\nthe analysis of this dataset, we uncover the challenges faced by current LLMs\nin mastering Chinese. Furthermore, our evaluation of state-of-the-art knowledge\nediting techniques on this dataset unveils the substantial scope for advancement\nin the rectification of Chinese knowledge. Code and dataset are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Tianhe Lu","Jizhan Fang","Yunzhi Yao","Xin Xu","Ningyu Zhang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.05806v1.pdf","comment":"Ongoing work; code and dataset are available at\n https://github.com/zjunlp/EasyEdit"},{"id":"http://arxiv.org/abs/2409.05799v1","updated":"2024-09-09T17:03:38Z","published":"2024-09-09T17:03:38Z","title":"PDAF: A Phonetic Debiasing Attention Framework For Speaker Verification","summary":" Speaker verification systems are crucial for authenticating identity through\nvoice. Traditionally, these systems focus on comparing feature vectors,\noverlooking the speech's content. However, this paper challenges this by\nhighlighting the importance of phonetic dominance, a measure of the frequency\nor duration of phonemes, as a crucial cue in speaker verification. A novel\nPhoneme Debiasing Attention Framework (PDAF) is introduced, integrating with\nexisting attention frameworks to mitigate biases caused by phonetic dominance.\nPDAF adjusts the weighting for each phoneme and influences feature extraction,\nallowing for a more nuanced analysis of speech. This approach paves the way for\nmore accurate and reliable identity authentication through voice. 
Furthermore,\nby employing various weighting strategies, we evaluate the influence of\nphonetic features on the efficacy of the speaker verification system.\n","authors":["Massa Baali","Abdulhamid Aldoobi","Hira Dhamyal","Rita Singh","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2409.05799v1.pdf","comment":"Accepted to SLT"},{"id":"http://arxiv.org/abs/2408.14774v2","updated":"2024-09-09T16:41:36Z","published":"2024-08-27T04:31:58Z","title":"Instruct-SkillMix: A Powerful Pipeline for LLM Instruction Tuning","summary":" We introduce Instruct-SkillMix, an automated approach for creating diverse,\nhigh quality SFT data. The Instruct-SkillMix pipeline involves two stages, each\nleveraging an existing powerful LLM: (1) Skill extraction: uses the LLM to\nextract core \"skills\" for instruction-following, either from existing datasets,\nor by directly prompting the model; (2) Data generation: uses the powerful LLM\nto generate (instruction, response) data that exhibit a randomly chosen pair of\nthese skills. Here, the use of random skill combinations promotes diversity and\ndifficulty.\n Vanilla SFT (i.e., no PPO, DPO, or RL methods) on data generated from\nInstruct-SkillMix leads to strong gains on instruction following benchmarks\nsuch as AlpacaEval 2.0, MT-Bench, and WildBench. With just $4$K examples,\nLLaMA-3-8B-Base achieves 42.76% length-controlled win rate on AlpacaEval 2.0.\nTo our knowledge, this achieves state-of-the-art performance among all models\nthat have only undergone SFT (no RL methods) and competes with proprietary\nmodels such as Claude 3 Opus and LLaMA-3.1-405B-Instruct.\n Ablation studies also suggest plausible reasons for why creating open\ninstruction-tuning datasets via naive crowd-sourcing has proved difficult.\nIntroducing low quality answers (\"shirkers\") in $20\\%$ of Instruct-SkillMix\nexamples causes performance to plummet, sometimes catastrophically.\n The Instruct-SkillMix pipeline is flexible and is adaptable to other\nsettings.\n","authors":["Simran Kaur","Simon Park","Anirudh Goyal","Sanjeev Arora"],"pdf_url":"https://arxiv.org/pdf/2408.14774v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05771v1","updated":"2024-09-09T16:33:16Z","published":"2024-09-09T16:33:16Z","title":"Evidence from fMRI Supports a Two-Phase Abstraction Process in Language\n Models","summary":" Research has repeatedly demonstrated that intermediate hidden states\nextracted from large language models are able to predict measured brain\nresponse to natural language stimuli. Yet, very little is known about the\nrepresentation properties that enable this high prediction performance. Why is\nit the intermediate layers, and not the output layers, that are most capable\nfor this unique and highly general transfer task? In this work, we show that\nevidence from language encoding models in fMRI supports the existence of a\ntwo-phase abstraction process within LLMs. We use manifold learning methods to\nshow that this abstraction process naturally arises over the course of training\na language model and that the first \"composition\" phase of this abstraction\nprocess is compressed into fewer layers as training continues. Finally, we\ndemonstrate a strong correspondence between layerwise encoding performance and\nthe intrinsic dimensionality of representations from LLMs. We give initial\nevidence that this correspondence primarily derives from the inherent\ncompositionality of LLMs and not their next-word prediction properties.\n","authors":["Emily Cheng","Richard J. 
Antonello"],"pdf_url":"https://arxiv.org/pdf/2409.05771v1.pdf","comment":"Equal contribution from both authors. Submitted to NeurIPS NeuroAI\n workshop 2024"},{"id":"http://arxiv.org/abs/2406.10999v3","updated":"2024-09-09T16:28:09Z","published":"2024-06-16T16:25:22Z","title":"Balancing Rigor and Utility: Mitigating Cognitive Biases in Large\n Language Models for Multiple-Choice Questions","summary":" This paper examines the role of cognitive biases in the decision-making\nprocesses of large language models (LLMs), challenging the conventional goal of\neliminating all biases. We show that certain cognitive biases when properly\nbalanced, can enhance decision-making efficiency through rational deviations\nand heuristic shortcuts. By introducing heuristic moderation and an abstention\noption, which allows LLMs to withhold responses when uncertain, we reduce error\nrates, improve decision accuracy, and optimize decision rates. Using the\nBalance Rigor and Utility (BRU) dataset, developed through expert\ncollaboration, our findings demonstrate that targeted inspection of cognitive\nbiases aligns LLM decisions more closely with human reasoning, enhancing\nreliability and suggesting strategies for future improvements. This approach\noffers a novel way to leverage cognitive biases to improve the practical\nutility of LLMs across various applications.\n","authors":["Liman Wang","Hanyang Zhong","Wenting Cao","Zeyuan Sun"],"pdf_url":"https://arxiv.org/pdf/2406.10999v3.pdf","comment":"This article is currently under review. All data will be open on\n GitHub once the review is complete.\n https://github.com/limanwang/Balancing-Rigor-and-Utility"},{"id":"http://arxiv.org/abs/2311.18799v2","updated":"2024-09-09T16:00:04Z","published":"2023-11-30T18:43:51Z","title":"X-InstructBLIP: A Framework for aligning X-Modal instruction-aware\n representations to LLMs and Emergent Cross-modal Reasoning","summary":" Recent research has achieved significant advancements in visual reasoning\ntasks through learning image-to-language projections and leveraging the\nimpressive reasoning abilities of Large Language Models (LLMs). This paper\nintroduces an efficient and effective framework that integrates multiple\nmodalities (images, 3D, audio and video) to a frozen LLM and demonstrates an\nemergent ability for cross-modal reasoning (2+ modality inputs). Our approach\nexplores two distinct projection mechanisms: Q-Formers and Linear Projections\n(LPs). Through extensive experimentation across all four modalities on 16\nbenchmarks, we explore both methods and assess their adaptability in integrated\nand separate cross-modal reasoning. The Q-Former projection demonstrates\nsuperior performance in single modality scenarios and adaptability in joint\nversus discriminative reasoning involving two or more modalities. However, it\nexhibits lower generalization capabilities than linear projection in contexts\nwhere task-modality data are limited. To enable this framework, we devise a\nscalable pipeline that automatically generates high-quality, instruction-tuning\ndatasets from readily available captioning data across different modalities,\nand contribute 24K QA data for audio and 250K QA data for 3D. 
To facilitate\nfurther research in cross-modal reasoning, we introduce the DisCRn\n(Discriminative Cross-modal Reasoning) benchmark comprising 9K audio-video QA\nsamples and 28K image-3D QA samples that require the model to reason\ndiscriminatively across disparate input modalities.\n","authors":["Artemis Panagopoulou","Le Xue","Ning Yu","Junnan Li","Dongxu Li","Shafiq Joty","Ran Xu","Silvio Savarese","Caiming Xiong","Juan Carlos Niebles"],"pdf_url":"https://arxiv.org/pdf/2311.18799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09481v2","updated":"2024-09-09T15:57:34Z","published":"2024-08-18T13:51:01Z","title":"PanoSent: A Panoptic Sextuple Extraction Benchmark for Multimodal\n Conversational Aspect-based Sentiment Analysis","summary":" While existing Aspect-based Sentiment Analysis (ABSA) has received extensive\neffort and advancement, there are still gaps in defining a more holistic\nresearch target seamlessly integrating multimodality, conversation context,\nfine-granularity, and also covering the changing sentiment dynamics as well as\ncognitive causal rationales. This paper bridges the gaps by introducing a\nmultimodal conversational ABSA, where two novel subtasks are proposed: 1)\nPanoptic Sentiment Sextuple Extraction, panoramically recognizing holder,\ntarget, aspect, opinion, sentiment, rationale from multi-turn multi-party\nmultimodal dialogue. 2) Sentiment Flipping Analysis, detecting the dynamic\nsentiment transformation throughout the conversation with the causal reasons.\nTo benchmark the tasks, we construct PanoSent, a dataset annotated both\nmanually and automatically, featuring high quality, large scale, multimodality,\nmultilingualism, multi-scenarios, and covering both implicit and explicit\nsentiment elements. To effectively address the tasks, we devise a novel\nChain-of-Sentiment reasoning framework, together with a novel multimodal large\nlanguage model (namely Sentica) and a paraphrase-based verification mechanism.\nExtensive evaluations demonstrate the superiority of our methods over strong\nbaselines, validating the efficacy of all our proposed methods. The work is\nexpected to open up a new era for the ABSA community, and thus all our codes\nand data are open at https://PanoSent.github.io/\n","authors":["Meng Luo","Hao Fei","Bobo Li","Shengqiong Wu","Qian Liu","Soujanya Poria","Erik Cambria","Mong-Li Lee","Wynne Hsu"],"pdf_url":"https://arxiv.org/pdf/2408.09481v2.pdf","comment":"Accepted by ACM MM 2024 (Oral)"},{"id":"http://arxiv.org/abs/2409.05732v1","updated":"2024-09-09T15:42:19Z","published":"2024-09-09T15:42:19Z","title":"Towards Democratizing Multilingual Large Language Models For Medicine\n Through A Two-Stage Instruction Fine-tuning Approach","summary":" Open-source, multilingual medical large language models (LLMs) have the\npotential to serve linguistically diverse populations across different regions.\nAdapting generic LLMs for healthcare often requires continual pretraining, but\nthis approach is computationally expensive and sometimes impractical.\nInstruction fine-tuning on a specific task may not always guarantee optimal\nperformance due to the lack of broader domain knowledge that the model needs to\nunderstand and reason effectively in diverse scenarios. To address these\nchallenges, we introduce two multilingual instruction fine-tuning datasets,\nMMed-IFT and MMed-IFT-MC, containing over 200k high-quality medical samples in\nsix languages. 
We propose a two-stage training paradigm: the first stage\ninjects general medical knowledge using MMed-IFT, while the second stage\nfine-tunes task-specific multiple-choice questions with MMed-IFT-MC. Our method\nachieves competitive results on both English and multilingual benchmarks,\nstriking a balance between computational efficiency and performance. We plan to\nmake our dataset and model weights public at\n\\url{https://github.com/SpassMed/Med-Llama3} in the future.\n","authors":["Meng Zhou","Surajsinh Parmar","Anubhav Bhatti"],"pdf_url":"https://arxiv.org/pdf/2409.05732v1.pdf","comment":"Technical Report v1, work in progress"},{"id":"http://arxiv.org/abs/2409.05721v1","updated":"2024-09-09T15:33:07Z","published":"2024-09-09T15:33:07Z","title":"Referring Expression Generation in Visually Grounded Dialogue with\n Discourse-aware Comprehension Guiding","summary":" We propose an approach to referring expression generation (REG) in visually\ngrounded dialogue that is meant to produce referring expressions (REs) that are\nboth discriminative and discourse-appropriate. Our method constitutes a\ntwo-stage process. First, we model REG as a text- and image-conditioned\nnext-token prediction task. REs are autoregressively generated based on their\npreceding linguistic context and a visual representation of the referent.\nSecond, we propose the use of discourse-aware comprehension guiding as part of\na generate-and-rerank strategy through which candidate REs generated with our\nREG model are reranked based on their discourse-dependent discriminatory power.\nResults from our human evaluation indicate that our proposed two-stage approach\nis effective in producing discriminative REs, with higher performance in terms\nof text-image retrieval accuracy for reranked REs compared to those generated\nusing greedy decoding.\n","authors":["Bram Willemsen","Gabriel Skantze"],"pdf_url":"https://arxiv.org/pdf/2409.05721v1.pdf","comment":"Accepted for publication at INLG 2024"},{"id":"http://arxiv.org/abs/2409.05677v1","updated":"2024-09-09T14:44:19Z","published":"2024-09-09T14:44:19Z","title":"RegNLP in Action: Facilitating Compliance Through Automated Information\n Retrieval and Answer Generation","summary":" Regulatory documents, issued by governmental regulatory bodies, establish\nrules, guidelines, and standards that organizations must adhere to for legal\ncompliance. These documents, characterized by their length, complexity and\nfrequent updates, are challenging to interpret, requiring significant\nallocation of time and expertise on the part of organizations to ensure ongoing\ncompliance.Regulatory Natural Language Processing (RegNLP) is a\nmultidisciplinary subfield aimed at simplifying access to and interpretation of\nregulatory rules and obligations. 
We define an Automated Question-Passage\nGeneration task for RegNLP, create the ObliQA dataset containing 27,869\nquestions derived from the Abu Dhabi Global Markets (ADGM) financial regulation\ndocument collection, design a baseline Regulatory Information Retrieval and\nAnswer Generation system, and evaluate it with RePASs, a novel evaluation\nmetric that tests whether generated answers accurately capture all relevant\nobligations and avoid contradictions.\n","authors":["Tuba Gokhan","Kexin Wang","Iryna Gurevych","Ted Briscoe"],"pdf_url":"https://arxiv.org/pdf/2409.05677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05674v1","updated":"2024-09-09T14:41:57Z","published":"2024-09-09T14:41:57Z","title":"Evaluation of real-time transcriptions using end-to-end ASR models","summary":" Automatic Speech Recognition (ASR) or Speech-to-text (STT) has greatly\nevolved in the last few years. Traditional architectures based on pipelines\nhave been replaced by joint end-to-end (E2E) architectures that simplify and\nstreamline the model training process. In addition, new AI training methods,\nsuch as weak-supervised learning have reduced the need for high-quality audio\ndatasets for model training. However, despite all these advancements, little to\nno research has been done on real-time transcription. In real-time scenarios,\nthe audio is not pre-recorded, and the input audio must be fragmented to be\nprocessed by the ASR systems. To achieve real-time requirements, these\nfragments must be as short as possible to reduce latency. However, audio cannot\nbe split at any point as dividing an utterance into two separate fragments will\ngenerate an incorrect transcription. Also, shorter fragments provide less\ncontext for the ASR model. For this reason, it is necessary to design and test\ndifferent splitting algorithms to optimize the quality and delay of the\nresulting transcription. In this paper, three audio splitting algorithms are\nevaluated with different ASR models to determine their impact on both the\nquality of the transcription and the end-to-end delay. The algorithms are\nfragmentation at fixed intervals, voice activity detection (VAD), and\nfragmentation with feedback. The results are compared to the performance of the\nsame model, without audio fragmentation, to determine the effects of this\ndivision. The results show that VAD fragmentation provides the best quality\nwith the highest delay, whereas fragmentation at fixed intervals provides the\nlowest quality and the lowest delay. The newly proposed feedback algorithm\nexchanges a 2-4% increase in WER for a reduction of 1.5-2s delay, respectively,\nto the VAD splitting.\n","authors":["Carlos Arriaga","Alejandro Pozo","Javier Conde","Alvaro Alonso"],"pdf_url":"https://arxiv.org/pdf/2409.05674v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.16528v3","updated":"2024-09-09T14:31:26Z","published":"2024-05-26T11:29:57Z","title":"LoQT: Low-Rank Adapters for Quantized Pre-Training","summary":" Training of large neural networks requires significant computational\nresources. Despite advances using low-rank adapters and quantization,\npretraining of models such as LLMs on consumer hardware has not been possible\nwithout model sharding, offloading during training, or per-layer gradient\nupdates. To address these limitations, we propose LoQT, a method for\nefficiently training quantized models. 
LoQT uses gradient-based tensor\nfactorization to initialize low-rank trainable weight matrices that are\nperiodically merged into quantized full-rank weight matrices. Our approach is\nsuitable for both pretraining and fine-tuning of models, which we demonstrate\nexperimentally for language modeling and downstream task adaptation. We find\nthat LoQT enables efficient training of models up to 7B parameters on a\nconsumer-grade 24GB GPU. We also demonstrate the feasibility of training a 13B\nparameter model using per-layer gradient updates on the same hardware.\n","authors":["Sebastian Loeschcke","Mads Toftrup","Michael J. Kastoryano","Serge Belongie","Vésteinn Snæbjarnarson"],"pdf_url":"https://arxiv.org/pdf/2405.16528v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05653v1","updated":"2024-09-09T14:19:21Z","published":"2024-09-09T14:19:21Z","title":"Revisiting English Winogender Schemas for Consistency, Coverage, and\n Grammatical Case","summary":" While measuring bias and robustness in coreference resolution are important\ngoals, such measurements are only as good as the tools we use to measure them\nwith. Winogender schemas (Rudinger et al., 2018) are an influential dataset\nproposed to evaluate gender bias in coreference resolution, but a closer look\nat the data reveals issues with the instances that compromise their use for\nreliable evaluation, including treating different grammatical cases of pronouns\nin the same way, violations of template constraints, and typographical errors.\nWe identify these issues and fix them, contributing a new dataset: Winogender\n2.0. Our changes affect performance with state-of-the-art supervised\ncoreference resolution systems as well as all model sizes of the language model\nFLAN-T5, with F1 dropping on average 0.1 points. We also propose a new method\nto evaluate pronominal bias in coreference resolution that goes beyond the\nbinary. With this method and our new dataset which is balanced for grammatical\ncase, we empirically demonstrate that bias characteristics vary not just across\npronoun sets, but also across surface forms of those sets.\n","authors":["Vagrant Gautam","Julius Steuer","Eileen Bingert","Ray Johns","Anne Lauscher","Dietrich Klakow"],"pdf_url":"https://arxiv.org/pdf/2409.05653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00369v3","updated":"2024-09-09T13:50:30Z","published":"2024-08-31T07:10:16Z","title":"An Empirical Study on Information Extraction using Large Language Models","summary":" Human-like large language models (LLMs), especially the most powerful and\npopular ones in OpenAI's GPT family, have proven to be very helpful for many\nnatural language processing (NLP) related tasks. Therefore, various attempts\nhave been made to apply LLMs to information extraction (IE), which is a\nfundamental NLP task that involves extracting information from unstructured\nplain text. To demonstrate the latest representative progress in LLMs'\ninformation extraction ability, we assess the information extraction ability of\nGPT-4 (the latest version of GPT at the time of writing this paper) from four\nperspectives: Performance, Evaluation Criteria, Robustness, and Error Types.\nOur results suggest a visible performance gap between GPT-4 and\nstate-of-the-art (SOTA) IE methods. To alleviate this problem, considering the\nLLMs' human-like characteristics, we propose and analyze the effects of a\nseries of simple prompt-based methods, which can be generalized to other LLMs\nand NLP tasks. 
Rich experiments show our methods' effectiveness and some of\ntheir remaining issues in improving GPT-4's information extraction ability.\n","authors":["Ridong Han","Chaohao Yang","Tao Peng","Prayag Tiwari","Xiang Wan","Lu Liu","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.00369v3.pdf","comment":"Need to submit this paper as the replacement of arXiv:2305.14450"},{"id":"http://arxiv.org/abs/2409.05601v1","updated":"2024-09-09T13:35:52Z","published":"2024-09-09T13:35:52Z","title":"Longer is (Not Necessarily) Stronger: Punctuated Long-Sequence Training\n for Enhanced Speech Recognition and Translation","summary":" This paper presents a new method for training sequence-to-sequence models for\nspeech recognition and translation tasks. Instead of the traditional approach\nof training models on short segments containing only lowercase or partial\npunctuation and capitalization (PnC) sentences, we propose training on longer\nutterances that include complete sentences with proper punctuation and\ncapitalization. We achieve this by using the FastConformer architecture which\nallows training 1 Billion parameter models with sequences up to 60 seconds long\nwith full attention. However, while training with PnC enhances the overall\nperformance, we observed that accuracy plateaus when training on sequences\nlonger than 40 seconds across various evaluation settings. Our proposed method\nsignificantly improves punctuation and capitalization accuracy, showing a 25%\nrelative word error rate (WER) improvement on the Earnings-21 and Earnings-22\nbenchmarks. Additionally, training on longer audio segments increases the\noverall model accuracy across speech recognition and translation benchmarks.\nThe model weights and training code are open-sourced though NVIDIA NeMo.\n","authors":["Nithin Rao Koluguri","Travis Bartley","Hainan Xu","Oleksii Hrinchuk","Jagadeesh Balam","Boris Ginsburg","Georg Kucsko"],"pdf_url":"https://arxiv.org/pdf/2409.05601v1.pdf","comment":"Accepted at SLT 2024"},{"id":"http://arxiv.org/abs/2409.05592v1","updated":"2024-09-09T13:23:14Z","published":"2024-09-09T13:23:14Z","title":"ExDDI: Explaining Drug-Drug Interaction Predictions with Natural\n Language","summary":" Predicting unknown drug-drug interactions (DDIs) is crucial for improving\nmedication safety. Previous efforts in DDI prediction have typically focused on\nbinary classification or predicting DDI categories, with the absence of\nexplanatory insights that could enhance trust in these predictions. In this\nwork, we propose to generate natural language explanations for DDI predictions,\nenabling the model to reveal the underlying pharmacodynamics and\npharmacokinetics mechanisms simultaneously as making the prediction. To do\nthis, we have collected DDI explanations from DDInter and DrugBank and\ndeveloped various models for extensive experiments and analysis. Our models can\nprovide accurate explanations for unknown DDIs between known drugs. 
This paper\ncontributes new tools to the field of DDI prediction and lays a solid\nfoundation for further research on generating explanations for DDI predictions.\n","authors":["Zhaoyue Sun","Jiazheng Li","Gabriele Pergola","Yulan He"],"pdf_url":"https://arxiv.org/pdf/2409.05592v1.pdf","comment":"17 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.05591v1","updated":"2024-09-09T13:20:31Z","published":"2024-09-09T13:20:31Z","title":"MemoRAG: Moving towards Next-Gen RAG Via Memory-Inspired Knowledge\n Discovery","summary":" Retrieval-Augmented Generation (RAG) leverages retrieval tools to access\nexternal databases, thereby enhancing the generation quality of large language\nmodels (LLMs) through optimized context. However, the existing retrieval\nmethods are constrained inherently, as they can only perform relevance matching\nbetween explicitly stated queries and well-formed knowledge, but unable to\nhandle tasks involving ambiguous information needs or unstructured knowledge.\nConsequently, existing RAG systems are primarily effective for straightforward\nquestion-answering tasks. In this work, we propose \\textbf{MemoRAG}, a novel\nretrieval-augmented generation paradigm empowered by long-term memory. MemoRAG\nadopts a dual-system architecture. On the one hand, it employs a \\textit{light\nbut long-range} LLM to form the global memory of database. Once a task is\npresented, it generates draft answers, cluing the retrieval tools to locate\nuseful information within the database. On the other hand, it leverages an\n\\textit{expensive but expressive} LLM, which generates the ultimate answer\nbased on the retrieved information. Building on this general framework, we\nfurther optimize MemoRAG's performance by enhancing its cluing mechanism and\nmemorization capacity. In our experiment, MemoRAG achieves superior performance\nacross a variety of evaluation tasks, including both complex ones where\nconventional RAG fails and straightforward ones where RAG is commonly applied.\n","authors":["Hongjin Qian","Peitian Zhang","Zheng Liu","Kelong Mao","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2409.05591v1.pdf","comment":"Codes and models are in https://github.com/qhjqhj00/MemoRAG"},{"id":"http://arxiv.org/abs/2409.05583v1","updated":"2024-09-09T13:12:11Z","published":"2024-09-09T13:12:11Z","title":"Spatially-Aware Speaker for Vision-and-Language Navigation Instruction\n Generation","summary":" Embodied AI aims to develop robots that can \\textit{understand} and execute\nhuman language instructions, as well as communicate in natural languages. On\nthis front, we study the task of generating highly detailed navigational\ninstructions for the embodied robots to follow. Although recent studies have\ndemonstrated significant leaps in the generation of step-by-step instructions\nfrom sequences of images, the generated instructions lack variety in terms of\ntheir referral to objects and landmarks. Existing speaker models learn\nstrategies to evade the evaluation metrics and obtain higher scores even for\nlow-quality sentences. In this work, we propose SAS (Spatially-Aware Speaker),\nan instruction generator or \\textit{Speaker} model that utilises both\nstructural and semantic knowledge of the environment to produce richer\ninstructions. For training, we employ a reward learning method in an\nadversarial setting to avoid systematic bias introduced by language evaluation\nmetrics. Empirically, our method outperforms existing instruction generation\nmodels, evaluated using standard metrics. 
Our code is available at\n\\url{https://github.com/gmuraleekrishna/SAS}.\n","authors":["Muraleekrishna Gopinathan","Martin Masek","Jumana Abu-Khalaf","David Suter"],"pdf_url":"https://arxiv.org/pdf/2409.05583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12325v2","updated":"2024-09-09T13:10:50Z","published":"2024-08-22T12:00:31Z","title":"Improving Factuality in Large Language Models via Decoding-Time\n Hallucinatory and Truthful Comparators","summary":" Despite their remarkable capabilities, Large Language Models (LLMs) are prone\nto generate responses that contradict verifiable facts, i.e., unfaithful\nhallucination content. Existing efforts generally focus on optimizing model\nparameters or editing semantic representations, which compromise the internal\nfactual knowledge of target LLMs. In addition, hallucinations typically exhibit\nmultifaceted patterns in downstream tasks, limiting the model's holistic\nperformance across tasks. In this paper, we propose a Comparator-driven\nDecoding-Time (CDT) framework to alleviate the response hallucination. Firstly,\nwe construct hallucinatory and truthful comparators with multi-task fine-tuning\nsamples. In this case, we present an instruction prototype-guided mixture of\nexperts strategy to enhance the ability of the corresponding comparators to\ncapture different hallucination or truthfulness patterns in distinct task\ninstructions. CDT constrains next-token predictions to factuality-robust\ndistributions by contrasting the logit differences between the target LLMs and\nthese comparators. Systematic experiments on multiple downstream tasks show\nthat our framework can significantly improve the model performance and response\nfactuality.\n","authors":["Dingkang Yang","Dongling Xiao","Jinjie Wei","Mingcheng Li","Zhaoyu Chen","Ke Li","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.12325v2.pdf","comment":"Hallucination Mitigation in LLMs"},{"id":"http://arxiv.org/abs/2401.11944v3","updated":"2024-09-09T12:38:11Z","published":"2024-01-22T13:34:34Z","title":"CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding\n Benchmark","summary":" As the capabilities of large multimodal models (LMMs) continue to advance,\nevaluating the performance of LMMs emerges as an increasing need. Additionally,\nthere is an even larger gap in evaluating the advanced knowledge and reasoning\nabilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU,\na new Chinese Massive Multi-discipline Multimodal Understanding benchmark\ndesigned to evaluate LMMs on tasks demanding college-level subject knowledge\nand deliberate reasoning in a Chinese context. CMMMU is inspired by and\nstrictly follows the annotation and analysis pattern of MMMU. CMMMU includes\n12k manually collected multimodal questions from college exams, quizzes, and\ntextbooks, covering six core disciplines: Art & Design, Business, Science,\nHealth & Medicine, Humanities & Social Science, and Tech & Engineering, like\nits companion, MMMU. These questions span 30 subjects and comprise 39 highly\nheterogeneous image types, such as charts, diagrams, maps, tables, music\nsheets, and chemical structures. CMMMU focuses on complex perception and\nreasoning with domain-specific knowledge in the Chinese context. We evaluate 11\nopen-source LLMs and one proprietary GPT-4V(ision). Even GPT-4V only achieves\naccuracies of 42%, indicating a large space for improvement. 
CMMMU will boost\nthe community to build the next-generation LMMs towards expert artificial\nintelligence and promote the democratization of LMMs by providing diverse\nlanguage contexts.\n","authors":["Ge Zhang","Xinrun Du","Bei Chen","Yiming Liang","Tongxu Luo","Tianyu Zheng","Kang Zhu","Yuyang Cheng","Chunpu Xu","Shuyue Guo","Haoran Zhang","Xingwei Qu","Junjie Wang","Ruibin Yuan","Yizhi Li","Zekun Wang","Yudong Liu","Yu-Hsuan Tsai","Fengji Zhang","Chenghua Lin","Wenhao Huang","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2401.11944v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05556v1","updated":"2024-09-09T12:25:10Z","published":"2024-09-09T12:25:10Z","title":"SciAgents: Automating scientific discovery through multi-agent\n intelligent graph reasoning","summary":" A key challenge in artificial intelligence is the creation of systems capable\nof autonomously advancing scientific understanding by exploring novel domains,\nidentifying complex patterns, and uncovering previously unseen connections in\nvast scientific data. In this work, we present SciAgents, an approach that\nleverages three core concepts: (1) the use of large-scale ontological knowledge\ngraphs to organize and interconnect diverse scientific concepts, (2) a suite of\nlarge language models (LLMs) and data retrieval tools, and (3) multi-agent\nsystems with in-situ learning capabilities. Applied to biologically inspired\nmaterials, SciAgents reveals hidden interdisciplinary relationships that were\npreviously considered unrelated, achieving a scale, precision, and exploratory\npower that surpasses traditional human-driven research methods. The framework\nautonomously generates and refines research hypotheses, elucidating underlying\nmechanisms, design principles, and unexpected material properties. By\nintegrating these capabilities in a modular fashion, the intelligent system\nyields material discoveries, critique and improve existing hypotheses, retrieve\nup-to-date data about existing research, and highlights their strengths and\nlimitations. Our case studies demonstrate scalable capabilities to combine\ngenerative AI, ontological representations, and multi-agent modeling,\nharnessing a `swarm of intelligence' similar to biological systems. This\nprovides new avenues for materials discovery and accelerates the development of\nadvanced materials by unlocking Nature's design principles.\n","authors":["Alireza Ghafarollahi","Markus J. Buehler"],"pdf_url":"https://arxiv.org/pdf/2409.05556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05530v1","updated":"2024-09-09T11:38:06Z","published":"2024-09-09T11:38:06Z","title":"QiBERT -- Classifying Online Conversations Messages with BERT as a\n Feature","summary":" Recent developments in online communication and their usage in everyday life\nhave caused an explosion in the amount of a new genre of text data, short text.\nThus, the need to classify this type of text based on its content has a\nsignificant implication in many areas. Online debates are no exception, once\nthese provide access to information about opinions, positions and preferences\nof its users. This paper aims to use data obtained from online social\nconversations in Portuguese schools (short text) to observe behavioural trends\nand to see if students remain engaged in the discussion when stimulated. This\nproject used the state of the art (SoA) Machine Learning (ML) algorithms and\nmethods, through BERT based models to classify if utterances are in or out of\nthe debate subject. 
Using SBERT embeddings as a feature, with supervised\nlearning, the proposed model achieved results above 0.95 average accuracy for\nclassifying online messages. Such improvements can help social scientists\nbetter understand human communication, behaviour, discussion and persuasion.\n","authors":["Bruno D. Ferreira-Saraiva","Zuil Pirola","João P. Matos-Carvalho","Manuel Marques-Pita"],"pdf_url":"https://arxiv.org/pdf/2409.05530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04073v2","updated":"2024-09-09T11:33:00Z","published":"2024-09-06T07:29:01Z","title":"AnyMatch -- Efficient Zero-Shot Entity Matching with a Small Language\n Model","summary":" Entity matching (EM) is the problem of determining whether two records refer\nto same real-world entity, which is crucial in data integration, e.g., for\nproduct catalogs or address databases. A major drawback of many EM approaches\nis their dependence on labelled examples. We thus focus on the challenging\nsetting of zero-shot entity matching where no labelled examples are available\nfor an unseen target dataset. Recently, large language models (LLMs) have shown\npromising results for zero-shot EM, but their low throughput and high\ndeployment cost limit their applicability and scalability.\n We revisit the zero-shot EM problem with AnyMatch, a small language model\nfine-tuned in a transfer learning setup. We propose several novel data\nselection techniques to generate fine-tuning data for our model, e.g., by\nselecting difficult pairs to match via an AutoML filter, by generating\nadditional attribute-level examples, and by controlling label imbalance in the\ndata.\n We conduct an extensive evaluation of the prediction quality and deployment\ncost of our model, in a comparison to thirteen baselines on nine benchmark\ndatasets. We find that AnyMatch provides competitive prediction quality despite\nits small parameter size: it achieves the second-highest F1 score overall, and\noutperforms several other approaches that employ models with hundreds of\nbillions of parameters. Furthermore, our approach exhibits major cost benefits:\nthe average prediction quality of AnyMatch is within 4.4% of the\nstate-of-the-art method MatchGPT with the proprietary trillion-parameter model\nGPT-4, yet AnyMatch requires four orders of magnitude less parameters and\nincurs a 3,899 times lower inference cost (in dollars per 1,000 tokens).\n","authors":["Zeyu Zhang","Paul Groth","Iacer Calixto","Sebastian Schelter"],"pdf_url":"https://arxiv.org/pdf/2409.04073v2.pdf","comment":"12 pages excluding references, 3 figures, and 5 tables"},{"id":"http://arxiv.org/abs/2409.05521v1","updated":"2024-09-09T11:28:02Z","published":"2024-09-09T11:28:02Z","title":"Harmonic Reasoning in Large Language Models","summary":" Large Language Models (LLMs) are becoming very popular and are used for many\ndifferent purposes, including creative tasks in the arts. However, these models\nsometimes have trouble with specific reasoning tasks, especially those that\ninvolve logical thinking and counting. This paper looks at how well LLMs\nunderstand and reason when dealing with musical tasks like figuring out notes\nfrom intervals and identifying chords and scales. We tested GPT-3.5 and GPT-4o\nto see how they handle these tasks. Our results show that while LLMs do well\nwith note intervals, they struggle with more complicated tasks like recognizing\nchords and scales. 
This points out clear limits in current LLM abilities and\nshows where we need to make them better, which could help improve how they\nthink and work in both artistic and other complex areas. We also provide an\nautomatically generated benchmark data set for the described tasks.\n","authors":["Anna Kruspe"],"pdf_url":"https://arxiv.org/pdf/2409.05521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11381v2","updated":"2024-09-09T11:18:16Z","published":"2024-08-21T07:20:48Z","title":"RAGLAB: A Modular and Research-Oriented Unified Framework for\n Retrieval-Augmented Generation","summary":" Large Language Models (LLMs) demonstrate human-level capabilities in\ndialogue, reasoning, and knowledge retention. However, even the most advanced\nLLMs face challenges such as hallucinations and real-time updating of their\nknowledge. Current research addresses this bottleneck by equipping LLMs with\nexternal knowledge, a technique known as Retrieval Augmented Generation (RAG).\nHowever, two key issues constrained the development of RAG. First, there is a\ngrowing lack of comprehensive and fair comparisons between novel RAG\nalgorithms. Second, open-source tools such as LlamaIndex and LangChain employ\nhigh-level abstractions, which results in a lack of transparency and limits the\nability to develop novel algorithms and evaluation metrics. To close this gap,\nwe introduce RAGLAB, a modular and research-oriented open-source library.\nRAGLAB reproduces 6 existing algorithms and provides a comprehensive ecosystem\nfor investigating RAG algorithms. Leveraging RAGLAB, we conduct a fair\ncomparison of 6 RAG algorithms across 10 benchmarks. With RAGLAB, researchers\ncan efficiently compare the performance of various algorithms and develop novel\nalgorithms.\n","authors":["Xuanwang Zhang","Yunze Song","Yidong Wang","Shuyun Tang","Xinfeng Li","Zhengran Zeng","Zhen Wu","Wei Ye","Wenyuan Xu","Yue Zhang","Xinyu Dai","Shikun Zhang","Qingsong Wen"],"pdf_url":"https://arxiv.org/pdf/2408.11381v2.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.05486v1","updated":"2024-09-09T10:30:00Z","published":"2024-09-09T10:30:00Z","title":"Elsevier Arena: Human Evaluation of Chemistry/Biology/Health\n Foundational Large Language Models","summary":" The quality and capabilities of large language models cannot be currently\nfully assessed with automated, benchmark evaluations. Instead, human\nevaluations that expand on traditional qualitative techniques from natural\nlanguage generation literature are required. One recent best-practice consists\nin using A/B-testing frameworks, which capture preferences of human evaluators\nfor specific models. In this paper we describe a human evaluation experiment\nfocused on the biomedical domain (health, biology, chemistry/pharmacology)\ncarried out at Elsevier. In it a large but not massive (8.8B parameter)\ndecoder-only foundational transformer trained on a relatively small (135B\ntokens) but highly curated collection of Elsevier datasets is compared to\nOpenAI's GPT-3.5-turbo and Meta's foundational 7B parameter Llama 2 model\nagainst multiple criteria. Results indicate -- even if IRR scores were\ngenerally low -- a preference towards GPT-3.5-turbo, and hence towards models\nthat possess conversational abilities, are very large and were trained on very\nlarge datasets. 
But at the same time, indicate that for less massive models\ntraining on smaller but well-curated training sets can potentially give rise to\nviable alternatives in the biomedical domain.\n","authors":["Camilo Thorne","Christian Druckenbrodt","Kinga Szarkowska","Deepika Goyal","Pranita Marajan","Vijay Somanath","Corey Harper","Mao Yan","Tony Scerri"],"pdf_url":"https://arxiv.org/pdf/2409.05486v1.pdf","comment":"11 pages, 5 tables, 6 figures"},{"id":"http://arxiv.org/abs/2409.03753v2","updated":"2024-09-09T10:04:00Z","published":"2024-09-05T17:59:15Z","title":"WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild","summary":" The increasing availability of real-world conversation data offers exciting\nopportunities for researchers to study user-chatbot interactions. However, the\nsheer volume of this data makes manually examining individual conversations\nimpractical. To overcome this challenge, we introduce WildVis, an interactive\ntool that enables fast, versatile, and large-scale conversation analysis.\nWildVis provides search and visualization capabilities in the text and\nembedding spaces based on a list of criteria. To manage million-scale datasets,\nwe implemented optimizations including search index construction, embedding\nprecomputation and compression, and caching to ensure responsive user\ninteractions within seconds. We demonstrate WildVis' utility through three case\nstudies: facilitating chatbot misuse research, visualizing and comparing topic\ndistributions across datasets, and characterizing user-specific conversation\npatterns. WildVis is open-source and designed to be extendable, supporting\nadditional datasets and customized search and visualization functionalities.\n","authors":["Yuntian Deng","Wenting Zhao","Jack Hessel","Xiang Ren","Claire Cardie","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2409.03753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02795v3","updated":"2024-09-09T09:31:30Z","published":"2024-09-04T15:11:55Z","title":"Towards a Unified View of Preference Learning for Large Language Models:\n A Survey","summary":" Large Language Models (LLMs) exhibit remarkably powerful capabilities. One of\nthe crucial factors to achieve success is aligning the LLM's output with human\npreferences. This alignment process often requires only a small amount of data\nto efficiently enhance the LLM's performance. While effective, research in this\narea spans multiple domains, and the methods involved are relatively complex to\nunderstand. The relationships between different methods have been\nunder-explored, limiting the development of the preference alignment. In light\nof this, we break down the existing popular alignment strategies into different\ncomponents and provide a unified framework to study the current alignment\nstrategies, thereby establishing connections among them. In this survey, we\ndecompose all the strategies in preference learning into four components:\nmodel, data, feedback, and algorithm. This unified view offers an in-depth\nunderstanding of existing alignment algorithms and also opens up possibilities\nto synergize the strengths of different strategies. Furthermore, we present\ndetailed working examples of prevalent existing algorithms to facilitate a\ncomprehensive understanding for the readers. 
Finally, based on our unified\nperspective, we explore the challenges and future research directions for\naligning large language models with human preferences.\n","authors":["Bofei Gao","Feifan Song","Yibo Miao","Zefan Cai","Zhe Yang","Liang Chen","Helan Hu","Runxin Xu","Qingxiu Dong","Ce Zheng","Wen Xiao","Ge Zhang","Daoguang Zan","Keming Lu","Bowen Yu","Dayiheng Liu","Zeyu Cui","Jian Yang","Lei Sha","Houfeng Wang","Zhifang Sui","Peiyi Wang","Tianyu Liu","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2409.02795v3.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.05448v1","updated":"2024-09-09T09:04:56Z","published":"2024-09-09T09:04:56Z","title":"Representational Analysis of Binding in Large Language Models","summary":" Entity tracking is essential for complex reasoning. To perform in-context\nentity tracking, language models (LMs) must bind an entity to its attribute\n(e.g., bind a container to its content) to recall the attribute for a given entity.\nFor example, given a context mentioning ``The coffee is in Box Z, the stone is\nin Box M, the map is in Box H'', to infer ``Box Z contains the coffee'' later,\nLMs must bind ``Box Z'' to ``coffee''. To explain the binding behaviour of LMs,\nFeng and Steinhardt (2023) introduce a Binding ID mechanism and state that LMs\nuse an abstract concept called Binding ID (BI) to internally mark\nentity-attribute pairs. However, they have not directly captured the BI\ndeterminant information from entity activations. In this work, we provide a\nnovel view of the Binding ID mechanism by localizing the prototype of BI\ninformation. Specifically, we discover that there exists a low-rank subspace in\nthe hidden state (or activation) of LMs that primarily encodes the order of\nentity and attribute and which is used as the prototype of BI to causally\ndetermine the binding. To identify this subspace, we choose principal component\nanalysis as our first attempt and it is empirically proven to be effective.\nMoreover, we also discover that when editing representations along directions\nin the subspace, LMs tend to bind a given entity to other attributes\naccordingly. For example, by patching activations along the BI encoding\ndirection we can make the LM infer ``Box Z contains the stone'' and ``Box Z\ncontains the map''.\n","authors":["Qin Dai","Benjamin Heinzerling","Kentaro Inui"],"pdf_url":"https://arxiv.org/pdf/2409.05448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15297v2","updated":"2024-09-09T08:53:18Z","published":"2024-08-27T11:31:12Z","title":"YOLO-Stutter: End-to-end Region-Wise Speech Dysfluency Detection","summary":" Dysfluent speech detection is the bottleneck for disordered speech analysis\nand spoken language learning. Current state-of-the-art models are governed by\nrule-based systems which lack efficiency and robustness, and are sensitive to\ntemplate design. In this paper, we propose YOLO-Stutter: a first end-to-end\nmethod that detects dysfluencies in a time-accurate manner. YOLO-Stutter takes\nimperfect speech-text alignment as input, followed by a spatial feature\naggregator, and a temporal dependency extractor to perform region-wise boundary\nand class predictions. We also introduce two dysfluency corpora, VCTK-Stutter\nand VCTK-TTS, that simulate natural spoken dysfluencies including repetition,\nblock, missing, replacement, and prolongation. Our end-to-end method achieves\nstate-of-the-art performance with a minimum number of trainable parameters\non both simulated data and real aphasia speech. 
Code and datasets are\nopen-sourced at https://github.com/rorizzz/YOLO-Stutter\n","authors":["Xuanru Zhou","Anshul Kashyap","Steve Li","Ayati Sharma","Brittany Morin","David Baquirin","Jet Vonk","Zoe Ezzes","Zachary Miller","Maria Luisa Gorno Tempini","Jiachen Lian","Gopala Krishna Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2408.15297v2.pdf","comment":"Interspeech 2024"},{"id":"http://arxiv.org/abs/2409.05423v1","updated":"2024-09-09T08:24:29Z","published":"2024-09-09T08:24:29Z","title":"STLM Engineering Report: Dropout","summary":" In this work we explore the relevance of dropout for modern language models,\nparticularly in the context of models on the scale of <100M parameters. We\nexplore it's relevance firstly in the regime of improving the sample efficiency\nof models given small, high quality datasets, and secondly in the regime of\nimproving the quality of its fit on larger datasets where models may underfit.\nWe find that concordant with conventional wisdom, dropout remains effective in\nthe overfitting scenario, and that furthermore it may have some relevance for\nimproving the fit of models even in the case of excess data, as suggested by\nprevious research. In the process we find that the existing explanation for the\nmechanism behind this performance gain is not applicable in the case of\nlanguage modelling.\n","authors":["Dylan Hillier","Leon Guertler","Bobby Cheng","Cheston Tan"],"pdf_url":"https://arxiv.org/pdf/2409.05423v1.pdf","comment":"6 pages, 3 figures, For code base see\n https://github.com/LeonGuertler/SuperTinyLanguageModels"},{"id":"http://arxiv.org/abs/2405.05966v2","updated":"2024-09-09T08:21:13Z","published":"2024-05-09T17:59:32Z","title":"Natural Language Processing RELIES on Linguistics","summary":" Large Language Models (LLMs) have become capable of generating highly fluent\ntext in certain languages, without modules specially designed to capture\ngrammar or semantic coherence. What does this mean for the future of linguistic\nexpertise in NLP? We highlight several aspects in which NLP (still) relies on\nlinguistics, or where linguistic thinking can illuminate new directions. We\nargue our case around the acronym RELIES that encapsulates six major facets\nwhere linguistics contributes to NLP: Resources, Evaluation, Low-resource\nsettings, Interpretability, Explanation, and the Study of language. This list\nis not exhaustive, nor is linguistics the main point of reference for every\neffort under these themes; but at a macro level, these facets highlight the\nenduring importance of studying machine systems vis-\\`a-vis systems of human\nlanguage.\n","authors":["Juri Opitz","Shira Wein","Nathan Schneider"],"pdf_url":"https://arxiv.org/pdf/2405.05966v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05401v1","updated":"2024-09-09T07:57:43Z","published":"2024-09-09T07:57:43Z","title":"NLLB-E5: A Scalable Multilingual Retrieval Model","summary":" Despite significant progress in multilingual information retrieval, the lack\nof models capable of effectively supporting multiple languages, particularly\nlow-resource like Indic languages, remains a critical challenge. This paper\npresents NLLB-E5: A Scalable Multilingual Retrieval Model. NLLB-E5 leverages\nthe in-built multilingual capabilities in the NLLB encoder for translation\ntasks. It proposes a distillation approach from multilingual retriever E5 to\nprovide a zero-shot retrieval approach handling multiple languages, including\nall major Indic languages, without requiring multilingual training data. 
We\nevaluate the model on a comprehensive suite of existing benchmarks, including\nHindi-BEIR, highlighting its robust performance across diverse languages and\ntasks. Our findings uncover task and domain-specific challenges, providing\nvaluable insights into the retrieval performance, especially for low-resource\nlanguages. NLLB-E5 addresses the urgent need for an inclusive, scalable, and\nlanguage-agnostic text retrieval model, advancing the field of multilingual\ninformation access and promoting digital inclusivity for millions of users\nglobally.\n","authors":["Arkadeep Acharya","Rudra Murthy","Vishwajeet Kumar","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2409.05401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05385v1","updated":"2024-09-09T07:32:30Z","published":"2024-09-09T07:32:30Z","title":"Towards Building a Robust Knowledge Intensive Question Answering Model\n with Large Language Models","summary":" The development of LLMs has greatly enhanced the intelligence and fluency of\nquestion answering, while the emergence of retrieval enhancement has enabled\nmodels to better utilize external information. However, the presence of noise\nand errors in retrieved information poses challenges to the robustness of LLMs.\nIn this work, to evaluate the model's performance under multiple interferences,\nwe first construct a dataset based on machine reading comprehension datasets\nsimulating various scenarios, including critical information absence, noise,\nand conflicts. To address the issue of model accuracy decline caused by noisy\nexternal information, we propose a data augmentation-based fine-tuning method\nto enhance LLM's robustness against noise. Additionally, contrastive learning\napproach is utilized to preserve the model's discrimination capability of\nexternal information. We have conducted experiments on both existing LLMs and\nour approach, the results are evaluated by GPT-4, which indicates that our\nproposed methods improve model robustness while strengthening the model's\ndiscrimination capability.\n","authors":["Hong Xingyun Hong","Shao Yan Shao","Wang Zhilin Wang","Duan Manni Duan","Jin Xiongnan"],"pdf_url":"https://arxiv.org/pdf/2409.05385v1.pdf","comment":"This paper has been accepted by NLPCC-2024"},{"id":"http://arxiv.org/abs/2408.14840v2","updated":"2024-09-09T06:57:22Z","published":"2024-08-27T07:51:26Z","title":"CL4KGE: A Curriculum Learning Method for Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) constitutes a foundational task, directed\ntowards learning representations for entities and relations within knowledge\ngraphs (KGs), with the objective of crafting representations comprehensive\nenough to approximate the logical and symbolic interconnections among entities.\nIn this paper, we define a metric Z-counts to measure the difficulty of\ntraining each triple ($<$head entity, relation, tail entity$>$) in KGs with\ntheoretical analysis. Based on this metric, we propose \\textbf{CL4KGE}, an\nefficient \\textbf{C}urriculum \\textbf{L}earning based training strategy for\n\\textbf{KGE}. This method includes a difficulty measurer and a training\nscheduler that aids in the training of KGE models. Our approach possesses the\nflexibility to act as a plugin within a wide range of KGE models, with the\nadded advantage of adaptability to the majority of KGs in existence. The\nproposed method has been evaluated on popular KGE models, and the results\ndemonstrate that it enhances the state-of-the-art methods. 
The use of Z-counts\nas a metric has enabled the identification of challenging triples in KGs, which\nhelps in devising effective training strategies.\n","authors":["Yang Liu","Chuan Zhou","Peng Zhang","Yanan Cao","Yongchao Liu","Zhao Li","Hongyang Chen"],"pdf_url":"https://arxiv.org/pdf/2408.14840v2.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.05368v1","updated":"2024-09-09T06:55:38Z","published":"2024-09-09T06:55:38Z","title":"Application Specific Compression of Deep Learning Models","summary":" Large Deep Learning models are compressed and deployed for specific\napplications. However, current Deep Learning model compression methods do not\nutilize the information about the target application. As a result, the\ncompressed models are application agnostic. Our goal is to customize the model\ncompression process to create a compressed model that will perform better for\nthe target application. Our method, Application Specific Compression (ASC),\nidentifies and prunes components of the large Deep Learning model that are\nredundant specifically for the given target application. The intuition of our\nwork is to prune the parts of the network that do not contribute significantly\nto updating the data representation for the given application. We have\nexperimented with the BERT family of models for three applications: Extractive\nQA, Natural Language Inference, and Paraphrase Identification. We observe that\ncustomized compressed models created using ASC method perform better than\nexisting model compression methods and off-the-shelf compressed models.\n","authors":["Rohit Raj Rai","Angana Borah","Amit Awekar"],"pdf_url":"https://arxiv.org/pdf/2409.05368v1.pdf","comment":"Accepted in the Proceedings of the 8th Joint International Conference\n on Data Science & Management of Data (12th ACM IKDD CODS and 30th COMAD) for\n the Short Research Paper track, 5 pages"},{"id":"http://arxiv.org/abs/2409.05367v1","updated":"2024-09-09T06:55:37Z","published":"2024-09-09T06:55:37Z","title":"Diagnostic Reasoning in Natural Language: Computational Model and\n Application","summary":" Diagnostic reasoning is a key component of expert work in many domains. It is\na hard, time-consuming activity that requires expertise, and AI research has\ninvestigated the ways automated systems can support this process. Yet, due to\nthe complexity of natural language, the applications of AI for diagnostic\nreasoning to language-related tasks are lacking. To close this gap, we\ninvestigate diagnostic abductive reasoning (DAR) in the context of\nlanguage-grounded tasks (NL-DAR). We propose a novel modeling framework for\nNL-DAR based on Pearl's structural causal models and instantiate it in a\ncomprehensive study of scientific paper assessment in the biomedical domain. We\nuse the resulting dataset to investigate the human decision-making process in\nNL-DAR and determine the potential of LLMs to support structured\ndecision-making over text. 
Our framework, open resources and tools lay the\ngroundwork for the empirical study of collaborative diagnostic reasoning in the\nage of LLMs, in the scholarly domain and beyond.\n","authors":["Nils Dycke","Matej Zečević","Ilia Kuznetsov","Beatrix Suess","Kristian Kersting","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2409.05367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05356v1","updated":"2024-09-09T06:28:47Z","published":"2024-09-09T06:28:47Z","title":"IndicVoices-R: Unlocking a Massive Multilingual Multi-speaker Speech\n Corpus for Scaling Indian TTS","summary":" Recent advancements in text-to-speech (TTS) synthesis show that large-scale\nmodels trained with extensive web data produce highly natural-sounding output.\nHowever, such data is scarce for Indian languages due to the lack of\nhigh-quality, manually subtitled data on platforms like LibriVox or YouTube. To\naddress this gap, we enhance existing large-scale ASR datasets containing\nnatural conversations collected in low-quality environments to generate\nhigh-quality TTS training data. Our pipeline leverages the cross-lingual\ngeneralization of denoising and speech enhancement models trained on English\nand applied to Indian languages. This results in IndicVoices-R (IV-R), the\nlargest multilingual Indian TTS dataset derived from an ASR dataset, with 1,704\nhours of high-quality speech from 10,496 speakers across 22 Indian languages.\nIV-R matches the quality of gold-standard TTS datasets like LJSpeech, LibriTTS,\nand IndicTTS. We also introduce the IV-R Benchmark, the first to assess\nzero-shot, few-shot, and many-shot speaker generalization capabilities of TTS\nmodels on Indian voices, ensuring diversity in age, gender, and style. We\ndemonstrate that fine-tuning an English pre-trained model on a combined dataset\nof high-quality IndicTTS and our IV-R dataset results in better zero-shot\nspeaker generalization compared to fine-tuning on the IndicTTS dataset alone.\nFurther, our evaluation reveals limited zero-shot generalization for Indian\nvoices in TTS models trained on prior datasets, which we improve by fine-tuning\nthe model on our data containing diverse set of speakers across language\nfamilies. We open-source all data and code, releasing the first TTS model for\nall 22 official Indian languages.\n","authors":["Ashwin Sankar","Srija Anand","Praveen Srinivasa Varadhan","Sherry Thomas","Mehak Singal","Shridhar Kumar","Deovrat Mehendale","Aditi Krishana","Giri Raju","Mitesh Khapra"],"pdf_url":"https://arxiv.org/pdf/2409.05356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14482v2","updated":"2024-09-09T06:19:07Z","published":"2024-07-19T17:35:47Z","title":"ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG\n Capabilities","summary":" In this work, we introduce ChatQA 2, an Llama 3.0-based model with a 128K\ncontext window, designed to bridge the gap between open-source LLMs and leading\nproprietary models (e.g., GPT-4-Turbo) in long-context understanding and\nretrieval-augmented generation (RAG) capabilities. These two capabilities are\nessential for LLMs to process large volumes of information that cannot fit into\na single prompt and are complementary to each other, depending on the\ndownstream tasks and computational budgets. 
We present a detailed continued\ntraining recipe to extend the context window of Llama3-70B-base from 8K to 128K\ntokens, along with a three-stage instruction tuning process to enhance the\nmodel's instruction-following, RAG performance, and long-context understanding\ncapabilities. Our results demonstrate that the Llama3-ChatQA-2-70B model\noutperforms most existing state-of-the-art models, including\nGPT-4-Turbo-2024-04-09, Qwen2-72B-Instruct, and Llama3.1-70B-Instruct, on\nultra-long tasks beyond 100K tokens, as well as on the RAG benchmark using only\na 4K context window, showing the strong long context capability across varying\nsequence lengths. We further provide extensive comparisons between direct\nlong-context and RAG solutions using the same state-of-the-art long-context\nLLMs. Interestingly, we find that the performance of strong long-context LLMs\nusing RAG improves when retrieving a larger number of chunks. With a large set\nof top-k chunks, RAG consistently outperforms direct long-context solution\nusing the same state-of-the-art long-context models (e.g., Llama3-ChatQA-2-70B\nand Qwen2-72B-Instruct) on both 32K benchmarks and real-world 128K tasks. To\nadvance research in this field, we open-sourced the model weights, training\ndata, and the evaluation setup for the for the community:\nhttps://chatqa2-project.github.io/\n","authors":["Peng Xu","Wei Ping","Xianchao Wu","Chejian Xu","Zihan Liu","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2407.14482v2.pdf","comment":"v2: major update with significantly improved results"},{"id":"http://arxiv.org/abs/2407.15186v3","updated":"2024-09-09T06:17:21Z","published":"2024-07-21T14:48:23Z","title":"A Survey on Employing Large Language Models for Text-to-SQL Tasks","summary":" The increasing volume of data stored in relational databases has led to the\nneed for efficient querying and utilization of this data in various sectors.\nHowever, writing SQL queries requires specialized knowledge, which poses a\nchallenge for non-professional users trying to access and query databases.\nText-to-SQL parsing solves this issue by converting natural language queries\ninto SQL queries, thus making database access more accessible for non-expert\nusers. To take advantage of the recent developments in Large Language Models\n(LLMs), a range of new methods have emerged, with a primary focus on prompt\nengineering and fine-tuning. This survey provides a comprehensive overview of\nLLMs in text-to-SQL tasks, discussing benchmark datasets, prompt engineering,\nfine-tuning methods, and future research directions. We hope this review will\nenable readers to gain a broader understanding of the recent advances in this\nfield and offer some insights into its future trajectory.\n","authors":["Liang Shi","Zhengju Tang","Nan Zhang","Xiaotong Zhang","Zhi Yang"],"pdf_url":"https://arxiv.org/pdf/2407.15186v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.03259v3","updated":"2024-09-09T05:27:28Z","published":"2024-04-04T07:31:56Z","title":"Advancing Aspect-Based Sentiment Analysis through Deep Learning Models","summary":" Aspect-based sentiment analysis predicts sentiment polarity with fine\ngranularity. While graph convolutional networks (GCNs) are widely utilized for\nsentimental feature extraction, their naive application for syntactic feature\nextraction can compromise information preservation. 
This study introduces an\ninnovative edge-enhanced GCN, named SentiSys, to navigate the syntactic graph\nwhile preserving intact feature information, leading to enhanced performance.\nSpecifically,we first integrate a bidirectional long short-term memory\n(Bi-LSTM) network and a self-attention-based transformer. This combination\nfacilitates effective text encoding, preventing the loss of information and\npredicting long dependency text. A bidirectional GCN (Bi-GCN) with message\npassing is then employed to encode relationships between entities.\nAdditionally, unnecessary information is filtered out using an aspect-specific\nmasking technique. To validate the effectiveness of our proposed model, we\nconduct extensive evaluation experiments on four benchmark datasets. The\nexperimental results demonstrate enhanced performance in aspect-based sentiment\nanalysis with the use of SentiSys.\n","authors":["Chen Li","Huidong Tang","Jinli Zhang","Xiujing Guo","Debo Cheng","Yasuhiko Morimoto"],"pdf_url":"https://arxiv.org/pdf/2404.03259v3.pdf","comment":"This paper has already been accepted by the 20th International\n Conference on Advanced Data Mining and Applications (ADMA2024)"},{"id":"http://arxiv.org/abs/2403.19159v2","updated":"2024-09-09T04:39:31Z","published":"2024-03-28T06:03:47Z","title":"Disentangling Length from Quality in Direct Preference Optimization","summary":" Reinforcement Learning from Human Feedback (RLHF) has been a crucial\ncomponent in the recent success of Large Language Models. However, RLHF is know\nto exploit biases in human preferences, such as verbosity. A well-formatted and\neloquent answer is often more highly rated by users, even when it is less\nhelpful and objective. A number of approaches have been developed to control\nthose biases in the classical RLHF literature, but the problem remains\nrelatively under-explored for Direct Alignment Algorithms such as Direct\nPreference Optimization (DPO). Unlike classical RLHF, DPO does not train a\nseparate reward model or use reinforcement learning directly, so previous\napproaches developed to control verbosity cannot be directly applied to this\nsetting. Our work makes several contributions. For the first time, we study the\nlength problem in the DPO setting, showing significant exploitation in DPO and\nlinking it to out-of-distribution bootstrapping. We then develop a principled\nbut simple regularization strategy that prevents length exploitation, while\nstill maintaining improvements in model quality. We demonstrate these effects\nacross datasets on summarization and dialogue, where we achieve up to 20\\%\nimprovement in win rates when controlling for length, despite the GPT4 judge's\nwell-known verbosity bias.\n","authors":["Ryan Park","Rafael Rafailov","Stefano Ermon","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2403.19159v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14795v3","updated":"2024-09-09T04:38:16Z","published":"2023-05-24T06:48:41Z","title":"MQuAKE: Assessing Knowledge Editing in Language Models via Multi-Hop\n Questions","summary":" The information stored in large language models (LLMs) falls out of date\nquickly, and retraining from scratch is often not an option. This has recently\ngiven rise to a range of techniques for injecting new facts through updating\nmodel weights. Current evaluation paradigms are extremely limited, mainly\nvalidating the recall of edited facts, but changing one fact should cause\nrippling changes to the model's related beliefs. 
If we edit the UK Prime\nMinister to now be Rishi Sunak, then we should get a different answer to Who is\nmarried to the British Prime Minister? In this work, we present a benchmark,\nMQuAKE (Multi-hop Question Answering for Knowledge Editing), comprising\nmulti-hop questions that assess whether edited models correctly answer\nquestions where the answer should change as an entailed consequence of edited\nfacts. While we find that current knowledge-editing approaches can recall\nedited facts accurately, they fail catastrophically on the constructed\nmulti-hop questions. We thus propose a simple memory-based approach, MeLLo,\nwhich stores all edited facts externally while prompting the language model\niteratively to generate answers that are consistent with the edited facts.\nWhile MQuAKE remains challenging, we show that MeLLo scales well with LLMs\n(e.g., OpenAI GPT-3.5-turbo) and outperforms previous model editors by a large\nmargin.\n","authors":["Zexuan Zhong","Zhengxuan Wu","Christopher D. Manning","Christopher Potts","Danqi Chen"],"pdf_url":"https://arxiv.org/pdf/2305.14795v3.pdf","comment":"EMNLP 2023. Our code and datasets are available at\n https://github.com/princeton-nlp/MQuAKE"},{"id":"http://arxiv.org/abs/2409.05292v1","updated":"2024-09-09T03:00:53Z","published":"2024-09-09T03:00:53Z","title":"Mpox Narrative on Instagram: A Labeled Multilingual Dataset of Instagram\n Posts on Mpox for Sentiment, Hate Speech, and Anxiety Analysis","summary":" The world is currently experiencing an outbreak of mpox, which has been\ndeclared a Public Health Emergency of International Concern by WHO. No prior\nwork related to social media mining has focused on the development of a dataset\nof Instagram posts about the mpox outbreak. The work presented in this paper\naims to address this research gap and makes two scientific contributions to\nthis field. First, it presents a multilingual dataset of 60,127 Instagram posts\nabout mpox, published between July 23, 2022, and September 5, 2024. The\ndataset, available at https://dx.doi.org/10.21227/7fvc-y093, contains Instagram\nposts about mpox in 52 languages. For each of these posts, the Post ID, Post\nDescription, Date of publication, language, and translated version of the post\n(translation to English was performed using the Google Translate API) are\npresented as separate attributes in the dataset. After developing this dataset,\nsentiment analysis, hate speech detection, and anxiety or stress detection were\nperformed. This process included classifying each post into (i) one of the\nsentiment classes, i.e., fear, surprise, joy, sadness, anger, disgust, or\nneutral, (ii) hate or not hate, and (iii) anxiety/stress detected or no\nanxiety/stress detected. These results are presented as separate attributes in\nthe dataset. Second, this paper presents the results of performing sentiment\nanalysis, hate speech analysis, and anxiety or stress analysis. The variation\nof the sentiment classes - fear, surprise, joy, sadness, anger, disgust, and\nneutral were observed to be 27.95%, 2.57%, 8.69%, 5.94%, 2.69%, 1.53%, and\n50.64%, respectively. 
In terms of hate speech detection, 95.75% of the posts\ndid not contain hate and the remaining 4.25% of the posts contained hate.\nFinally, 72.05% of the posts did not indicate any anxiety/stress, and the\nremaining 27.95% of the posts represented some form of anxiety/stress.\n","authors":["Nirmalya Thakur"],"pdf_url":"https://arxiv.org/pdf/2409.05292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05286v1","updated":"2024-09-09T02:41:00Z","published":"2024-09-09T02:41:00Z","title":"Seek and Solve Reasoning for Table Question Answering","summary":" Table-based Question Answering (TQA) involves answering questions based on\ntabular data. The complexity of table structures and question logic makes this\ntask difficult even for Large Language Models (LLMs). This paper improves TQA\nperformance by leveraging LLMs' reasoning capabilities. Inspired by how humans\nsolve TQA tasks, we propose a Seek-and-Solve pipeline that instructs the LLM to\nfirst seek relevant information and then answer questions. The two stages are\nintegrated at the reasoning level, and their Chain of Thought (CoT) paths are\nintegrated into a coherent Seek-and-Solve CoT (SS-CoT). Furthermore, we present\na compact single-stage TQA-solving prompt distilled from the pipeline.\nExperiments demonstrate that under In-Context Learning settings, using samples\nwith SS-CoT paths as demonstrations, the TQA-solving prompt can effectively\nguide the LLM to solve complex TQA tasks, resulting in improved performance and\nreliability. Our results highlight the importance of properly eliciting LLMs'\nreasoning capabilities in solving complex TQA tasks.\n","authors":["Ruya Jiang","Chun Wang","Weihong Deng"],"pdf_url":"https://arxiv.org/pdf/2409.05286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05283v1","updated":"2024-09-09T02:28:53Z","published":"2024-09-09T02:28:53Z","title":"On the Relationship between Truth and Political Bias in Language Models","summary":" Language model alignment research often attempts to ensure that models are\nnot only helpful and harmless, but also truthful and unbiased. However,\noptimizing these objectives simultaneously can obscure how improving one aspect\nmight impact the others. In this work, we focus on analyzing the relationship\nbetween two concepts essential in both language model alignment and political\nscience: \\textit{truthfulness} and \\textit{political bias}. We train reward\nmodels on various popular truthfulness datasets and subsequently evaluate their\npolitical bias. Our findings reveal that optimizing reward models for\ntruthfulness on these datasets tends to result in a left-leaning political\nbias. We also find that existing open-source reward models (i.e. those trained\non standard human preference datasets) already show a similar bias and that the\nbias is larger for larger models. 
These results raise important questions about\nboth the datasets used to represent truthfulness and what language models\ncapture about the relationship between truth and politics.\n","authors":["Suyash Fulay","William Brannon","Shrestha Mohanty","Cassandra Overney","Elinor Poole-Dayan","Deb Roy","Jad Kabbara"],"pdf_url":"https://arxiv.org/pdf/2409.05283v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05275v1","updated":"2024-09-09T01:59:29Z","published":"2024-09-09T01:59:29Z","title":"RexUniNLU: Recursive Method with Explicit Schema Instructor for\n Universal NLU","summary":" Information Extraction (IE) and Text Classification (CLS) serve as the\nfundamental pillars of NLU, with both disciplines relying on analyzing input\nsequences to categorize outputs into pre-established schemas. However, there is\nno existing encoder-based model that can unify IE and CLS tasks from this\nperspective. To fully explore the foundation shared within NLU tasks, we have\nproposed a Recursive Method with Explicit Schema Instructor for Universal NLU.\nSpecifically, we firstly redefine the true universal information extraction\n(UIE) with a formal formulation that covers almost all extraction schemas,\nincluding quadruples and quintuples which remain unsolved for previous UIE\nmodels. Then, we expand the formulation to all CLS and multi-modal NLU tasks.\nBased on that, we introduce RexUniNLU, a universal NLU solution that employs\nexplicit schema constraints for IE and CLS, which encompasses all IE and CLS\ntasks and prevents incorrect connections between schema and input sequence. To\navoid interference between different schemas, we reset the position ids and\nattention mask matrices. Extensive experiments are conducted on IE, CLS in both\nEnglish and Chinese, and multi-modality, revealing the effectiveness and\nsuperiority. Our codes are publicly released.\n","authors":["Chengyuan Liu","Shihang Wang","Fubang Zhao","Kun Kuang","Yangyang Kang","Weiming Lu","Changlong Sun","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2409.05275v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2304.14770"},{"id":"http://arxiv.org/abs/2305.03511v2","updated":"2024-09-09T01:44:27Z","published":"2023-05-02T15:33:09Z","title":"Shared Latent Space by Both Languages in Non-Autoregressive Neural\n Machine Translation","summary":" Non-autoregressive neural machine translation (NAT) offers substantial\ntranslation speed up compared to autoregressive neural machine translation (AT)\nat the cost of translation quality. Latent variable modeling has emerged as a\npromising approach to bridge this quality gap, particularly for addressing the\nchronic multimodality problem in NAT. In the previous works that used latent\nvariable modeling, they added an auxiliary model to estimate the posterior\ndistribution of the latent variable conditioned on the source and target\nsentences. However, it causes several disadvantages, such as redundant\ninformation extraction in the latent variable, increasing the number of\nparameters, and a tendency to ignore some information from the inputs. In this\npaper, we propose a novel latent variable modeling that integrates a dual\nreconstruction perspective and an advanced hierarchical latent modeling with a\nshared intermediate latent space across languages. This latent variable\nmodeling hypothetically alleviates or prevents the above disadvantages. 
In our\nexperiment results, we present comprehensive demonstrations that our proposed\napproach infers superior latent variables which lead better translation\nquality. Finally, in the benchmark translation tasks, such as WMT, we\ndemonstrate that our proposed method significantly improves translation quality\ncompared to previous NAT baselines including the state-of-the-art NAT model.\n","authors":["DongNyeong Heo","Heeyoul Choi"],"pdf_url":"https://arxiv.org/pdf/2305.03511v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05257v1","updated":"2024-09-09T00:40:47Z","published":"2024-09-09T00:40:47Z","title":"UPCS: Unbiased Persona Construction for Dialogue Generation","summary":" Narrative systems, such as dialogue and storytelling systems, often utilize\npersona profiles to enhance personalized interactions. Existing persona\nprofiles frequently exhibit biases, posing risks to system integrity and\nfairness. To address this, we introduce the UPCS framework, which categorizes\ncharacter descriptions into eight dimensions, including bias mitigation\nstrategies. Experimental results demonstrate UPCS's superiority in accuracy,\ndiversity, bias elimination, and user satisfaction, marking a significant\nadvancement in persona construction for reliable narrative systems.\n","authors":["Kuiyun Chen","Yanbin Wei"],"pdf_url":"https://arxiv.org/pdf/2409.05257v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.05867v1","updated":"2024-09-09T17:59:57Z","published":"2024-09-09T17:59:57Z","title":"Flash Cache: Reducing Bias in Radiance Cache Based Inverse Rendering","summary":" State-of-the-art techniques for 3D reconstruction are largely based on\nvolumetric scene representations, which require sampling multiple points to\ncompute the color arriving along a ray. Using these representations for more\ngeneral inverse rendering -- reconstructing geometry, materials, and lighting\nfrom observed images -- is challenging because recursively path-tracing such\nvolumetric representations is expensive. Recent works alleviate this issue\nthrough the use of radiance caches: data structures that store the\nsteady-state, infinite-bounce radiance arriving at any point from any\ndirection. However, these solutions rely on approximations that introduce bias\ninto the renderings and, more importantly, into the gradients used for\noptimization. We present a method that avoids these approximations while\nremaining computationally efficient. In particular, we leverage two techniques\nto reduce variance for unbiased estimators of the rendering equation: (1) an\nocclusion-aware importance sampler for incoming illumination and (2) a fast\ncache architecture that can be used as a control variate for the radiance from\na high-quality, but more expensive, volumetric cache. We show that by removing\nthese biases our approach improves the generality of radiance cache based\ninverse rendering, as well as increasing quality in the presence of challenging\nlight transport effects such as specular reflections.\n","authors":["Benjamin Attal","Dor Verbin","Ben Mildenhall","Peter Hedman","Jonathan T. Barron","Matthew O'Toole","Pratul P. 
Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2409.05867v1.pdf","comment":"Website: https://benattal.github.io/flash-cache/"},{"id":"http://arxiv.org/abs/2409.05864v1","updated":"2024-09-09T17:59:45Z","published":"2024-09-09T17:59:45Z","title":"Neural MP: A Generalist Neural Motion Planner","summary":" The current paradigm for motion planning generates solutions from scratch for\nevery new problem, which consumes significant amounts of time and computational\nresources. For complex, cluttered scenes, motion planning approaches can often\ntake minutes to produce a solution, while humans are able to accurately and\nsafely reach any goal in seconds by leveraging their prior experience. We seek\nto do the same by applying data-driven learning at scale to the problem of\nmotion planning. Our approach builds a large number of complex scenes in\nsimulation, collects expert data from a motion planner, then distills it into a\nreactive generalist policy. We then combine this with lightweight optimization\nto obtain a safe path for real world deployment. We perform a thorough\nevaluation of our method on 64 motion planning tasks across four diverse\nenvironments with randomized poses, scenes and obstacles, in the real world,\ndemonstrating an improvement of 23%, 17% and 79% motion planning success rate\nover state of the art sampling, optimization and learning based planning\nmethods. Video results available at mihdalal.github.io/neuralmotionplanner\n","authors":["Murtaza Dalal","Jiahui Yang","Russell Mendonca","Youssef Khaky","Ruslan Salakhutdinov","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2409.05864v1.pdf","comment":"Website at mihdalal.github.io/neuralmotionplanner. Main paper: 7\n pages, 4 figures, 2 tables. Appendix: 9 pages, 5 figures, 6 tables"},{"id":"http://arxiv.org/abs/2409.05863v1","updated":"2024-09-09T17:59:15Z","published":"2024-09-09T17:59:15Z","title":"Promptable Closed-loop Traffic Simulation","summary":" Simulation stands as a cornerstone for safe and efficient autonomous driving\ndevelopment. At its core a simulation system ought to produce realistic,\nreactive, and controllable traffic patterns. In this paper, we propose ProSim,\na multimodal promptable closed-loop traffic simulation framework. ProSim allows\nthe user to give a complex set of numerical, categorical or textual prompts to\ninstruct each agent's behavior and intention. ProSim then rolls out a traffic\nscenario in a closed-loop manner, modeling each agent's interaction with other\ntraffic participants. Our experiments show that ProSim achieves high prompt\ncontrollability given different user prompts, while reaching competitive\nperformance on the Waymo Sim Agents Challenge when no prompt is given. To\nsupport research on promptable traffic simulation, we create\nProSim-Instruct-520k, a multimodal prompt-scenario paired driving dataset with\nover 10M text prompts for over 520k real-world driving scenarios. We will\nrelease code of ProSim as well as data and labeling tools of\nProSim-Instruct-520k at https://ariostgx.github.io/ProSim.\n","authors":["Shuhan Tan","Boris Ivanovic","Yuxiao Chen","Boyi Li","Xinshuo Weng","Yulong Cao","Philipp Krähenbühl","Marco Pavone"],"pdf_url":"https://arxiv.org/pdf/2409.05863v1.pdf","comment":"Accepted to CoRL 2024. 
Website available at\n https://ariostgx.github.io/ProSim"},{"id":"http://arxiv.org/abs/2409.05862v1","updated":"2024-09-09T17:59:13Z","published":"2024-09-09T17:59:13Z","title":"Evaluating Multiview Object Consistency in Humans and Image Models","summary":" We introduce a benchmark to directly evaluate the alignment between human\nobservers and vision models on a 3D shape inference task. We leverage an\nexperimental design from the cognitive sciences which requires zero-shot visual\ninferences about object shape: given a set of images, participants identify\nwhich contain the same/different objects, despite considerable viewpoint\nvariation. We draw from a diverse range of images that include common objects\n(e.g., chairs) as well as abstract shapes (i.e., procedurally generated\n`nonsense' objects). After constructing over 2000 unique image sets, we\nadminister these tasks to human participants, collecting 35K trials of\nbehavioral data from over 500 participants. This includes explicit choice\nbehaviors as well as intermediate measures, such as reaction time and gaze\ndata. We then evaluate the performance of common vision models (e.g., DINOv2,\nMAE, CLIP). We find that humans outperform all models by a wide margin. Using a\nmulti-scale evaluation approach, we identify underlying similarities and\ndifferences between models and humans: while human-model performance is\ncorrelated, humans allocate more time/processing on challenging trials. All\nimages, data, and code can be accessed via our project page.\n","authors":["Tyler Bonnen","Stephanie Fu","Yutong Bai","Thomas O'Connell","Yoni Friedman","Nancy Kanwisher","Joshua B. Tenenbaum","Alexei A. Efros"],"pdf_url":"https://arxiv.org/pdf/2409.05862v1.pdf","comment":"Project page: https:/tzler.github.io/MOCHI/ Code:\n https://github.com/tzler/mochi_code Huggingface dataset:\n https://huggingface.co/datasets/tzler/MOCHI"},{"id":"http://arxiv.org/abs/2408.08381v3","updated":"2024-09-09T17:58:42Z","published":"2024-08-15T18:54:31Z","title":"Pre-processing and Compression: Understanding Hidden Representation\n Refinement Across Imaging Domains via Intrinsic Dimension","summary":" In recent years, there has been interest in how geometric properties such as\nintrinsic dimension (ID) of a neural network's hidden representations change\nthrough its layers, and how such properties are predictive of important model\nbehavior such as generalization ability. However, evidence has begun to emerge\nthat such behavior can change significantly depending on the domain of the\nnetwork's training data, such as natural versus medical images. Here, we\nfurther this inquiry by exploring how the ID of a network's learned\nrepresentations changes through its layers, in essence, characterizing how the\nnetwork successively refines the information content of input data to be used\nfor predictions. Analyzing eleven natural and medical image datasets across six\nnetwork architectures, we find that how ID changes through the network differs\nnoticeably between natural and medical image models. Specifically, medical\nimage models peak in representation ID earlier in the network, implying a\ndifference in the image features and their abstractness that are typically used\nfor downstream tasks in these domains. Additionally, we discover a strong\ncorrelation of this peak representation ID with the ID of the data in its input\nspace, implying that the intrinsic information content of a model's learned\nrepresentations is guided by that of the data it was trained on. 
Overall, our\nfindings emphasize notable discrepancies in network behavior between natural\nand non-natural imaging domains regarding hidden representation information\ncontent, and provide further insights into how a network's learned features are\nshaped by its training data.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2408.08381v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05847v1","updated":"2024-09-09T17:45:45Z","published":"2024-09-09T17:45:45Z","title":"LSVOS Challenge Report: Large-scale Complex and Long Video Object\n Segmentation","summary":" Despite the promising performance of current video segmentation models on\nexisting benchmarks, these models still struggle with complex scenes. In this\npaper, we introduce the 6th Large-scale Video Object Segmentation (LSVOS)\nchallenge in conjunction with ECCV 2024 workshop. This year's challenge\nincludes two tasks: Video Object Segmentation (VOS) and Referring Video Object\nSegmentation (RVOS). In this year, we replace the classic YouTube-VOS and\nYouTube-RVOS benchmark with latest datasets MOSE, LVOS, and MeViS to assess VOS\nunder more challenging complex environments. This year's challenge attracted\n129 registered teams from more than 20 institutes across over 8 countries. This\nreport include the challenge and dataset introduction, and the methods used by\ntop 7 teams in two tracks. More details can be found in our homepage\nhttps://lsvos.github.io/.\n","authors":["Henghui Ding","Lingyi Hong","Chang Liu","Ning Xu","Linjie Yang","Yuchen Fan","Deshui Miao","Yameng Gu","Xin Li","Zhenyu He","Yaowei Wang","Ming-Hsuan Yang","Jinming Chai","Qin Ma","Junpei Zhang","Licheng Jiao","Fang Liu","Xinyu Liu","Jing Zhang","Kexin Zhang","Xu Liu","LingLing Li","Hao Fang","Feiyu Pan","Xiankai Lu","Wei Zhang","Runmin Cong","Tuyen Tran","Bin Cao","Yisi Zhang","Hanyi Wang","Xingjian He","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2409.05847v1.pdf","comment":"ECCV 2024 LSVOS Challenge Report: https://lsvos.github.io/"},{"id":"http://arxiv.org/abs/2409.05834v1","updated":"2024-09-09T17:40:30Z","published":"2024-09-09T17:40:30Z","title":"Vision-Driven 2D Supervised Fine-Tuning Framework for Bird's Eye View\n Perception","summary":" Visual bird's eye view (BEV) perception, due to its excellent perceptual\ncapabilities, is progressively replacing costly LiDAR-based perception systems,\nespecially in the realm of urban intelligent driving. However, this type of\nperception still relies on LiDAR data to construct ground truth databases, a\nprocess that is both cumbersome and time-consuming. Moreover, most massproduced\nautonomous driving systems are only equipped with surround camera sensors and\nlack LiDAR data for precise annotation. To tackle this challenge, we propose a\nfine-tuning method for BEV perception network based on visual 2D semantic\nperception, aimed at enhancing the model's generalization capabilities in new\nscene data. Considering the maturity and development of 2D perception\ntechnologies, our method significantly reduces the dependency on high-cost BEV\nground truths and shows promising industrial application prospects. 
Extensive\nexperiments and comparative analyses conducted on the nuScenes and Waymo public\ndatasets demonstrate the effectiveness of our proposed method.\n","authors":["Lei He","Qiaoyi Wang","Honglin Sun","Qing Xu","Bolin Gao","Shengbo Eben Li","Jianqiang Wang","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.05834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01322v2","updated":"2024-09-09T17:38:21Z","published":"2024-09-02T15:21:46Z","title":"Guide-and-Rescale: Self-Guidance Mechanism for Effective Tuning-Free\n Real Image Editing","summary":" Despite recent advances in large-scale text-to-image generative models,\nmanipulating real images with these models remains a challenging problem. The\nmain limitations of existing editing methods are that they either fail to\nperform with consistent quality on a wide range of image edits or require\ntime-consuming hyperparameter tuning or fine-tuning of the diffusion model to\npreserve the image-specific appearance of the input image. We propose a novel\napproach that is built upon a modified diffusion sampling process via the\nguidance mechanism. In this work, we explore the self-guidance technique to\npreserve the overall structure of the input image and its local regions\nappearance that should not be edited. In particular, we explicitly introduce\nlayout-preserving energy functions that are aimed to save local and global\nstructures of the source image. Additionally, we propose a noise rescaling\nmechanism that allows to preserve noise distribution by balancing the norms of\nclassifier-free guidance and our proposed guiders during generation. Such a\nguiding approach does not require fine-tuning the diffusion model and exact\ninversion process. As a result, the proposed method provides a fast and\nhigh-quality editing mechanism. In our experiments, we show through human\nevaluation and quantitative analysis that the proposed method allows to produce\ndesired editing which is more preferable by humans and also achieves a better\ntrade-off between editing quality and preservation of the original image. Our\ncode is available at https://github.com/FusionBrainLab/Guide-and-Rescale.\n","authors":["Vadim Titov","Madina Khalmatova","Alexandra Ivanova","Dmitry Vetrov","Aibek Alanov"],"pdf_url":"https://arxiv.org/pdf/2409.01322v2.pdf","comment":"Accepted to ECCV 2024. The project page is available at\n https://fusionbrainlab.github.io/Guide-and-Rescale"},{"id":"http://arxiv.org/abs/2405.14977v2","updated":"2024-09-09T17:33:57Z","published":"2024-05-23T18:27:07Z","title":"A Lost Opportunity for Vision-Language Models: A Comparative Study of\n Online Test-Time Adaptation for Vision-Language Models","summary":" In deep learning, maintaining model robustness against distribution shifts is\ncritical. This work explores a broad range of possibilities to adapt\nvision-language foundation models at test-time, with a particular emphasis on\nCLIP and its variants. The study systematically examines prompt-based\ntechniques and existing test-time adaptation methods, aiming to improve the\nrobustness under distribution shift in diverse real-world scenarios.\nSpecifically, the investigation covers various prompt engineering strategies,\nincluding handcrafted prompts, prompt ensembles, and prompt learning\ntechniques. Additionally, we introduce a vision-text-space ensemble that\nsubstantially enhances average performance compared to text-space-only\nensembles. 
Since online test-time adaptation has shown to be effective to\nmitigate performance drops under distribution shift, the study extends its\nscope to evaluate the effectiveness of existing test-time adaptation methods\nthat were originally designed for vision-only classification models. Through\nextensive experimental evaluations conducted across multiple datasets and\ndiverse model architectures, the research demonstrates the effectiveness of\nthese adaptation strategies. Code is available at:\nhttps://github.com/mariodoebler/test-time-adaptation\n","authors":["Mario Döbler","Robert A. Marsden","Tobias Raichle","Bin Yang"],"pdf_url":"https://arxiv.org/pdf/2405.14977v2.pdf","comment":"Accepted at ECCV 2024 OOD-CV Workshop"},{"id":"http://arxiv.org/abs/2404.02999v2","updated":"2024-09-09T17:33:09Z","published":"2024-04-03T18:40:48Z","title":"MeshBrush: Painting the Anatomical Mesh with Neural Stylization for\n Endoscopy","summary":" Style transfer is a promising approach to close the sim-to-real gap in\nmedical endoscopy. Rendering synthetic endoscopic videos by traversing\npre-operative scans (such as MRI or CT) can generate structurally accurate\nsimulations as well as ground truth camera poses and depth maps. Although\nimage-to-image (I2I) translation models such as CycleGAN can imitate realistic\nendoscopic images from these simulations, they are unsuitable for\nvideo-to-video synthesis due to the lack of temporal consistency, resulting in\nartifacts between frames. We propose MeshBrush, a neural mesh stylization\nmethod to synthesize temporally consistent videos with differentiable\nrendering. MeshBrush uses the underlying geometry of patient imaging data while\nleveraging existing I2I methods. With learned per-vertex textures, the stylized\nmesh guarantees consistency while producing high-fidelity outputs. We\ndemonstrate that mesh stylization is a promising approach for creating\nrealistic simulations for downstream tasks such as training networks and\npreoperative planning. Although our method is tested and designed for\nureteroscopy, its components are transferable to general endoscopic and\nlaparoscopic procedures. The code will be made public on GitHub.\n","authors":["John J. Han","Ayberk Acar","Nicholas Kavoussi","Jie Ying Wu"],"pdf_url":"https://arxiv.org/pdf/2404.02999v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.05819v1","updated":"2024-09-09T17:28:57Z","published":"2024-09-09T17:28:57Z","title":"GASP: Gaussian Splatting for Physic-Based Simulations","summary":" Physics simulation is paramount for modeling and utilization of 3D scenes in\nvarious real-world applications. However, its integration with state-of-the-art\n3D scene rendering techniques such as Gaussian Splatting (GS) remains\nchallenging. Existing models use additional meshing mechanisms, including\ntriangle or tetrahedron meshing, marching cubes, or cage meshes. As an\nalternative, we can modify the physics grounded Newtonian dynamics to align\nwith 3D Gaussian components. Current models take the first-order approximation\nof a deformation map, which locally approximates the dynamics by linear\ntransformations. In contrast, our Gaussian Splatting for Physics-Based\nSimulations (GASP) model uses such a map (without any modifications) and flat\nGaussian distributions, which are parameterized by three points (mesh faces).\nSubsequently, each 3D point (mesh face node) is treated as a discrete entity\nwithin a 3D space. 
Consequently, the problem of modeling Gaussian components is\nreduced to working with 3D points. Additionally, the information on mesh faces\ncan be used to incorporate further properties into the physics model,\nfacilitating the use of triangles. Resulting solution can be integrated into\nany physics engine that can be treated as a black box. As demonstrated in our\nstudies, the proposed model exhibits superior performance on a diverse range of\nbenchmark datasets designed for 3D object rendering.\n","authors":["Piotr Borycki","Weronika Smolak","Joanna Waczyńska","Marcin Mazur","Sławomir Tadeja","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2409.05819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05817v1","updated":"2024-09-09T17:23:39Z","published":"2024-09-09T17:23:39Z","title":"VFA: Vision Frequency Analysis of Foundation Models and Human","summary":" Machine learning models often struggle with distribution shifts in real-world\nscenarios, whereas humans exhibit robust adaptation. Models that better align\nwith human perception may achieve higher out-of-distribution generalization. In\nthis study, we investigate how various characteristics of large-scale computer\nvision models influence their alignment with human capabilities and robustness.\nOur findings indicate that increasing model and data size and incorporating\nrich semantic information and multiple modalities enhance models' alignment\nwith human perception and their overall robustness. Our empirical analysis\ndemonstrates a strong correlation between out-of-distribution accuracy and\nhuman alignment.\n","authors":["Mohammad-Javad Darvishi-Bayazi","Md Rifat Arefin","Jocelyn Faubert","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2409.05817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05809v1","updated":"2024-09-09T17:12:42Z","published":"2024-09-09T17:12:42Z","title":"A Flexible Framework for Universal Computational Aberration Correction\n via Automatic Lens Library Generation and Domain Adaptation","summary":" Emerging universal Computational Aberration Correction (CAC) paradigms\nprovide an inspiring solution to light-weight and high-quality imaging without\nrepeated data preparation and model training to accommodate new lens designs.\nHowever, the training databases in these approaches, i.e., the lens libraries\n(LensLibs), suffer from their limited coverage of real-world aberration\nbehaviors. In this work, we set up an OmniLens framework for universal CAC,\nconsidering both the generalization ability and flexibility. OmniLens extends\nthe idea of universal CAC to a broader concept, where a base model is trained\nfor three cases, including zero-shot CAC with the pre-trained model, few-shot\nCAC with a little lens-specific data for fine-tuning, and domain adaptive CAC\nusing domain adaptation for lens-descriptions-unknown lens. In terms of\nOmniLens's data foundation, we first propose an Evolution-based Automatic\nOptical Design (EAOD) pipeline to construct LensLib automatically, coined\nAODLib, whose diversity is enriched by an evolution framework, with\ncomprehensive constraints and a hybrid optimization strategy for achieving\nrealistic aberration behaviors. For network design, we introduce the guidance\nof high-quality codebook priors to facilitate zero-shot CAC and few-shot CAC,\nwhich enhances the model's generalization ability, while also boosting its\nconvergence in a few-shot case. 
Furthermore, based on the statistical\nobservation of dark channel priors in optical degradation, we design an\nunsupervised regularization term to adapt the base model to the target\ndescriptions-unknown lens using its aberration images without ground truth. We\nvalidate OmniLens on 4 manually designed low-end lenses with various structures\nand aberration behaviors. Remarkably, the base model trained on AODLib exhibits\nstrong generalization capabilities, achieving 97% of the lens-specific\nperformance in a zero-shot setting.\n","authors":["Qi Jiang","Yao Gao","Shaohua Gao","Zhonghua Yi","Lei Sun","Hao Shi","Kailun Yang","Kaiwei Wang","Jian Bai"],"pdf_url":"https://arxiv.org/pdf/2409.05809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05800v1","updated":"2024-09-09T17:03:43Z","published":"2024-09-09T17:03:43Z","title":"Input Space Mode Connectivity in Deep Neural Networks","summary":" We extend the concept of loss landscape mode connectivity to the input space\nof deep neural networks. Mode connectivity was originally studied within\nparameter space, where it describes the existence of low-loss paths between\ndifferent solutions (loss minimizers) obtained through gradient descent. We\npresent theoretical and empirical evidence of its presence in the input space\nof deep networks, thereby highlighting the broader nature of the phenomenon. We\nobserve that different input images with similar predictions are generally\nconnected, and for trained models, the path tends to be simple, with only a\nsmall deviation from being a linear path. Our methodology utilizes real,\ninterpolated, and synthetic inputs created using the input optimization\ntechnique for feature visualization. We conjecture that input space mode\nconnectivity in high-dimensional spaces is a geometric effect that takes place\neven in untrained models and can be explained through percolation theory. We\nexploit mode connectivity to obtain new insights about adversarial examples and\ndemonstrate its potential for adversarial detection. Additionally, we discuss\napplications for the interpretability of deep networks.\n","authors":["Jakub Vrabel","Ori Shem-Ur","Yaron Oz","David Krueger"],"pdf_url":"https://arxiv.org/pdf/2409.05800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09990v2","updated":"2024-09-09T16:59:57Z","published":"2024-05-16T11:21:02Z","title":"A Comprehensive Evaluation of Histopathology Foundation Models for\n Ovarian Cancer Subtype Classification","summary":" Large pretrained transformers are increasingly being developed as generalised\nfoundation models which can underpin powerful task-specific artificial\nintelligence models. Histopathology foundation models show great promise across\nmany tasks, but analyses have typically been limited by arbitrary\nhyperparameters that were not tuned to the specific task. We report the most\nrigorous single-task validation of histopathology foundation models to date,\nspecifically in ovarian cancer morphological subtyping. Attention-based\nmultiple instance learning classifiers were compared using three\nImageNet-pretrained feature extractors and fourteen histopathology foundation\nmodels. The training set consisted of 1864 whole slide images from 434 ovarian\ncarcinoma cases at Leeds Teaching Hospitals NHS Trust. 
Five-class\nclassification performance was evaluated through five-fold cross-validation,\nand these cross-validation models were ensembled for hold-out testing and\nexternal validation on the Transcanadian Study and OCEAN Challenge datasets.\nThe best-performing model used the H-optimus-0 foundation model, with\nfive-class balanced accuracies of 89%, 97%, and 74% in the test sets.\nNormalisations and augmentations aided the performance of the\nImageNet-pretrained ResNets, but these were still outperformed by 13 of the 14\nfoundation models. Hyperparameter tuning the downstream classifiers improved\nperformance by a median 1.9% balanced accuracy, with many improvements being\nstatistically significant. Histopathology foundation models offer a clear\nbenefit to ovarian cancer subtyping, improving classification performance to a\ndegree where clinical utility is tangible, albeit with an increased\ncomputational burden. Such models could provide a second opinion to\nhistopathologists diagnosing challenging cases and may improve the accuracy,\nobjectivity, and efficiency of pathological diagnoses overall.\n","authors":["Jack Breen","Katie Allen","Kieran Zucker","Lucy Godson","Nicolas M. Orsi","Nishant Ravikumar"],"pdf_url":"https://arxiv.org/pdf/2405.09990v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.19227v5","updated":"2024-09-09T16:51:21Z","published":"2024-04-30T03:13:06Z","title":"Espresso: Robust Concept Filtering in Text-to-Image Models","summary":" Diffusion based text-to-image models are trained on large datasets scraped\nfrom the Internet, potentially containing unacceptable concepts (e.g.,\ncopyright infringing or unsafe). We need concept removal techniques (CRTs)\nwhich are effective in preventing the generation of images with unacceptable\nconcepts, utility-preserving on acceptable concepts, and robust against evasion\nwith adversarial prompts. None of the prior CRTs satisfy all these requirements\nsimultaneously. We introduce Espresso, the first robust concept filter based on\nContrastive Language-Image Pre-Training (CLIP). We configure CLIP to identify\nunacceptable concepts in generated images using the distance of their\nembeddings to the text embeddings of both unacceptable and acceptable concepts.\nThis lets us fine-tune for robustness by separating the text embeddings of\nunacceptable and acceptable concepts while preserving their pairing with image\nembeddings for utility. We present a pipeline to evaluate various CRTs, attacks\nagainst them, and show that Espresso, is more effective and robust than prior\nCRTs, while retaining utility.\n","authors":["Anudeep Das","Vasisht Duddu","Rui Zhang","N. Asokan"],"pdf_url":"https://arxiv.org/pdf/2404.19227v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05786v1","updated":"2024-09-09T16:48:42Z","published":"2024-09-09T16:48:42Z","title":"Leveraging Object Priors for Point Tracking","summary":" Point tracking is a fundamental problem in computer vision with numerous\napplications in AR and robotics. A common failure mode in long-term point\ntracking occurs when the predicted point leaves the object it belongs to and\nlands on the background or another object. We identify this as the failure to\ncorrectly capture objectness properties in learning to track. To address this\nlimitation of prior work, we propose a novel objectness regularization approach\nthat guides points to be aware of object priors by forcing them to stay inside\nthe the boundaries of object instances. 
By capturing objectness cues at\ntraining time, we avoid the need to compute object masks during testing. In\naddition, we leverage contextual attention to enhance the feature\nrepresentation for capturing objectness at the feature level more effectively.\nAs a result, our approach achieves state-of-the-art performance on three point\ntracking benchmarks, and we further validate the effectiveness of our\ncomponents via ablation studies. The source code is available at:\nhttps://github.com/RehgLab/tracking_objectness\n","authors":["Bikram Boote","Anh Thai","Wenqi Jia","Ozgur Kara","Stefan Stojanov","James M. Rehg","Sangmin Lee"],"pdf_url":"https://arxiv.org/pdf/2409.05786v1.pdf","comment":"ECCV 2024 ILR Workshop"},{"id":"http://arxiv.org/abs/2407.11820v2","updated":"2024-09-09T16:43:34Z","published":"2024-07-16T15:08:30Z","title":"Stepping Stones: A Progressive Training Strategy for Audio-Visual\n Semantic Segmentation","summary":" Audio-Visual Segmentation (AVS) aims to achieve pixel-level localization of\nsound sources in videos, while Audio-Visual Semantic Segmentation (AVSS), as an\nextension of AVS, further pursues semantic understanding of audio-visual\nscenes. However, since the AVSS task requires the establishment of audio-visual\ncorrespondence and semantic understanding simultaneously, we observe that\nprevious methods have struggled to handle this mashup of objectives in\nend-to-end training, resulting in insufficient learning and sub-optimization.\nTherefore, we propose a two-stage training strategy called \\textit{Stepping\nStones}, which decomposes the AVSS task into two simple subtasks from\nlocalization to semantic understanding, which are fully optimized in each stage\nto achieve step-by-step global optimization. This training strategy has also\nproved its generalization and effectiveness on existing methods. To further\nimprove the performance of AVS tasks, we propose a novel framework Adaptive\nAudio Visual Segmentation, in which we incorporate an adaptive audio query\ngenerator and integrate masked attention into the transformer decoder,\nfacilitating the adaptive fusion of visual and audio features. Extensive\nexperiments demonstrate that our methods achieve state-of-the-art results on\nall three AVS benchmarks. The project homepage can be accessed at\nhttps://gewu-lab.github.io/stepping_stones/.\n","authors":["Juncheng Ma","Peiwen Sun","Yaoting Wang","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2407.11820v2.pdf","comment":"ECCV2024 poster. Project url:\n https://gewu-lab.github.io/stepping_stones"},{"id":"http://arxiv.org/abs/2407.19001v2","updated":"2024-09-09T16:43:23Z","published":"2024-07-26T17:59:51Z","title":"PromptCCD: Learning Gaussian Mixture Prompt Pool for Continual Category\n Discovery","summary":" We tackle the problem of Continual Category Discovery (CCD), which aims to\nautomatically discover novel categories in a continuous stream of unlabeled\ndata while mitigating the challenge of catastrophic forgetting -- an open\nproblem that persists even in conventional, fully supervised continual\nlearning. To address this challenge, we propose PromptCCD, a simple yet\neffective framework that utilizes a Gaussian Mixture Model (GMM) as a prompting\nmethod for CCD. 
At the core of PromptCCD lies the Gaussian Mixture Prompting\n(GMP) module, which acts as a dynamic pool that updates over time to facilitate\nrepresentation learning and prevent forgetting during category discovery.\nMoreover, GMP enables on-the-fly estimation of category numbers, allowing\nPromptCCD to discover categories in unlabeled data without prior knowledge of\nthe category numbers. We extend the standard evaluation metric for Generalized\nCategory Discovery (GCD) to CCD and benchmark state-of-the-art methods on\ndiverse public datasets. PromptCCD significantly outperforms existing methods,\ndemonstrating its effectiveness. Project page:\nhttps://visual-ai.github.io/promptccd .\n","authors":["Fernando Julio Cendra","Bingchen Zhao","Kai Han"],"pdf_url":"https://arxiv.org/pdf/2407.19001v2.pdf","comment":"ECCV 2024, Project page: https://visual-ai.github.io/promptccd"},{"id":"http://arxiv.org/abs/2409.01421v2","updated":"2024-09-09T16:37:39Z","published":"2024-09-02T18:57:07Z","title":"DiffCSG: Differentiable CSG via Rasterization","summary":" Differentiable rendering is a key ingredient for inverse rendering and\nmachine learning, as it allows to optimize scene parameters (shape, materials,\nlighting) to best fit target images. Differentiable rendering requires that\neach scene parameter relates to pixel values through differentiable operations.\nWhile 3D mesh rendering algorithms have been implemented in a differentiable\nway, these algorithms do not directly extend to Constructive-Solid-Geometry\n(CSG), a popular parametric representation of shapes, because the underlying\nboolean operations are typically performed with complex black-box\nmesh-processing libraries. We present an algorithm, DiffCSG, to render CSG\nmodels in a differentiable manner. Our algorithm builds upon CSG rasterization,\nwhich displays the result of boolean operations between primitives without\nexplicitly computing the resulting mesh and, as such, bypasses black-box mesh\nprocessing. We describe how to implement CSG rasterization within a\ndifferentiable rendering pipeline, taking special care to apply antialiasing\nalong primitive intersections to obtain gradients in such critical areas. Our\nalgorithm is simple and fast, can be easily incorporated into modern machine\nlearning setups, and enables a range of applications for computer-aided design,\nincluding direct and image-based editing of CSG primitives. Code and data:\nhttps://yyyyyhc.github.io/DiffCSG/.\n","authors":["Haocheng Yuan","Adrien Bousseau","Hao Pan","Chengquan Zhang","Niloy J. Mitra","Changjian Li"],"pdf_url":"https://arxiv.org/pdf/2409.01421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05773v1","updated":"2024-09-09T16:34:36Z","published":"2024-09-09T16:34:36Z","title":"Creativity and Visual Communication from Machine to Musician: Sharing a\n Score through a Robotic Camera","summary":" This paper explores the integration of visual communication and musical\ninteraction by implementing a robotic camera within a \"Guided Harmony\" musical\ngame. We aim to examine co-creative behaviors between human musicians and\nrobotic systems. Our research explores existing methodologies like\nimprovisational game pieces and extends these concepts to include robotic\nparticipation using a PTZ camera. The robotic system interprets and responds to\nnonverbal cues from musicians, creating a collaborative and adaptive musical\nexperience. This initial case study underscores the importance of intuitive\nvisual communication channels. 
We also propose future research directions,\nincluding parameters for refining the visual cue toolkit and data collection\nmethods to understand human-machine co-creativity further. Our findings\ncontribute to the broader understanding of machine intelligence in augmenting\nhuman creativity, particularly in musical settings.\n","authors":["Ross Greer","Laura Fleig","Shlomo Dubnov"],"pdf_url":"https://arxiv.org/pdf/2409.05773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05770v1","updated":"2024-09-09T16:33:00Z","published":"2024-09-09T16:33:00Z","title":"Consensus-based Distributed Quantum Kernel Learning for Speech\n Recognition","summary":" This paper presents a Consensus-based Distributed Quantum Kernel Learning\n(CDQKL) framework aimed at improving speech recognition through distributed\nquantum computing.CDQKL addresses the challenges of scalability and data\nprivacy in centralized quantum kernel learning. It does this by distributing\ncomputational tasks across quantum terminals, which are connected through\nclassical channels. This approach enables the exchange of model parameters\nwithout sharing local training data, thereby maintaining data privacy and\nenhancing computational efficiency. Experimental evaluations on benchmark\nspeech emotion recognition datasets demonstrate that CDQKL achieves competitive\nclassification accuracy and scalability compared to centralized and local\nquantum kernel learning models. The distributed nature of CDQKL offers\nadvantages in privacy preservation and computational efficiency, making it\nsuitable for data-sensitive fields such as telecommunications, automotive, and\nfinance. The findings suggest that CDQKL can effectively leverage distributed\nquantum computing for large-scale machine-learning tasks.\n","authors":["Kuan-Cheng Chen","Wenxuan Ma","Xiaotian Xu"],"pdf_url":"https://arxiv.org/pdf/2409.05770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16729v2","updated":"2024-09-09T16:27:00Z","published":"2024-08-29T17:20:59Z","title":"Prediction-Feedback DETR for Temporal Action Detection","summary":" Temporal Action Detection (TAD) is fundamental yet challenging for real-world\nvideo applications. Leveraging the unique benefits of transformers, various\nDETR-based approaches have been adopted in TAD. However, it has recently been\nidentified that the attention collapse in self-attention causes the performance\ndegradation of DETR for TAD. Building upon previous research, this paper newly\naddresses the attention collapse problem in cross-attention within DETR-based\nTAD methods. Moreover, our findings reveal that cross-attention exhibits\npatterns distinct from predictions, indicating a short-cut phenomenon. To\nresolve this, we propose a new framework, Prediction-Feedback DETR (Pred-DETR),\nwhich utilizes predictions to restore the collapse and align the cross- and\nself-attention with predictions. Specifically, we devise novel\nprediction-feedback objectives using guidance from the relations of the\npredictions. 
As a result, Pred-DETR significantly alleviates the collapse and\nachieves state-of-the-art performance among DETR-based methods on various\nchallenging benchmarks including THUMOS14, ActivityNet-v1.3, HACS, and\nFineAction.\n","authors":["Jihwan Kim","Miso Lee","Cheol-Ho Cho","Jihyun Lee","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2408.16729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02349v2","updated":"2024-09-09T16:17:29Z","published":"2024-02-04T05:25:12Z","title":"3D Lymphoma Segmentation on PET/CT Images via Multi-Scale Information\n Fusion with Cross-Attention","summary":" Background: Accurate segmentation of diffuse large B-cell lymphoma (DLBCL)\nlesions is challenging due to their complex patterns in medical imaging.\n Objective: This study aims to develop a precise segmentation method for DLBCL\nusing 18F-Fluorodeoxyglucose (FDG) positron emission tomography (PET) and\ncomputed tomography (CT) images.\n Methods: We propose a 3D dual-branch encoder segmentation method using\nshifted window transformers and a Multi-Scale Information Fusion (MSIF) module.\nTo enhance feature integration, the MSIF module performs multi-scale feature\nfusion using cross-attention mechanisms with a shifted window framework. A\ngated neural network within the MSIF module dynamically balances the\ncontributions from each modality. The model was optimized using the Dice\nSimilarity Coefficient (DSC) loss function. Additionally, we computed the total\nmetabolic tumor volume (TMTV) and performed statistical analyses.\n Results: The model was trained and validated on a dataset of 165 DLBCL\npatients using 5-fold cross-validation, achieving a DSC of 0.7512. Statistical\nanalysis showed a significant improvement over comparative methods (p < 0.05).\nAdditionally, a Pearson correlation coefficient of 0.91 and an R^2 of 0.89 were\nobserved when comparing manual annotations to segmentation results for TMTV\nmeasurement.\n Conclusion: This study presents an effective automatic segmentation method\nfor DLBCL that leverages the complementary strengths of PET and CT imaging. Our\nmethod has the potential to improve diagnostic interpretations and assist in\ntreatment planning for DLBCL patients.\n","authors":["Huan Huang","Liheng Qiu","Shenmiao Yang","Longxi Li","Jiaofen Nan","Yanting Li","Chuang Han","Fubao Zhu","Chen Zhao","Weihua Zhou"],"pdf_url":"https://arxiv.org/pdf/2402.02349v2.pdf","comment":"19 pages, 7 figures; reference added"},{"id":"http://arxiv.org/abs/2408.13152v2","updated":"2024-09-09T16:16:19Z","published":"2024-08-23T15:20:53Z","title":"Long-term Pre-training for Temporal Action Detection with Transformers","summary":" Temporal action detection (TAD) is challenging, yet fundamental for\nreal-world video applications. Recently, DETR-based models for TAD have been\nprevailing thanks to their unique benefits. However, transformers demand a huge\ndataset, and unfortunately data scarcity in TAD causes a severe degeneration.\nIn this paper, we identify two crucial problems from data scarcity: attention\ncollapse and imbalanced performance. To this end, we propose a new pre-training\nstrategy, Long-Term Pre-training (LTP), tailored for transformers. LTP has two\nmain components: 1) class-wise synthesis, 2) long-term pretext tasks. Firstly,\nwe synthesize long-form video features by merging video snippets of a target\nclass and non-target classes. They are analogous to untrimmed data used in TAD,\ndespite being created from trimmed data. 
In addition, we devise two types of\nlong-term pretext tasks to learn long-term dependency. They impose long-term\nconditions such as finding second-to-fourth or short-duration actions. Our\nextensive experiments show state-of-the-art performances in DETR-based methods\non ActivityNet-v1.3 and THUMOS14 by a large margin. Moreover, we demonstrate\nthat LTP significantly relieves the data scarcity issues in TAD.\n","authors":["Jihwan Kim","Miso Lee","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2408.13152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18917v5","updated":"2024-09-09T16:16:05Z","published":"2023-10-29T06:10:46Z","title":"TivNe-SLAM: Dynamic Mapping and Tracking via Time-Varying Neural\n Radiance Fields","summary":" Previous attempts to integrate Neural Radiance Fields (NeRF) into the\nSimultaneous Localization and Mapping (SLAM) framework either rely on the\nassumption of static scenes or require the ground truth camera poses, which\nimpedes their application in real-world scenarios. This paper proposes a\ntime-varying representation to track and reconstruct the dynamic scenes.\nFirstly, two processes, a tracking process and a mapping process, are\nmaintained simultaneously in our framework. In the tracking process, all input\nimages are uniformly sampled and then progressively trained in a\nself-supervised paradigm. In the mapping process, we leverage motion masks to\ndistinguish dynamic objects from the static background, and sample more pixels\nfrom dynamic areas. Secondly, the parameter optimization for both processes is\ncomprised of two stages: the first stage associates time with 3D positions to\nconvert the deformation field to the canonical field. The second stage\nassociates time with the embeddings of the canonical field to obtain colors and\na Signed Distance Function (SDF). Lastly, we propose a novel keyframe selection\nstrategy based on the overlapping rate. Our approach is evaluated on two\nsynthetic datasets and one real-world dataset, and the experiments validate\nthat our method achieves competitive results in both tracking and mapping when\ncompared to existing state-of-the-art NeRF-based dynamic SLAM systems.\n","authors":["Chengyao Duan","Zhiliu Yang"],"pdf_url":"https://arxiv.org/pdf/2310.18917v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05749v1","updated":"2024-09-09T16:03:26Z","published":"2024-09-09T16:03:26Z","title":"ReL-SAR: Representation Learning for Skeleton Action Recognition with\n Convolutional Transformers and BYOL","summary":" To extract robust and generalizable skeleton action recognition features,\nlarge amounts of well-curated data are typically required, which is a\nchallenging task hindered by annotation and computation costs. Therefore,\nunsupervised representation learning is of prime importance to leverage\nunlabeled skeleton data. In this work, we investigate unsupervised\nrepresentation learning for skeleton action recognition. For this purpose, we\ndesigned a lightweight convolutional transformer framework, named ReL-SAR,\nexploiting the complementarity of convolutional and attention layers for\njointly modeling spatial and temporal cues in skeleton sequences. We also use a\nSelection-Permutation strategy for skeleton joints to ensure more informative\ndescriptions from skeletal data. Finally, we capitalize on Bootstrap Your Own\nLatent (BYOL) to learn robust representations from unlabeled skeleton sequence\ndata. 
We achieved very competitive results on limited-size datasets: MCAD,\nIXMAS, JHMDB, and NW-UCLA, showing the effectiveness of our proposed method\nagainst state-of-the-art methods in terms of both performance and computational\nefficiency. To ensure reproducibility and reusability, the source code\nincluding all implementation parameters is provided at:\nhttps://github.com/SafwenNaimi/Representation-Learning-for-Skeleton-Action-Recognition-with-Convolutional-Transformers-and-BYOL\n","authors":["Safwen Naimi","Wassim Bouachir","Guillaume-Alexandre Bilodeau"],"pdf_url":"https://arxiv.org/pdf/2409.05749v1.pdf","comment":"8 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2311.18799v2","updated":"2024-09-09T16:00:04Z","published":"2023-11-30T18:43:51Z","title":"X-InstructBLIP: A Framework for aligning X-Modal instruction-aware\n representations to LLMs and Emergent Cross-modal Reasoning","summary":" Recent research has achieved significant advancements in visual reasoning\ntasks through learning image-to-language projections and leveraging the\nimpressive reasoning abilities of Large Language Models (LLMs). This paper\nintroduces an efficient and effective framework that integrates multiple\nmodalities (images, 3D, audio and video) to a frozen LLM and demonstrates an\nemergent ability for cross-modal reasoning (2+ modality inputs). Our approach\nexplores two distinct projection mechanisms: Q-Formers and Linear Projections\n(LPs). Through extensive experimentation across all four modalities on 16\nbenchmarks, we explore both methods and assess their adaptability in integrated\nand separate cross-modal reasoning. The Q-Former projection demonstrates\nsuperior performance in single modality scenarios and adaptability in joint\nversus discriminative reasoning involving two or more modalities. However, it\nexhibits lower generalization capabilities than linear projection in contexts\nwhere task-modality data are limited. To enable this framework, we devise a\nscalable pipeline that automatically generates high-quality, instruction-tuning\ndatasets from readily available captioning data across different modalities,\nand contribute 24K QA data for audio and 250K QA data for 3D. To facilitate\nfurther research in cross-modal reasoning, we introduce the DisCRn\n(Discriminative Cross-modal Reasoning) benchmark comprising 9K audio-video QA\nsamples and 28K image-3D QA samples that require the model to reason\ndiscriminatively across disparate input modalities.\n","authors":["Artemis Panagopoulou","Le Xue","Ning Yu","Junnan Li","Dongxu Li","Shafiq Joty","Ran Xu","Silvio Savarese","Caiming Xiong","Juan Carlos Niebles"],"pdf_url":"https://arxiv.org/pdf/2311.18799v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05742v1","updated":"2024-09-09T15:56:34Z","published":"2024-09-09T15:56:34Z","title":"Robust Loss Functions for Object Grasping under Limited Ground Truth","summary":" Object grasping is a crucial technology enabling robots to perceive and\ninteract with the environment sufficiently. However, in practical applications,\nresearchers are faced with missing or noisy ground truth while training the\nconvolutional neural network, which decreases the accuracy of the model.\nTherefore, different loss functions are proposed to deal with these problems to\nimprove the accuracy of the neural network. For missing ground truth, a new\npredicted category probability method is defined for unlabeled samples, which\nworks effectively in conjunction with the pseudo-labeling method. 
Furthermore,\nfor noisy ground truth, a symmetric loss function is introduced to resist the\ncorruption of label noises. The proposed loss functions are powerful, robust,\nand easy to use. Experimental results based on the typical grasping neural\nnetwork show that our method can improve performance by 2 to 13 percent.\n","authors":["Yangfan Deng","Mengyao Zhang","Yong Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.05742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03887v2","updated":"2024-09-09T15:53:27Z","published":"2024-09-05T19:50:26Z","title":"The Influence of Faulty Labels in Data Sets on Human Pose Estimation","summary":" In this study we provide empirical evidence demonstrating that the quality of\ntraining data impacts model performance in Human Pose Estimation (HPE).\nInaccurate labels in widely used data sets, ranging from minor errors to severe\nmislabeling, can negatively influence learning and distort performance metrics.\nWe perform an in-depth analysis of popular HPE data sets to show the extent and\nnature of label inaccuracies. Our findings suggest that accounting for the\nimpact of faulty labels will facilitate the development of more robust and\naccurate HPE models for a variety of real-world applications. We show improved\nperformance with cleansed data.\n","authors":["Arnold Schwarz","Levente Hernadi","Felix Bießmann","Kristian Hildebrand"],"pdf_url":"https://arxiv.org/pdf/2409.03887v2.pdf","comment":"15 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2409.05721v1","updated":"2024-09-09T15:33:07Z","published":"2024-09-09T15:33:07Z","title":"Referring Expression Generation in Visually Grounded Dialogue with\n Discourse-aware Comprehension Guiding","summary":" We propose an approach to referring expression generation (REG) in visually\ngrounded dialogue that is meant to produce referring expressions (REs) that are\nboth discriminative and discourse-appropriate. Our method constitutes a\ntwo-stage process. First, we model REG as a text- and image-conditioned\nnext-token prediction task. REs are autoregressively generated based on their\npreceding linguistic context and a visual representation of the referent.\nSecond, we propose the use of discourse-aware comprehension guiding as part of\na generate-and-rerank strategy through which candidate REs generated with our\nREG model are reranked based on their discourse-dependent discriminatory power.\nResults from our human evaluation indicate that our proposed two-stage approach\nis effective in producing discriminative REs, with higher performance in terms\nof text-image retrieval accuracy for reranked REs compared to those generated\nusing greedy decoding.\n","authors":["Bram Willemsen","Gabriel Skantze"],"pdf_url":"https://arxiv.org/pdf/2409.05721v1.pdf","comment":"Accepted for publication at INLG 2024"},{"id":"http://arxiv.org/abs/2409.05699v1","updated":"2024-09-09T15:12:28Z","published":"2024-09-09T15:12:28Z","title":"Boosting CNN-based Handwriting Recognition Systems with Learnable\n Relaxation Labeling","summary":" The primary challenge for handwriting recognition systems lies in managing\nlong-range contextual dependencies, an issue that traditional models often\nstruggle with. To mitigate it, attention mechanisms have recently been employed\nto enhance context-aware labelling, thereby achieving state-of-the-art\nperformance. 
In the field of pattern recognition and image analysis, however,\nthe use of contextual information in labelling problems has a long history and\ngoes back at least to the early 1970's. Among the various approaches developed\nin those years, Relaxation Labelling (RL) processes have played a prominent\nrole and have been the method of choice in the field for more than a decade.\nContrary to recent transformer-based architectures, RL processes offer a\nprincipled approach to the use of contextual constraints, having a solid\ntheoretic foundation grounded on variational inequality and game theory, as\nwell as effective algorithms with convergence guarantees. In this paper, we\npropose a novel approach to handwriting recognition that integrates the\nstrengths of two distinct methodologies. In particular, we propose integrating\n(trainable) RL processes with various well-established neural architectures and\nwe introduce a sparsification technique that accelerates the convergence of the\nalgorithm and enhances the overall system's performance. Experiments over\nseveral benchmark datasets show that RL processes can improve the\ngeneralisation ability, even surpassing in some cases transformer-based\narchitectures.\n","authors":["Sara Ferro","Alessandro Torcinovich","Arianna Traviglia","Marcello Pelillo"],"pdf_url":"https://arxiv.org/pdf/2409.05699v1.pdf","comment":"26 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.05697v1","updated":"2024-09-09T15:11:45Z","published":"2024-09-09T15:11:45Z","title":"Segmentation by Factorization: Unsupervised Semantic Segmentation for\n Pathology by Factorizing Foundation Model Features","summary":" We introduce Segmentation by Factorization (F-SEG), an unsupervised\nsegmentation method for pathology that generates segmentation masks from\npre-trained deep learning models. F-SEG allows the use of pre-trained deep\nneural networks, including recently developed pathology foundation models, for\nsemantic segmentation. It achieves this without requiring additional training\nor finetuning, by factorizing the spatial features extracted by the models into\nsegmentation masks and their associated concept features. We create generic\ntissue phenotypes for H&E images by training clustering models for multiple\nnumbers of clusters on features extracted from several deep learning models on\nThe Cancer Genome Atlas Program (TCGA), and then show how the clusters can be\nused for factorizing corresponding segmentation masks using off-the-shelf deep\nlearning models. Our results show that F-SEG provides robust unsupervised\nsegmentation capabilities for H&E pathology images, and that the segmentation\nquality is greatly improved by utilizing pathology foundation models. We\ndiscuss and propose methods for evaluating the performance of unsupervised\nsegmentation in pathology.\n","authors":["Jacob Gildenblat","Ofir Hadar"],"pdf_url":"https://arxiv.org/pdf/2409.05697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04398v2","updated":"2024-09-09T15:08:06Z","published":"2024-09-06T16:43:04Z","title":"HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale\n Space Using Wearable IMUs and LiDAR","summary":" We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture\nmethod, aimed at accurately and efficiently creating a dynamic digital world,\ncontaining large-scale indoor-outdoor scenes, diverse human motions, rich\nhuman-human interactions, and human-environment interactions. 
By utilizing\nbody-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human\nmotions in unconstrained space without the need for external devices and\npre-built maps. This affords great flexibility and accessibility for\nhuman-centered interaction and 4D scene capturing in various environments.\nTaking into account that IMUs can capture human spatially unrestricted poses\nbut are prone to drifting for long-period using, and while LiDAR is stable for\nglobal localization but rough for local positions and orientations, HiSC4D\nemploys a joint optimization method, harmonizing all sensors and utilizing\nenvironment cues, yielding promising results for long-term capture in large\nscenes. To promote research of egocentric human interaction in large scenes and\nfacilitate downstream tasks, we also present a dataset, containing 8 sequences\nin 4 large scenes (200 to 5,000 $m^2$), providing 36k frames of accurate 4D\nhuman motions with SMPL annotations and dynamic scenes, 31k frames of cropped\nhuman point clouds, and scene mesh of the environment. A variety of scenarios,\nsuch as the basketball gym and commercial street, alongside challenging human\nmotions, such as daily greeting, one-on-one basketball playing, and tour\nguiding, demonstrate the effectiveness and the generalization ability of\nHiSC4D. The dataset and code will be publicated on\nwww.lidarhumanmotion.net/hisc4d available for research purposes.\n","authors":["Yudi Dai","Zhiyong Wang","Xiping Lin","Chenglu Wen","Lan Xu","Siqi Shen","Yuexin Ma","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04398v2.pdf","comment":"17 pages, 10 figures, Jornal"},{"id":"http://arxiv.org/abs/2409.05688v1","updated":"2024-09-09T15:01:29Z","published":"2024-09-09T15:01:29Z","title":"LayeredFlow: A Real-World Benchmark for Non-Lambertian Multi-Layer\n Optical Flow","summary":" Achieving 3D understanding of non-Lambertian objects is an important task\nwith many useful applications, but most existing algorithms struggle to deal\nwith such objects. One major obstacle towards progress in this field is the\nlack of holistic non-Lambertian benchmarks -- most benchmarks have low scene\nand object diversity, and none provide multi-layer 3D annotations for objects\noccluded by transparent surfaces. In this paper, we introduce LayeredFlow, a\nreal world benchmark containing multi-layer ground truth annotation for optical\nflow of non-Lambertian objects. Compared to previous benchmarks, our benchmark\nexhibits greater scene and object diversity, with 150k high quality optical\nflow and stereo pairs taken over 185 indoor and outdoor scenes and 360 unique\nobjects. Using LayeredFlow as evaluation data, we propose a new task called\nmulti-layer optical flow. To provide training data for this task, we introduce\na large-scale densely-annotated synthetic dataset containing 60k images within\n30 scenes tailored for non-Lambertian objects. Training on our synthetic\ndataset enables model to predict multi-layer optical flow, while fine-tuning\nexisting optical flow methods on the dataset notably boosts their performance\non non-Lambertian objects without compromising the performance on diffuse\nobjects. 
Data is available at https://layeredflow.cs.princeton.edu.\n","authors":["Hongyu Wen","Erich Liang","Jia Deng"],"pdf_url":"https://arxiv.org/pdf/2409.05688v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2305.09868v3","updated":"2024-09-09T15:00:32Z","published":"2023-05-17T00:45:41Z","title":"The Principle of Uncertain Maximum Entropy","summary":" The principle of maximum entropy is a well-established technique for choosing\na distribution that matches available information while minimizing bias. It\nfinds broad use across scientific disciplines and in machine learning. However,\nthe principle as defined by is susceptible to noise and error in observations.\nThis forces real-world practitioners to use relaxed versions of the principle\nin an ad hoc way, negatively impacting interpretation. To address this\nsituation, we present a new principle we call uncertain maximum entropy that\ngeneralizes the classic principle and provides interpretable solutions\nirrespective of the observational methods in use. We introduce a convex\napproximation and expectation-maximization based algorithm for finding\nsolutions to our new principle. Finally, we contrast this new technique with\ntwo simpler generally applicable solutions theoretically and experimentally\nshow our technique provides superior accuracy.\n","authors":["Kenneth Bogert","Matthew Kothe"],"pdf_url":"https://arxiv.org/pdf/2305.09868v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11500v4","updated":"2024-09-09T14:52:15Z","published":"2023-09-20T17:59:32Z","title":"Auto-ACD: A Large-scale Dataset for Audio-Language Representation\n Learning","summary":" Recently, the AI community has made significant strides in developing\npowerful foundation models, driven by large-scale multimodal datasets. However,\nfor audio representation learning, existing datasets suffer from limitations in\nthe following aspects: insufficient volume, simplistic content, and arduous\ncollection procedures. To establish an audio dataset with high-quality\ncaptions, we propose an innovative, automatic approach leveraging multimodal\ninputs, such as video frames, audio streams. Specifically, we construct a\nlarge-scale, high-quality, audio-language dataset, named as Auto-ACD,\ncomprising over 1.5M audio-text pairs. We exploit a series of pre-trained\nmodels or APIs, to determine audio-visual synchronisation, generate image\ncaptions, object detection, or audio tags for specific videos. Subsequently, we\nemploy LLM to paraphrase a congruent caption for each audio, guided by the\nextracted multi-modality clues. To demonstrate the effectiveness of the\nproposed dataset, we train widely used models on our dataset and show\nperformance improvement on various downstream tasks, for example,\naudio-language retrieval, audio captioning, zero-shot classification. 
In\naddition, we establish a novel benchmark with environmental information and\nprovide a benchmark for audio-text tasks.\n","authors":["Luoyi Sun","Xuenan Xu","Mengyue Wu","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2309.11500v4.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2409.05681v1","updated":"2024-09-09T14:49:54Z","published":"2024-09-09T14:49:54Z","title":"SX-Stitch: An Efficient VMS-UNet Based Framework for Intraoperative\n Scoliosis X-Ray Image Stitching","summary":" In scoliosis surgery, the limited field of view of the C-arm X-ray machine\nrestricts the surgeons' holistic analysis of spinal structures .This paper\npresents an end-to-end efficient and robust intraoperative X-ray image\nstitching method for scoliosis surgery,named SX-Stitch. The method is divided\ninto two stages:segmentation and stitching. In the segmentation stage, We\npropose a medical image segmentation model named Vision Mamba of Spine-UNet\n(VMS-UNet), which utilizes the state space Mamba to capture long-distance\ncontextual information while maintaining linear computational complexity, and\nincorporates the SimAM attention mechanism, significantly improving the\nsegmentation performance.In the stitching stage, we simplify the alignment\nprocess between images to the minimization of a registration energy function.\nThe total energy function is then optimized to order unordered images, and a\nhybrid energy function is introduced to optimize the best seam, effectively\neliminating parallax artifacts. On the clinical dataset, Sx-Stitch demonstrates\nsuperiority over SOTA schemes both qualitatively and quantitatively.\n","authors":["Yi Li","Heting Gao","Mingde He","Jinqian Liang","Jason Gu","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2409.05681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05680v1","updated":"2024-09-09T14:48:07Z","published":"2024-09-09T14:48:07Z","title":"Cherenkov Imaged Bio-morphological Features Verify Patient Positioning\n with Deformable Tissue Translocation in Breast Radiotherapy","summary":" Accurate patient positioning is critical for precise radiotherapy dose\ndelivery, as positioning errors can significantly affect treatment outcomes.\nThis study introduces a novel method for tracking loco-regional tissue\ndeformation through Cherenkov image analysis during fractionated breast cancer\nradiotherapy. The primary goal was to develop and test an algorithm for\nCherenkov-based regional position accuracy quantification, specifically for\nloco-regional deformations, which lack ideal quantification methods in\nradiotherapy. Blood vessel detection and segmentation were developed in\nCherenkov images using a tissue phantom with incremental movements, and later\napplied to images from fractionated whole breast radiotherapy in human patients\n(n=10). A combined rigid and non-rigid registration technique was used to\ndetect inter- and intra-fractional positioning variations. This approach\nquantified positioning variations in two parts: a global shift from rigid\nregistration and a two-dimensional variation map of loco-regional deformation\nfrom non-rigid registration. The methodology was validated using an\nanthropomorphic chest phantom experiment, where known treatment couch\ntranslations and respiratory motion were simulated to assess inter- and\nintra-fractional uncertainties, yielding an average accuracy of 0.83 mm for\ncouch translations up to 20 mm. 
Analysis of clinical Cherenkov data from ten\nbreast cancer patients showed an inter-fraction setup variation of 3.7 plus\nminus 2.4 mm relative to the first fraction and loco-regional deformations\n(95th percentile) of up to 3.3 plus minus 1.9 mm. This study presents a\nCherenkov-based approach to quantify global and local positioning variations,\ndemonstrating feasibility in addressing loco-regional deformations that\nconventional imaging techniques fail to capture.\n","authors":["Yao Chen","Savannah M. Decker","Petr Bruza","David J. Gladstone","Lesley A. Jarvis","Brian W. Pogue","Kimberley S. Samkoe","Rongxiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05680v1.pdf","comment":"25 pages, 4 figures, 1 table, journal under review"},{"id":"http://arxiv.org/abs/2409.05679v1","updated":"2024-09-09T14:47:57Z","published":"2024-09-09T14:47:57Z","title":"AnomalyCD: A benchmark for Earth anomaly change detection with\n high-resolution and time-series observations","summary":" Various Earth anomalies have destroyed the stable, balanced state, resulting\nin fatalities and serious destruction of property. With the advantages of\nlarge-scale and precise observation, high-resolution remote sensing images have\nbeen widely used for anomaly monitoring and localization. Powered by the deep\nrepresentation, the existing methods have achieved remarkable advances,\nprimarily in classification and change detection techniques. However, labeled\nsamples are difficult to acquire due to the low probability of anomaly\noccurrence, and the trained models are limited to fixed anomaly categories,\nwhich hinders the application for anomalies with few samples or unknown\nanomalies. In this paper, to tackle this problem, we propose the anomaly change\ndetection (AnomalyCD) technique, which accepts time-series observations and\nlearns to identify anomalous changes by learning from the historical normal\nchange pattern. Compared to the existing techniques, AnomalyCD processes an\nunfixed number of time steps and can localize the various anomalies in a\nunified manner, without human supervision. To benchmark AnomalyCD, we\nconstructed a high-resolution dataset with time-series images dedicated to\nvarious Earth anomalies (the AnomalyCDD dataset). AnomalyCDD contains\nhigh-resolution (from 0.15 to 2.39 m/pixel), time-series (from 3 to 7 time\nsteps), and large-scale images (1927.93 km2 in total) collected globally\nFurthermore, we developed a zero-shot baseline model (AnomalyCDM), which\nimplements the AnomalyCD technique by extracting a general representation from\nthe segment anything model (SAM) and conducting temporal comparison to\ndistinguish the anomalous changes from normal changes. AnomalyCDM is designed\nas a two-stage workflow to enhance the efficiency, and has the ability to\nprocess the unseen images directly, without retraining for each scene.\n","authors":["Jingtao Li","Qian Zhu","Xinyu Wang","Hengwei Zhao","Yanfei Zhong"],"pdf_url":"https://arxiv.org/pdf/2409.05679v1.pdf","comment":"remote sensing benchmark"},{"id":"http://arxiv.org/abs/2409.00442v2","updated":"2024-09-09T14:45:27Z","published":"2024-08-31T12:30:52Z","title":"Separation of Body and Background in Radiological Images. A Practical\n Python Code","summary":" Radiological images, such as magnetic resonance imaging (MRI) and computed\ntomography (CT) images, typically consist of a body part and a dark background.\nFor many analyses, it is necessary to separate the body part from the\nbackground. 
In this article, we present a Python code designed to separate body\nand background regions in 2D and 3D radiological images. We tested the\nalgorithm on various MRI and CT images of different body parts, including the\nbrain, neck, and abdominal regions. Additionally, we introduced a method for\nintensity normalization and outlier restriction, adjusted for data conversion\ninto 8-bit unsigned integer (UINT8) format, and examined its effects on\nbody-background separation. Our Python code is available for use with proper\ncitation.\n","authors":["Seyedeh Fahimeh Hosseini","Faezeh Shalbafzadeh","Behzad Amanpour-Gharaei"],"pdf_url":"https://arxiv.org/pdf/2409.00442v2.pdf","comment":"14 pages, 8 figures. typos corrected"},{"id":"http://arxiv.org/abs/2404.17486v2","updated":"2024-09-09T14:45:16Z","published":"2024-04-26T15:42:24Z","title":"TextGaze: Gaze-Controllable Face Generation with Natural Language","summary":" Generating face image with specific gaze information has attracted\nconsiderable attention. Existing approaches typically input gaze values\ndirectly for face generation, which is unnatural and requires annotated gaze\ndatasets for training, thereby limiting its application. In this paper, we\npresent a novel gaze-controllable face generation task. Our approach inputs\ntextual descriptions that describe human gaze and head behavior and generates\ncorresponding face images. Our work first introduces a text-of-gaze dataset\ncontaining over 90k text descriptions spanning a dense distribution of gaze and\nhead poses. We further propose a gaze-controllable text-to-face method. Our\nmethod contains a sketch-conditioned face diffusion module and a model-based\nsketch diffusion module. We define a face sketch based on facial landmarks and\neye segmentation map. The face diffusion module generates face images from the\nface sketch, and the sketch diffusion module employs a 3D face model to\ngenerate face sketch from text description. Experiments on the FFHQ dataset\nshow the effectiveness of our method. We will release our dataset and code for\nfuture research.\n","authors":["Hengfei Wang","Zhongqun Zhang","Yihua Cheng","Hyung Jin Chang"],"pdf_url":"https://arxiv.org/pdf/2404.17486v2.pdf","comment":"ACM MM2024"},{"id":"http://arxiv.org/abs/2308.09012v2","updated":"2024-09-09T14:42:59Z","published":"2023-08-17T14:30:26Z","title":"FashionLOGO: Prompting Multimodal Large Language Models for Fashion Logo\n Embeddings","summary":" Logo embedding models convert the product logos in images into vectors,\nenabling their utilization for logo recognition and detection within e-commerce\nplatforms. This facilitates the enforcement of intellectual property rights and\nenhances product search capabilities. However, current methods treat logo\nembedding as a purely visual problem. A noteworthy issue is that visual models\ncapture features more than logos. Instead, we view this as a multimodal task,\nusing text as auxiliary information to facilitate the visual model's\nunderstanding of the logo. The emerging Multimodal Large Language Models\n(MLLMs) have demonstrated remarkable capabilities in both visual and textual\nunderstanding. Inspired by this, we propose an approach, \\textbf{FashionLOGO},\nto explore how to prompt MLLMs to generate appropriate text for product images,\nwhich can help visual models achieve better logo embeddings. We adopt a\ncross-attention transformer block that enables visual embedding to\nautomatically learn supplementary knowledge from textual embedding. 
Our\nextensive experiments on real-world datasets prove that FashionLOGO is capable\nof generating generic and robust logo embeddings, achieving state-of-the-art\nperformance in all benchmarks.\n","authors":["Zhen Wang","Da Li","Yulin Su","Min Yang","Minghui Qiu","Walton Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09012v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05666v1","updated":"2024-09-09T14:37:33Z","published":"2024-09-09T14:37:33Z","title":"Robust Real-time Segmentation of Bio-Morphological Features in Human\n Cherenkov Imaging during Radiotherapy via Deep Learning","summary":" Cherenkov imaging enables real-time visualization of megavoltage X-ray or\nelectron beam delivery to the patient during Radiation Therapy (RT).\nBio-morphological features, such as vasculature, seen in these images are\npatient-specific signatures that can be used for verification of positioning\nand motion management that are essential to precise RT treatment. However until\nnow, no concerted analysis of this biological feature-based tracking was\nutilized because of the slow speed and accuracy of conventional image\nprocessing for feature segmentation. This study demonstrated the first deep\nlearning framework for such an application, achieving video frame rate\nprocessing. To address the challenge of limited annotation of these features in\nCherenkov images, a transfer learning strategy was applied. A fundus\nphotography dataset including 20,529 patch retina images with ground-truth\nvessel annotation was used to pre-train a ResNet segmentation framework.\nSubsequently, a small Cherenkov dataset (1,483 images from 212 treatment\nfractions of 19 breast cancer patients) with known annotated vasculature masks\nwas used to fine-tune the model for accurate segmentation prediction. This deep\nlearning framework achieved consistent and rapid segmentation of\nCherenkov-imaged bio-morphological features on another 19 patients, including\nsubcutaneous veins, scars, and pigmented skin. Average segmentation by the\nmodel achieved Dice score of 0.85 and required less than 0.7 milliseconds\nprocessing time per instance. The model demonstrated outstanding consistency\nagainst input image variances and speed compared to conventional manual\nsegmentation methods, laying the foundation for online segmentation in\nreal-time monitoring in a prospective setting.\n","authors":["Shiru Wang","Yao Chen","Lesley A. Jarvis","Yucheng Tang","David J. Gladstone","Kimberley S. Samkoe","Brian W. Pogue","Petr Bruza","Rongxiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05666v1.pdf","comment":"9 pages, 7 figures, 1 table, journal under review"},{"id":"http://arxiv.org/abs/2409.05662v1","updated":"2024-09-09T14:35:23Z","published":"2024-09-09T14:35:23Z","title":"Real-Time Human Action Recognition on Embedded Platforms","summary":" With advancements in computer vision and deep learning, video-based human\naction recognition (HAR) has become practical. However, due to the complexity\nof the computation pipeline, running HAR on live video streams incurs excessive\ndelays on embedded platforms. 
This work tackles the real-time performance\nchallenges of HAR with four contributions: 1) an experimental study identifying\na standard Optical Flow (OF) extraction technique as the latency bottleneck in\na state-of-the-art HAR pipeline, 2) an exploration of the latency-accuracy\ntradeoff between the standard and deep learning approaches to OF extraction,\nwhich highlights the need for a novel, efficient motion feature extractor, 3)\nthe design of Integrated Motion Feature Extractor (IMFE), a novel single-shot\nneural network architecture for motion feature extraction with drastic\nimprovement in latency, 4) the development of RT-HARE, a real-time HAR system\ntailored for embedded platforms. Experimental results on an Nvidia Jetson\nXavier NX platform demonstrated that RT-HARE realizes real-time HAR at a video\nframe rate of 30 frames per second while delivering high levels of recognition\naccuracy.\n","authors":["Ruiqi Wang","Zichen Wang","Peiqi Gao","Mingzhen Li","Jaehwan Jeong","Yihang Xu","Yejin Lee","Lisa Connor","Chenyang Lu"],"pdf_url":"https://arxiv.org/pdf/2409.05662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17793v3","updated":"2024-09-09T14:21:50Z","published":"2024-04-27T06:18:23Z","title":"CLFT: Camera-LiDAR Fusion Transformer for Semantic Segmentation in\n Autonomous Driving","summary":" Critical research about camera-and-LiDAR-based semantic object segmentation\nfor autonomous driving significantly benefited from the recent development of\ndeep learning. Specifically, the vision transformer is the novel ground-breaker\nthat successfully brought the multi-head-attention mechanism to computer vision\napplications. Therefore, we propose a vision-transformer-based network to carry\nout camera-LiDAR fusion for semantic segmentation applied to autonomous\ndriving. Our proposal uses the novel progressive-assemble strategy of vision\ntransformers on a double-direction network and then integrates the results in a\ncross-fusion strategy over the transformer decoder layers. Unlike other works\nin the literature, our camera-LiDAR fusion transformers have been evaluated in\nchallenging conditions like rain and low illumination, showing robust\nperformance. The paper reports the segmentation results over the vehicle and\nhuman classes in different modalities: camera-only, LiDAR-only, and\ncamera-LiDAR fusion. We perform coherent controlled benchmark experiments of\nCLFT against other networks that are also designed for semantic segmentation.\nThe experiments aim to evaluate the performance of CLFT independently from two\nperspectives: multimodal sensor fusion and backbone architectures. The\nquantitative assessments show our CLFT networks yield an improvement of up to\n10% for challenging dark-wet conditions when comparing with\nFully-Convolutional-Neural-Network-based (FCN) camera-LiDAR fusion neural\nnetwork. Contrasting to the network with transformer backbone but using single\nmodality input, the all-around improvement is 5-10%.\n","authors":["Junyi Gu","Mauro Bellone","Tomáš Pivoňka","Raivo Sell"],"pdf_url":"https://arxiv.org/pdf/2404.17793v3.pdf","comment":"Accepted to IEEE Transactions on Intelligent Vehicles"},{"id":"http://arxiv.org/abs/2409.05650v1","updated":"2024-09-09T14:16:27Z","published":"2024-09-09T14:16:27Z","title":"Replay Consolidation with Label Propagation for Continual Object\n Detection","summary":" Object Detection is a highly relevant computer vision problem with many\napplications such as robotics and autonomous driving. 
Continual Learning~(CL)\nconsiders a setting where a model incrementally learns new information while\nretaining previously acquired knowledge. This is particularly challenging since\nDeep Learning models tend to catastrophically forget old knowledge while\ntraining on new data. In particular, Continual Learning for Object\nDetection~(CLOD) poses additional difficulties compared to CL for\nClassification. In CLOD, images from previous tasks may contain unknown classes\nthat could reappear labeled in future tasks. These missing annotations cause\ntask interference issues for replay-based approaches. As a result, most works\nin the literature have focused on distillation-based approaches. However, these\napproaches are effective only when there is a strong overlap of classes across\ntasks. To address the issues of current methodologies, we propose a novel\ntechnique to solve CLOD called Replay Consolidation with Label Propagation for\nObject Detection (RCLPOD). Based on the replay method, our solution avoids task\ninterference issues by enhancing the buffer memory samples. Our method is\nevaluated against existing techniques in CLOD literature, demonstrating its\nsuperior performance on established benchmarks like VOC and COCO.\n","authors":["Riccardo De Monte","Davide Dalle Pezze","Marina Ceccon","Francesco Pasti","Francesco Paissan","Elisabetta Farella","Gian Antonio Susto","Nicola Bellotto"],"pdf_url":"https://arxiv.org/pdf/2409.05650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05642v1","updated":"2024-09-09T14:12:23Z","published":"2024-09-09T14:12:23Z","title":"Prototype-Driven Multi-Feature Generation for Visible-Infrared Person\n Re-identification","summary":" The primary challenges in visible-infrared person re-identification arise\nfrom the differences between visible (vis) and infrared (ir) images, including\ninter-modal and intra-modal variations. These challenges are further\ncomplicated by varying viewpoints and irregular movements. Existing methods\noften rely on horizontal partitioning to align part-level features, which can\nintroduce inaccuracies and have limited effectiveness in reducing modality\ndiscrepancies. In this paper, we propose a novel Prototype-Driven Multi-feature\ngeneration framework (PDM) aimed at mitigating cross-modal discrepancies by\nconstructing diversified features and mining latent semantically similar\nfeatures for modal alignment. PDM comprises two key components: Multi-Feature\nGeneration Module (MFGM) and Prototype Learning Module (PLM). The MFGM\ngenerates diversity features closely distributed from modality-shared features\nto represent pedestrians. Additionally, the PLM utilizes learnable prototypes\nto excavate latent semantic similarities among local features between visible\nand infrared modalities, thereby facilitating cross-modal instance-level\nalignment. We introduce the cosine heterogeneity loss to enhance prototype\ndiversity for extracting rich local features. Extensive experiments conducted\non the SYSU-MM01 and LLCM datasets demonstrate that our approach achieves\nstate-of-the-art performance. 
Our codes are available at\nhttps://github.com/mmunhappy/ICASSP2025-PDM.\n","authors":["Jiarui Li","Zhen Qiu","Yilin Yang","Yuqi Li","Zeyu Dong","Chuanguang Yang"],"pdf_url":"https://arxiv.org/pdf/2409.05642v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2409.05636v1","updated":"2024-09-09T14:07:38Z","published":"2024-09-09T14:07:38Z","title":"3D-SAR Tomography and Machine Learning for High-Resolution Tree Height\n Estimation","summary":" Accurately estimating forest biomass is crucial for global carbon cycle\nmodelling and climate change mitigation. Tree height, a key factor in biomass\ncalculations, can be measured using Synthetic Aperture Radar (SAR) technology.\nThis study applies machine learning to extract forest height data from two SAR\nproducts: Single Look Complex (SLC) images and tomographic cubes, in\npreparation for the ESA Biomass Satellite mission. We use the TomoSense\ndataset, containing SAR and LiDAR data from Germany's Eifel National Park, to\ndevelop and evaluate height estimation models. Our approach includes classical\nmethods, deep learning with a 3D U-Net, and Bayesian-optimized techniques. By\ntesting various SAR frequencies and polarimetries, we establish a baseline for\nfuture height and biomass modelling. Best-performing models predict forest\nheight to be within 2.82m mean absolute error for canopies around 30m,\nadvancing our ability to measure global carbon stocks and support climate\naction.\n","authors":["Grace Colverd","Jumpei Takami","Laura Schade","Karol Bot","Joseph A. Gallego-Mejia"],"pdf_url":"https://arxiv.org/pdf/2409.05636v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05624v1","updated":"2024-09-09T13:56:22Z","published":"2024-09-09T13:56:22Z","title":"Renormalized Connection for Scale-preferred Object Detection in\n Satellite Imagery","summary":" Satellite imagery, due to its long-range imaging, brings with it a variety of\nscale-preferred tasks, such as the detection of tiny/small objects, making the\nprecise localization and detection of small objects of interest a challenging\ntask. In this article, we design a Knowledge Discovery Network (KDN) to\nimplement the renormalization group theory in terms of efficient feature\nextraction. Renormalized connection (RC) on the KDN enables ``synergistic\nfocusing'' of multi-scale features. Based on our observations of KDN, we\nabstract a class of RCs with different connection strengths, called n21C, and\ngeneralize it to FPN-based multi-branch detectors. In a series of FPN\nexperiments on the scale-preferred tasks, we found that the\n``divide-and-conquer'' idea of FPN severely hampers the detector's learning in\nthe right direction due to the large number of large-scale negative samples and\ninterference from background noise. Moreover, these negative samples cannot be\neliminated by the focal loss function. The RCs extends the multi-level\nfeature's ``divide-and-conquer'' mechanism of the FPN-based detectors to a wide\nrange of scale-preferred tasks, and enables synergistic effects of multi-level\nfeatures on the specific learning goal. In addition, interference activations\nin two aspects are greatly reduced and the detector learns in a more correct\ndirection. Extensive experiments of 17 well-designed detection architectures\nembedded with n21s on five different levels of scale-preferred tasks validate\nthe effectiveness and efficiency of the RCs. Especially the simplest linear\nform of RC, E421C performs well in all tasks and it satisfies the scaling\nproperty of RGT. 
We hope that our approach will transfer a large number of\nwell-designed detectors from the computer vision community to the remote\nsensing community.\n","authors":["Fan Zhang","Lingling Li","Licheng Jiao","Xu Liu","Fang Liu","Shuyuan Yang","Biao Hou"],"pdf_url":"https://arxiv.org/pdf/2409.05624v1.pdf","comment":"24 pages, 14 figures Journal"},{"id":"http://arxiv.org/abs/2409.05617v1","updated":"2024-09-09T13:52:58Z","published":"2024-09-09T13:52:58Z","title":"G-NeLF: Memory- and Data-Efficient Hybrid Neural Light Field for Novel\n View Synthesis","summary":" Following the burgeoning interest in implicit neural representation, Neural\nLight Field (NeLF) has been introduced to predict the color of a ray directly.\nUnlike Neural Radiance Field (NeRF), NeLF does not create a point-wise\nrepresentation by predicting color and volume density for each point in space.\nHowever, the current NeLF methods face a challenge as they need to train a NeRF\nmodel first and then synthesize over 10K views to train NeLF for improved\nperformance. Additionally, the rendering quality of NeLF methods is lower\ncompared to NeRF methods. In this paper, we propose G-NeLF, a versatile\ngrid-based NeLF approach that utilizes spatial-aware features to unleash the\npotential of the neural network's inference capability, and consequently\novercome the difficulties of NeLF training. Specifically, we employ a\nspatial-aware feature sequence derived from a meticulously crafted grid as the\nray's representation. Drawing from our empirical studies on the adaptability of\nmulti-resolution hash tables, we introduce a novel grid-based ray\nrepresentation for NeLF that can represent the entire space with a very limited\nnumber of parameters. To better utilize the sequence feature, we design a\nlightweight ray color decoder that simulates the ray propagation process,\nenabling a more efficient inference of the ray's color. G-NeLF can be trained\nwithout necessitating significant storage overhead and with the model size of\nonly 0.95 MB to surpass previous state-of-the-art NeLF. Moreover, compared with\ngrid-based NeRF methods, e.g., Instant-NGP, we only utilize one-tenth of its\nparameters to achieve higher performance. Our code will be released upon\nacceptance.\n","authors":["Lutao Jiang","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.05617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01199v2","updated":"2024-09-09T13:49:53Z","published":"2024-09-02T12:20:42Z","title":"OD-VAE: An Omni-dimensional Video Compressor for Improving Latent Video\n Diffusion Model","summary":" Variational Autoencoder (VAE), compressing videos into latent\nrepresentations, is a crucial preceding component of Latent Video Diffusion\nModels (LVDMs). With the same reconstruction quality, the more sufficient the\nVAE's compression for videos is, the more efficient the LVDMs are. However,\nmost LVDMs utilize 2D image VAE, whose compression for videos is only in the\nspatial dimension and often ignored in the temporal dimension. How to conduct\ntemporal compression for videos in a VAE to obtain more concise latent\nrepresentations while promising accurate reconstruction is seldom explored. To\nfill this gap, we propose an omni-dimension compression VAE, named OD-VAE,\nwhich can temporally and spatially compress videos. Although OD-VAE's more\nsufficient compression brings a great challenge to video reconstruction, it can\nstill achieve high reconstructed accuracy by our fine design. 
To obtain a\nbetter trade-off between video reconstruction quality and compression speed,\nfour variants of OD-VAE are introduced and analyzed. In addition, a novel tail\ninitialization is designed to train OD-VAE more efficiently, and a novel\ninference strategy is proposed to enable OD-VAE to handle videos of arbitrary\nlength with limited GPU memory. Comprehensive experiments on video\nreconstruction and LVDM-based video generation demonstrate the effectiveness\nand efficiency of our proposed methods.\n","authors":["Liuhan Chen","Zongjian Li","Bin Lin","Bin Zhu","Qian Wang","Shenghai Yuan","Xing Zhou","Xinhua Cheng","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2409.01199v2.pdf","comment":"https://github.com/PKU-YuanGroup/Open-Sora-Plan"},{"id":"http://arxiv.org/abs/2409.05611v1","updated":"2024-09-09T13:49:09Z","published":"2024-09-09T13:49:09Z","title":"Adapted-MoE: Mixture of Experts with Test-Time Adaption for Anomaly\n Detection","summary":" Most unsupervised anomaly detection methods based on representations of\nnormal samples to distinguish anomalies have recently made remarkable progress.\nHowever, existing methods only learn a single decision boundary for\ndistinguishing the samples within the training dataset, neglecting the\nvariation in feature distribution for normal samples even in the same category\nin the real world. Furthermore, it was not considered that a distribution bias\nstill exists between the test set and the train set. Therefore, we propose an\nAdapted-MoE which contains a routing network and a series of expert models to\nhandle multiple distributions of same-category samples by divide and conquer.\nSpecifically, we propose a routing network based on representation learning to\nroute same-category samples into the subclasses feature space. Then, a series\nof expert models are utilized to learn the representation of various normal\nsamples and construct several independent decision boundaries. We propose the\ntest-time adaption to eliminate the bias between the unseen test sample\nrepresentation and the feature distribution learned by the expert model. Our\nexperiments are conducted on a dataset that provides multiple subclasses from\nthree categories, namely Texture AD benchmark. The Adapted-MoE significantly\nimproves the performance of the baseline model, achieving 2.18%-7.20% and\n1.57%-16.30% increase in I-AUROC and P-AUROC, which outperforms the current\nstate-of-the-art methods. Our code is available at https://github.com/.\n","authors":["Tianwu Lei","Silin Chen","Bohan Wang","Zhengkai Jiang","Ningmu Zou"],"pdf_url":"https://arxiv.org/pdf/2409.05611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05606v1","updated":"2024-09-09T13:39:47Z","published":"2024-09-09T13:39:47Z","title":"CustomContrast: A Multilevel Contrastive Perspective For Subject-Driven\n Text-to-Image Customization","summary":" Subject-driven text-to-image (T2I) customization has drawn significant\ninterest in academia and industry. This task enables pre-trained models to\ngenerate novel images based on unique subjects. Existing studies adopt a\nself-reconstructive perspective, focusing on capturing all details of a single\nimage, which will misconstrue the specific image's irrelevant attributes (e.g.,\nview, pose, and background) as the subject intrinsic attributes. 
This\nmisconstruction leads to both overfitting or underfitting of irrelevant and\nintrinsic attributes of the subject, i.e., these attributes are\nover-represented or under-represented simultaneously, causing a trade-off\nbetween similarity and controllability. In this study, we argue an ideal\nsubject representation can be achieved by a cross-differential perspective,\ni.e., decoupling subject intrinsic attributes from irrelevant attributes via\ncontrastive learning, which allows the model to focus more on intrinsic\nattributes through intra-consistency (features of the same subject are\nspatially closer) and inter-distinctiveness (features of different subjects\nhave distinguished differences). Specifically, we propose CustomContrast, a\nnovel framework, which includes a Multilevel Contrastive Learning (MCL)\nparadigm and a Multimodal Feature Injection (MFI) Encoder. The MCL paradigm is\nused to extract intrinsic features of subjects from high-level semantics to\nlow-level appearance through crossmodal semantic contrastive learning and\nmultiscale appearance contrastive learning. To facilitate contrastive learning,\nwe introduce the MFI encoder to capture cross-modal representations. Extensive\nexperiments show the effectiveness of CustomContrast in subject similarity and\ntext controllability.\n","authors":["Nan Chen","Mengqi Huang","Zhuowei Chen","Yang Zheng","Lei Zhang","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2409.05606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05595v1","updated":"2024-09-09T13:29:53Z","published":"2024-09-09T13:29:53Z","title":"SynMorph: Generating Synthetic Face Morphing Dataset with Mated Samples","summary":" Face morphing attack detection (MAD) algorithms have become essential to\novercome the vulnerability of face recognition systems. To solve the lack of\nlarge-scale and public-available datasets due to privacy concerns and\nrestrictions, in this work we propose a new method to generate a synthetic face\nmorphing dataset with 2450 identities and more than 100k morphs. The proposed\nsynthetic face morphing dataset is unique for its high-quality samples,\ndifferent types of morphing algorithms, and the generalization for both single\nand differential morphing attack detection algorithms. For experiments, we\napply face image quality assessment and vulnerability analysis to evaluate the\nproposed synthetic face morphing dataset from the perspective of biometric\nsample quality and morphing attack potential on face recognition systems. The\nresults are benchmarked with an existing SOTA synthetic dataset and a\nrepresentative non-synthetic and indicate improvement compared with the SOTA.\nAdditionally, we design different protocols and study the applicability of\nusing the proposed synthetic dataset on training morphing attack detection\nalgorithms.\n","authors":["Haoyu Zhang","Raghavendra Ramachandra","Kiran Raja","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2409.05595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05587v1","updated":"2024-09-09T13:16:15Z","published":"2024-09-09T13:16:15Z","title":"DSDFormer: An Innovative Transformer-Mamba Framework for Robust\n High-Precision Driver Distraction Identification","summary":" Driver distraction remains a leading cause of traffic accidents, posing a\ncritical threat to road safety globally. As intelligent transportation systems\nevolve, accurate and real-time identification of driver distraction has become\nessential. 
However, existing methods struggle to capture both global contextual\nand fine-grained local features while contending with noisy labels in training\ndatasets. To address these challenges, we propose DSDFormer, a novel framework\nthat integrates the strengths of Transformer and Mamba architectures through a\nDual State Domain Attention (DSDA) mechanism, enabling a balance between\nlong-range dependencies and detailed feature extraction for robust driver\nbehavior recognition. Additionally, we introduce Temporal Reasoning Confident\nLearning (TRCL), an unsupervised approach that refines noisy labels by\nleveraging spatiotemporal correlations in video sequences. Our model achieves\nstate-of-the-art performance on the AUC-V1, AUC-V2, and 100-Driver datasets and\ndemonstrates real-time processing efficiency on the NVIDIA Jetson AGX Orin\nplatform. Extensive experimental results confirm that DSDFormer and TRCL\nsignificantly improve both the accuracy and robustness of driver distraction\ndetection, offering a scalable solution to enhance road safety.\n","authors":["Junzhou Chen","Zirui Zhang","Jing Yu","Heqiang Huang","Ronghui Zhang","Xuemiao Xu","Bin Sheng","Hong Yan"],"pdf_url":"https://arxiv.org/pdf/2409.05587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04388v2","updated":"2024-09-09T13:15:41Z","published":"2024-09-06T16:27:52Z","title":"Question-Answering Dense Video Events","summary":" Multimodal Large Language Models (MLLMs) have shown excellent performance in\nquestion-answering of single-event videos. In this paper, we present\nquestion-answering dense video events, a novel task that requires answering and\ngrounding the dense-event questions in long videos, thus challenging MLLMs to\nfaithfully comprehend and reason about multiple events occurring over extended\ntime periods. To facilitate the study, we construct DeVE-QA - a dataset\nfeaturing 78K questions about 26K events on 10.6K long videos. We then\nbenchmark and show that existing MLLMs excelling at single-event QA struggle to\nperform well in DeVE-QA. For improvement, we propose DeVi, a novel\ntraining-free MLLM approach that highlights a hierarchical captioning module, a\ntemporal event memory module, and a self-consistency checking module to\nrespectively detect, contextualize and memorize, and ground dense-events in\nlong videos for question answering. Extensive experiments show that DeVi is\nsuperior at answering dense-event questions and grounding relevant video\nmoments. Compared with existing MLLMs, it achieves a remarkable increase of 4.1\npercent and 3.7 percent for G(round)QA accuracy on DeVE-QA and NExT-GQA\nrespectively.\n","authors":["Hangyu Qin","Junbin Xiao","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2409.04388v2.pdf","comment":"Submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2409.05585v1","updated":"2024-09-09T13:15:03Z","published":"2024-09-09T13:15:03Z","title":"Latent 3D Brain MRI Counterfactual","summary":" The number of samples in structural brain MRI studies is often too small to\nproperly train deep learning models. Generative models show promise in\naddressing this issue by effectively learning the data distribution and\ngenerating high-fidelity MRI. However, they struggle to produce diverse,\nhigh-quality data outside the distribution defined by the training data. One\nway to address the issue is using causal models developed for 3D volume\ncounterfactuals. 
However, accurately modeling causality in high-dimensional\nspaces is a challenge so that these models generally generate 3D brain MRIS of\nlower quality. To address these challenges, we propose a two-stage method that\nconstructs a Structural Causal Model (SCM) within the latent space. In the\nfirst stage, we employ a VQ-VAE to learn a compact embedding of the MRI volume.\nSubsequently, we integrate our causal model into this latent space and execute\na three-step counterfactual procedure using a closed-form Generalized Linear\nModel (GLM). Our experiments conducted on real-world high-resolution MRI data\n(1mm) demonstrate that our method can generate high-quality 3D MRI\ncounterfactuals.\n","authors":["Wei Peng","Tian Xia","Fabio De Sousa Ribeiro","Tomas Bosschieter","Ehsan Adeli","Qingyu Zhao","Ben Glocker","Kilian M. Pohl"],"pdf_url":"https://arxiv.org/pdf/2409.05585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05564v1","updated":"2024-09-09T12:43:25Z","published":"2024-09-09T12:43:25Z","title":"LEROjD: Lidar Extended Radar-Only Object Detection","summary":" Accurate 3D object detection is vital for automated driving. While lidar\nsensors are well suited for this task, they are expensive and have limitations\nin adverse weather conditions. 3+1D imaging radar sensors offer a\ncost-effective, robust alternative but face challenges due to their low\nresolution and high measurement noise. Existing 3+1D imaging radar datasets\ninclude radar and lidar data, enabling cross-modal model improvements. Although\nlidar should not be used during inference, it can aid the training of\nradar-only object detectors. We explore two strategies to transfer knowledge\nfrom the lidar to the radar domain and radar-only object detectors: 1.\nmulti-stage training with sequential lidar point cloud thin-out, and 2.\ncross-modal knowledge distillation. In the multi-stage process, three thin-out\nmethods are examined. Our results show significant performance gains of up to\n4.2 percentage points in mean Average Precision with multi-stage training and\nup to 3.9 percentage points with knowledge distillation by initializing the\nstudent with the teacher's weights. The main benefit of these approaches is\ntheir applicability to other 3D object detection networks without altering\ntheir architecture, as we show by analyzing it on two different object\ndetectors. Our code is available at https://github.com/rst-tu-dortmund/lerojd\n","authors":["Patrick Palmer","Martin Krüger","Stefan Schütte","Richard Altendorfer","Ganesh Adam","Torsten Bertram"],"pdf_url":"https://arxiv.org/pdf/2409.05564v1.pdf","comment":"Accepted for publication as ECCV 2024"},{"id":"http://arxiv.org/abs/2401.11944v3","updated":"2024-09-09T12:38:11Z","published":"2024-01-22T13:34:34Z","title":"CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding\n Benchmark","summary":" As the capabilities of large multimodal models (LMMs) continue to advance,\nevaluating the performance of LMMs emerges as an increasing need. Additionally,\nthere is an even larger gap in evaluating the advanced knowledge and reasoning\nabilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU,\na new Chinese Massive Multi-discipline Multimodal Understanding benchmark\ndesigned to evaluate LMMs on tasks demanding college-level subject knowledge\nand deliberate reasoning in a Chinese context. CMMMU is inspired by and\nstrictly follows the annotation and analysis pattern of MMMU. 
CMMMU includes\n12k manually collected multimodal questions from college exams, quizzes, and\ntextbooks, covering six core disciplines: Art & Design, Business, Science,\nHealth & Medicine, Humanities & Social Science, and Tech & Engineering, like\nits companion, MMMU. These questions span 30 subjects and comprise 39 highly\nheterogeneous image types, such as charts, diagrams, maps, tables, music\nsheets, and chemical structures. CMMMU focuses on complex perception and\nreasoning with domain-specific knowledge in the Chinese context. We evaluate 11\nopen-source LLMs and one proprietary GPT-4V(ision). Even GPT-4V only achieves\naccuracies of 42%, indicating a large space for improvement. CMMMU will boost\nthe community to build the next-generation LMMs towards expert artificial\nintelligence and promote the democratization of LMMs by providing diverse\nlanguage contexts.\n","authors":["Ge Zhang","Xinrun Du","Bei Chen","Yiming Liang","Tongxu Luo","Tianyu Zheng","Kang Zhu","Yuyang Cheng","Chunpu Xu","Shuyue Guo","Haoran Zhang","Xingwei Qu","Junjie Wang","Ruibin Yuan","Yizhi Li","Zekun Wang","Yudong Liu","Yu-Hsuan Tsai","Fengji Zhang","Chenghua Lin","Wenhao Huang","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2401.11944v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05558v1","updated":"2024-09-09T12:29:53Z","published":"2024-09-09T12:29:53Z","title":"Seeing Through the Mask: Rethinking Adversarial Examples for CAPTCHAs","summary":" Modern CAPTCHAs rely heavily on vision tasks that are supposedly hard for\ncomputers but easy for humans. However, advances in image recognition models\npose a significant threat to such CAPTCHAs. These models can easily be fooled\nby generating some well-hidden \"random\" noise and adding it to the image, or\nhiding objects in the image. However, these methods are model-specific and thus\ncan not aid CAPTCHAs in fooling all models. We show in this work that by\nallowing for more significant changes to the images while preserving the\nsemantic information and keeping it solvable by humans, we can fool many\nstate-of-the-art models. Specifically, we demonstrate that by adding masks of\nvarious intensities the Accuracy @ 1 (Acc@1) drops by more than 50%-points for\nall models, and supposedly robust models such as vision transformers see an\nAcc@1 drop of 80%-points.\n These masks can therefore effectively fool modern image classifiers, thus\nshowing that machines have not caught up with humans -- yet.\n","authors":["Yahya Jabary","Andreas Plesner","Turlan Kuzhagaliyev","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2409.05558v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2408.03632v3","updated":"2024-09-09T12:26:04Z","published":"2024-08-07T08:43:58Z","title":"Concept Conductor: Orchestrating Multiple Personalized Concepts in\n Text-to-Image Synthesis","summary":" The customization of text-to-image models has seen significant advancements,\nyet generating multiple personalized concepts remains a challenging task.\nCurrent methods struggle with attribute leakage and layout confusion when\nhandling multiple concepts, leading to reduced concept fidelity and semantic\nconsistency. In this work, we introduce a novel training-free framework,\nConcept Conductor, designed to ensure visual fidelity and correct layout in\nmulti-concept customization. 
Concept Conductor isolates the sampling processes\nof multiple custom models to prevent attribute leakage between different\nconcepts and corrects erroneous layouts through self-attention-based spatial\nguidance. Additionally, we present a concept injection technique that employs\nshape-aware masks to specify the generation area for each concept. This\ntechnique injects the structure and appearance of personalized concepts through\nfeature fusion in the attention layers, ensuring harmony in the final image.\nExtensive qualitative and quantitative experiments demonstrate that Concept\nConductor can consistently generate composite images with accurate layouts\nwhile preserving the visual details of each concept. Compared to existing\nbaselines, Concept Conductor shows significant performance improvements. Our\nmethod supports the combination of any number of concepts and maintains high\nfidelity even when dealing with visually similar concepts. The code and models\nare available at https://github.com/Nihukat/Concept-Conductor.\n","authors":["Zebin Yao","Fangxiang Feng","Ruifan Li","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03632v3.pdf","comment":"Github Page: https://github.com/Nihukat/Concept-Conductor"},{"id":"http://arxiv.org/abs/2409.05552v1","updated":"2024-09-09T12:17:38Z","published":"2024-09-09T12:17:38Z","title":"Seeing is Believing? Enhancing Vision-Language Navigation using Visual\n Perturbations","summary":" Autonomous navigation for an embodied agent guided by natural language\ninstructions remains a formidable challenge in vision-and-language navigation\n(VLN). Despite remarkable recent progress in learning fine-grained and\nmultifarious visual representations, the tendency to overfit to the training\nenvironments leads to unsatisfactory generalization performance. In this work,\nwe present a versatile Multi-Branch Architecture (MBA) aimed at exploring and\nexploiting diverse visual inputs. Specifically, we introduce three distinct\nvisual variants: ground-truth depth images, visual inputs integrated with\nincongruent views, and those infused with random noise to enrich the diversity\nof visual input representation and prevent overfitting to the original RGB\nobservations. To adaptively fuse these varied inputs, the proposed MBA extend a\nbase agent model into a multi-branch variant, where each branch processes a\ndifferent visual input. Surprisingly, even random noise can further enhance\nnavigation performance in unseen environments. Extensive experiments conducted\non three VLN benchmarks (R2R, REVERIE, SOON) demonstrate that our proposed\nmethod equals or even surpasses state-of-the-art results. The source code will\nbe publicly available.\n","authors":["Xuesong Zhang","Jia Li","Yunbo Xu","Zhenzhen Hu","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2409.05552v1.pdf","comment":"5 pages, 2 figures, submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2406.02202v2","updated":"2024-09-09T12:04:27Z","published":"2024-06-04T10:57:59Z","title":"No Captions, No Problem: Captionless 3D-CLIP Alignment with Hard\n Negatives via CLIP Knowledge and LLMs","summary":" In this study, we explore an alternative approach to enhance contrastive\ntext-image-3D alignment in the absence of textual descriptions for 3D objects.\nWe introduce two unsupervised methods, $I2I$ and $(I2L)^2$, which leverage CLIP\nknowledge about textual and 2D data to compute the neural perceived similarity\nbetween two 3D samples. 
We employ the proposed methods to mine 3D hard\nnegatives, establishing a multimodal contrastive pipeline with hard negative\nweighting via a custom loss function. We train on different configurations of\nthe proposed hard negative mining approach, and we evaluate the accuracy of our\nmodels in 3D classification and on the cross-modal retrieval benchmark, testing\nimage-to-shape and shape-to-image retrieval. Results demonstrate that our\napproach, even without explicit text alignment, achieves comparable or superior\nperformance on zero-shot and standard 3D classification, while significantly\nimproving both image-to-shape and shape-to-image retrieval compared to previous\nmethods.\n","authors":["Cristian Sbrolli","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2406.02202v2.pdf","comment":"to be published in BMVC 2024 Proceedings"},{"id":"http://arxiv.org/abs/2409.05540v1","updated":"2024-09-09T12:00:17Z","published":"2024-09-09T12:00:17Z","title":"Exploring Rich Subjective Quality Information for Image Quality\n Assessment in the Wild","summary":" Traditional in the wild image quality assessment (IQA) models are generally\ntrained with the quality labels of mean opinion score (MOS), while missing the\nrich subjective quality information contained in the quality ratings, for\nexample, the standard deviation of opinion scores (SOS) or even distribution of\nopinion scores (DOS). In this paper, we propose a novel IQA method named\nRichIQA to explore the rich subjective rating information beyond MOS to predict\nimage quality in the wild. RichIQA is characterized by two key novel designs:\n(1) a three-stage image quality prediction network which exploits the powerful\nfeature representation capability of the Convolutional vision Transformer (CvT)\nand mimics the short-term and long-term memory mechanisms of human brain; (2) a\nmulti-label training strategy in which rich subjective quality information like\nMOS, SOS and DOS are concurrently used to train the quality prediction network.\nPowered by these two novel designs, RichIQA is able to predict the image\nquality in terms of a distribution, from which the mean image quality can be\nsubsequently obtained. Extensive experimental results verify that the\nthree-stage network is tailored to predict rich quality information, while the\nmulti-label training strategy can fully exploit the potentials within\nsubjective quality rating and enhance the prediction performance and\ngeneralizability of the network. RichIQA outperforms state-of-the-art\ncompetitors on multiple large-scale in the wild IQA databases with rich\nsubjective rating labels. The code of RichIQA will be made publicly available\non GitHub.\n","authors":["Xiongkuo Min","Yixuan Gao","Yuqin Cao","Guangtao Zhai","Wenjun Zhang","Huifang Sun","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2409.05540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17815v2","updated":"2024-09-09T11:51:10Z","published":"2024-06-25T05:54:07Z","title":"SUM: Saliency Unification through Mamba for Visual Attention Modeling","summary":" Visual attention modeling, important for interpreting and prioritizing visual\nstimuli, plays a significant role in applications such as marketing,\nmultimedia, and robotics. Traditional saliency prediction models, especially\nthose based on Convolutional Neural Networks (CNNs) or Transformers, achieve\nnotable success by leveraging large-scale annotated datasets. 
However, the\ncurrent state-of-the-art (SOTA) models that use Transformers are\ncomputationally expensive. Additionally, separate models are often required for\neach image type, lacking a unified approach. In this paper, we propose Saliency\nUnification through Mamba (SUM), a novel approach that integrates the efficient\nlong-range dependency modeling of Mamba with U-Net to provide a unified model\nfor diverse image types. Using a novel Conditional Visual State Space (C-VSS)\nblock, SUM dynamically adapts to various image types, including natural scenes,\nweb pages, and commercial imagery, ensuring universal applicability across\ndifferent data types. Our comprehensive evaluations across five benchmarks\ndemonstrate that SUM seamlessly adapts to different visual characteristics and\nconsistently outperforms existing models. These results position SUM as a\nversatile and powerful tool for advancing visual attention modeling, offering a\nrobust solution universally applicable across different types of visual\ncontent.\n","authors":["Alireza Hosseini","Amirhossein Kazerouni","Saeed Akhavan","Michael Brudno","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2406.17815v2.pdf","comment":"Accepted at IEEE/CVF WACV 2025"},{"id":"http://arxiv.org/abs/2409.05531v1","updated":"2024-09-09T11:43:35Z","published":"2024-09-09T11:43:35Z","title":"HMAFlow: Learning More Accurate Optical Flow via Hierarchical Motion\n Field Alignment","summary":" Optical flow estimation is a fundamental and long-standing visual task. In\nthis work, we present a novel method, dubbed HMAFlow, to improve optical flow\nestimation in these tough scenes, especially with small objects. The proposed\nmodel mainly consists of two core components: a Hierarchical Motion Field\nAlignment (HMA) module and a Correlation Self-Attention (CSA) module. In\naddition, we rebuild 4D cost volumes by employing a Multi-Scale Correlation\nSearch (MCS) layer and replacing average pooling in common cost volumes with an\nsearch strategy using multiple search ranges. Experimental results demonstrate\nthat our model achieves the best generalization performance in comparison to\nother state-of-the-art methods. Specifically, compared with RAFT, our method\nachieves relative error reductions of 14.2% and 3.4% on the clean pass and\nfinal pass of the Sintel online benchmark, respectively. On the KITTI test\nbenchmark, HMAFlow surpasses RAFT and GMA in the Fl-all metric by a relative\nmargin of 6.8% and 7.7%, respectively. To facilitate future research, our code\nwill be made available at https://github.com/BooTurbo/HMAFlow.\n","authors":["Dianbo Ma","Kousuke Imamura","Ziyan Gao","Xiangjie Wang","Satoshi Yamane"],"pdf_url":"https://arxiv.org/pdf/2409.05531v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2305.07825v2","updated":"2024-09-09T11:08:17Z","published":"2023-05-13T02:46:41Z","title":"Student Classroom Behavior Detection based on YOLOv7-BRA and Multi-Model\n Fusion","summary":" Accurately detecting student behavior in classroom videos can aid in\nanalyzing their classroom performance and improving teaching effectiveness.\nHowever, the current accuracy rate in behavior detection is low. To address\nthis challenge, we propose the Student Classroom Behavior Detection system\nbased on based on YOLOv7-BRA (YOLOv7 with Bi-level Routing Attention ). We\nidentified eight different behavior patterns, including standing, sitting,\nspeaking, listening, walking, raising hands, reading, and writing. 
We\nconstructed a dataset, which contained 11,248 labels and 4,001 images, with an\nemphasis on the common behavior of raising hands in a classroom setting\n(Student Classroom Behavior dataset, SCB-Dataset). To improve detection\naccuracy, we added the biformer attention module to the YOLOv7 network.\nFinally, we fused the results from YOLOv7 CrowdHuman, SlowFast, and DeepSort\nmodels to obtain student classroom behavior data. We conducted experiments on\nthe SCB-Dataset, and YOLOv7-BRA achieved an mAP@0.5 of 87.1%, resulting in a\n2.2% improvement over previous results. Our SCB-dataset can be downloaded from:\nhttps://github.com/Whiffe/SCB-datase\n","authors":["Fan Yang","Tao Wang","Xiaofei Wang"],"pdf_url":"https://arxiv.org/pdf/2305.07825v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2304.02488"},{"id":"http://arxiv.org/abs/2306.03318v2","updated":"2024-09-09T11:04:35Z","published":"2023-06-06T00:01:40Z","title":"Student Classroom Behavior Detection based on Improved YOLOv7","summary":" Accurately detecting student behavior in classroom videos can aid in\nanalyzing their classroom performance and improving teaching effectiveness.\nHowever, the current accuracy rate in behavior detection is low. To address\nthis challenge, we propose the Student Classroom Behavior Detection method,\nbased on improved YOLOv7. First, we created the Student Classroom Behavior\ndataset (SCB-Dataset), which includes 18.4k labels and 4.2k images, covering\nthree behaviors: hand raising, reading, and writing. To improve detection\naccuracy in crowded scenes, we integrated the biformer attention module and\nWise-IoU into the YOLOv7 network. Finally, experiments were conducted on the\nSCB-Dataset, and the model achieved an mAP@0.5 of 79%, resulting in a 1.8%\nimprovement over previous results. The SCB-Dataset and code are available for\ndownload at: https://github.com/Whiffe/SCB-dataset.\n","authors":["Fan Yang"],"pdf_url":"https://arxiv.org/pdf/2306.03318v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2305.07825"},{"id":"http://arxiv.org/abs/2310.02522v2","updated":"2024-09-09T11:01:25Z","published":"2023-10-04T01:43:46Z","title":"SCB-Dataset3: A Benchmark for Detecting Student Classroom Behavior","summary":" The use of deep learning methods to automatically detect students' classroom\nbehavior is a promising approach for analyzing their class performance and\nimproving teaching effectiveness. However, the lack of publicly available\ndatasets on student behavior poses a challenge for researchers in this field.\nTo address this issue, we propose the Student Classroom Behavior dataset\n(SCB-dataset3), which represents real-life scenarios. Our dataset comprises\n5686 images with 45578 labels, focusing on six behaviors: hand-raising,\nreading, writing, using a phone, bowing the head, and leaning over the table.\nWe evaluated the dataset using the YOLOv5, YOLOv7, and YOLOv8 algorithms,\nachieving a mean average precision (map) of up to 80.3$\\%$. We believe that our\ndataset can serve as a robust foundation for future research in student\nbehavior detection and contribute to advancements in this field. 
Our\nSCB-dataset3 is available for download at:\nhttps://github.com/Whiffe/SCB-dataset\n","authors":["Fan Yang","Tao Wang"],"pdf_url":"https://arxiv.org/pdf/2310.02522v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.02488,\n arXiv:2306.03318"},{"id":"http://arxiv.org/abs/2310.02523v4","updated":"2024-09-09T10:57:46Z","published":"2023-10-04T01:47:36Z","title":"A Spatio-Temporal Attention-Based Method for Detecting Student Classroom\n Behaviors","summary":" Accurately detecting student behavior from classroom videos is beneficial for\nanalyzing their classroom status and improving teaching efficiency. However,\nlow accuracy in student classroom behavior detection is a prevalent issue. To\naddress this issue, we propose a Spatio-Temporal Attention-Based Method for\nDetecting Student Classroom Behaviors (BDSTA). Firstly, the SlowFast network is\nused to generate motion and environmental information feature maps from the\nvideo. Then, the spatio-temporal attention module is applied to the feature\nmaps, including information aggregation, compression and stimulation processes.\nSubsequently, attention maps in the time, channel and space dimensions are\nobtained, and multi-label behavior classification is performed based on these\nattention maps. To solve the long-tail data problem that exists in student\nclassroom behavior datasets, we use an improved focal loss function to assign\nmore weight to the tail class data during training. Experimental results are\nconducted on a self-made student classroom behavior dataset named STSCB.\nCompared with the SlowFast model, the average accuracy of student behavior\nclassification detection improves by 8.94\\% using BDSTA.\n","authors":["Fan Yang"],"pdf_url":"https://arxiv.org/pdf/2310.02523v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16267v4","updated":"2024-09-09T10:52:54Z","published":"2023-10-25T00:46:26Z","title":"Student Classroom Behavior Detection based on Spatio-Temporal Network\n and Multi-Model Fusion","summary":" Using deep learning methods to detect students' classroom behavior\nautomatically is a promising approach for analyzing their class performance and\nimproving teaching effectiveness. However, the lack of publicly available\nspatio-temporal datasets on student behavior, as well as the high cost of\nmanually labeling such datasets, pose significant challenges for researchers in\nthis field. To address this issue, we proposed a method for extending the\nspatio-temporal behavior dataset in Student Classroom Scenarios\n(SCB-ST-Dataset4) through image dataset. Our SCB-ST-Dataset4 comprises 757265\nimages with 25810 labels, focusing on 3 behaviors: hand-raising, reading,\nwriting. Our proposed method can rapidly generate spatio-temporal behavior\ndatasets without requiring extra manual labeling. Furthermore, we proposed a\nBehavior Similarity Index (BSI) to explore the similarity of behaviors. We\nevaluated the dataset using the YOLOv5, YOLOv7, YOLOv8, and SlowFast\nalgorithms, achieving a mean average precision (map) of up to 82.3%. Last, we\nfused multiple models to generate student behavior-related data from various\nperspectives. The experiment further demonstrates the effectiveness of our\nmethod. And SCB-ST-Dataset4 provides a robust foundation for future research in\nstudent behavior detection, potentially contributing to advancements in this\nfield. 
The SCB-ST-Dataset4 is available for download at:\nhttps://github.com/Whiffe/SCB-dataset.\n","authors":["Fan Yang","Xiaofei Wang"],"pdf_url":"https://arxiv.org/pdf/2310.16267v4.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2310.02522;\n text overlap with arXiv:2306.03318"},{"id":"http://arxiv.org/abs/2409.05494v1","updated":"2024-09-09T10:47:39Z","published":"2024-09-09T10:47:39Z","title":"An Atmospheric Correction Integrated LULC Segmentation Model for\n High-Resolution Satellite Imagery","summary":" The integration of fine-scale multispectral imagery with deep learning models\nhas revolutionized land use and land cover (LULC) classification. However, the\natmospheric effects present in Top-of-Atmosphere sensor measured Digital Number\nvalues must be corrected to retrieve accurate Bottom-of-Atmosphere surface\nreflectance for reliable analysis. This study employs look-up-table-based\nradiative transfer simulations to estimate the atmospheric path reflectance and\ntransmittance for atmospherically correcting high-resolution CARTOSAT-3\nMultispectral (MX) imagery for several Indian cities. The corrected surface\nreflectance data were subsequently used in supervised and semi-supervised\nsegmentation models, demonstrating stability in multi-class (buildings, roads,\ntrees and water bodies) LULC segmentation accuracy, particularly in scenarios\nwith sparsely labelled data.\n","authors":["Soham Mukherjee","Yash Dixit","Naman Srivastava","Joel D Joy","Rohan Olikara","Koesha Sinha","Swarup E","Rakshit Ramesh"],"pdf_url":"https://arxiv.org/pdf/2409.05494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13330v2","updated":"2024-09-09T10:42:58Z","published":"2023-12-20T17:44:32Z","title":"SOVC: Subject-Oriented Video Captioning","summary":" Describing video content according to users' needs is a long-held goal.\nAlthough existing video captioning methods have made significant progress, the\ngenerated captions may not focus on the entity that users are particularly\ninterested in. To address this problem, we propose a new video captioning task,\nSubject-Oriented Video Captioning (SOVC), which aims to allow users to specify\nthe describing target via a bounding box. To support this task, we construct\ntwo subject-oriented video captioning datasets based on two widely used video\ncaptioning datasets: MSVD and MSRVTT, by annotating subjects in each video for\neach caption. These datasets pave the way for describing users' interested\ntargets. To tackle this task, we introduce a method tailored to this task,\nnamed SOVCNet. It consists of two key components: a subject-oriented sampling\nmodule that samples frames related to the subject to minimize irrelevant\ninformation; and a subject-oriented encoding module that utilizes the subject\nareas as hard prompts and integrates learnable soft prompts, enhancing the\nmodel's focus on the subject's activities and facilitating adaptation to the\ndownstream generation task. 
Extensive experimental results demonstrate the\neffectiveness of our method on this new task.\n","authors":["Chang Teng","Yunchuan Ma","Guorong Li","Yuankai Qi","Laiyu Qing","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2312.13330v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09964v2","updated":"2024-09-09T10:41:31Z","published":"2024-03-15T02:05:20Z","title":"Boundary Constraint-free Biomechanical Model-Based Surface Matching for\n Intraoperative Liver Deformation Correction","summary":" In image-guided liver surgery, 3D-3D non-rigid registration methods play a\ncrucial role in estimating the mapping between the preoperative model and the\nintraoperative surface represented as point clouds, addressing the challenge of\ntissue deformation. Typically, these methods incorporate a biomechanical model,\nrepresented as a finite element model (FEM), used to regularize a surface\nmatching term. This paper introduces a novel 3D-3D non-rigid registration\nmethod. In contrast to the preceding techniques, our method uniquely\nincorporates the FEM within the surface matching term itself, ensuring that the\nestimated deformation maintains geometric consistency throughout the\nregistration process. Additionally, we eliminate the need to determine\nzero-boundary conditions and applied force locations in the FEM. We achieve\nthis by integrating soft springs into the stiffness matrix and allowing forces\nto be distributed across the entire liver surface. To further improve\nrobustness, we introduce a regularization technique focused on the gradient of\nthe force magnitudes. This regularization imposes spatial smoothness and helps\nprevent the overfitting of irregular noise in intraoperative data. Optimization\nis achieved through an accelerated proximal gradient algorithm, further\nenhanced by our proposed method for determining the optimal step size. Our\nmethod is evaluated and compared to both a learning-based method and a\ntraditional method that features FEM regularization using data collected on our\ncustom-developed phantom, as well as two publicly available datasets. Our\nmethod consistently outperforms or is comparable to the baseline techniques.\nBoth the code and dataset will be made publicly available.\n","authors":["Zixin Yang","Richard Simon","Kelly Merrell","Cristian. A. Linte"],"pdf_url":"https://arxiv.org/pdf/2403.09964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05490v1","updated":"2024-09-09T10:36:19Z","published":"2024-09-09T10:36:19Z","title":"A Taxonomy of Miscompressions: Preparing Image Forensics for Neural\n Compression","summary":" Neural compression has the potential to revolutionize lossy image\ncompression. Based on generative models, recent schemes achieve unprecedented\ncompression rates at high perceptual quality but compromise semantic fidelity.\nDetails of decompressed images may appear optically flawless but semantically\ndifferent from the originals, making compression errors difficult or impossible\nto detect. We explore the problem space and propose a provisional taxonomy of\nmiscompressions. It defines three types of 'what happens' and has a binary\n'high impact' flag indicating miscompressions that alter symbols. 
We discuss\nhow the taxonomy can facilitate risk communication and research into\nmitigations.\n","authors":["Nora Hofer","Rainer Böhme"],"pdf_url":"https://arxiv.org/pdf/2409.05490v1.pdf","comment":"6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.05474v1","updated":"2024-09-09T10:06:34Z","published":"2024-09-09T10:06:34Z","title":"PVP-Recon: Progressive View Planning via Warping Consistency for\n Sparse-View Surface Reconstruction","summary":" Neural implicit representations have revolutionized dense multi-view surface\nreconstruction, yet their performance significantly diminishes with sparse\ninput views. A few pioneering works have sought to tackle the challenge of\nsparse-view reconstruction by leveraging additional geometric priors or\nmulti-scene generalizability. However, they are still hindered by the imperfect\nchoice of input views, using images under empirically determined viewpoints to\nprovide considerable overlap. We propose PVP-Recon, a novel and effective\nsparse-view surface reconstruction method that progressively plans the next\nbest views to form an optimal set of sparse viewpoints for image capturing.\nPVP-Recon starts initial surface reconstruction with as few as 3 views and\nprogressively adds new views which are determined based on a novel warping\nscore that reflects the information gain of each newly added view. This\nprogressive view planning progress is interleaved with a neural SDF-based\nreconstruction module that utilizes multi-resolution hash features, enhanced by\na progressive training scheme and a directional Hessian loss. Quantitative and\nqualitative experiments on three benchmark datasets show that our framework\nachieves high-quality reconstruction with a constrained input budget and\noutperforms existing baselines.\n","authors":["Sheng Ye","Yuze He","Matthieu Lin","Jenny Sheng","Ruoyu Fan","Yiheng Han","Yubin Hu","Ran Yi","Yu-Hui Wen","Yong-Jin Liu","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2409.05474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05466v1","updated":"2024-09-09T09:48:27Z","published":"2024-09-09T09:48:27Z","title":"Proto-OOD: Enhancing OOD Object Detection with Prototype Feature\n Similarity","summary":" The limited training samples for object detectors commonly result in low\naccuracy out-of-distribution (OOD) object detection. We have observed that\nfeature vectors of the same class tend to cluster tightly in feature space,\nwhereas those of different classes are more scattered. This insight motivates\nus to leverage feature similarity for OOD detection. Drawing on the concept of\nprototypes prevalent in few-shot learning, we introduce a novel network\narchitecture, Proto-OOD, designed for this purpose. Proto-OOD enhances\nprototype representativeness through contrastive loss and identifies OOD data\nby assessing the similarity between input features and prototypes. It employs a\nnegative embedding generator to create negative embedding, which are then used\nto train the similarity module. Proto-OOD achieves significantly lower FPR95 in\nMS-COCO dataset and higher mAP for Pascal VOC dataset, when utilizing Pascal\nVOC as ID dataset and MS-COCO as OOD dataset. 
Additionally, we identify\nlimitations in existing evaluation metrics and propose an enhanced evaluation\nprotocol.\n","authors":["Junkun Chen","Jilin Mei","Liang Chen","Fangzhou Zhao","Yu Hu"],"pdf_url":"https://arxiv.org/pdf/2409.05466v1.pdf","comment":"14pages"},{"id":"http://arxiv.org/abs/2409.05463v1","updated":"2024-09-09T09:43:17Z","published":"2024-09-09T09:43:17Z","title":"DriveScape: Towards High-Resolution Controllable Multi-View Driving\n Video Generation","summary":" Recent advancements in generative models have provided promising solutions\nfor synthesizing realistic driving videos, which are crucial for training\nautonomous driving perception models. However, existing approaches often\nstruggle with multi-view video generation due to the challenges of integrating\n3D information while maintaining spatial-temporal consistency and effectively\nlearning from a unified model. In this paper, we propose an end-to-end\nframework named DriveScape for multi-view, 3D condition-guided video\ngeneration. DriveScape not only streamlines the process by integrating camera\ndata to ensure comprehensive spatial-temporal coverage, but also introduces a\nBi-Directional Modulated Transformer module to effectively align 3D road\nstructural information. As a result, our approach enables precise control over\nvideo generation, significantly enhancing realism and providing a robust\nsolution for generating multi-view driving videos. Our framework achieves\nstate-of-the-art results on the nuScenes dataset, demonstrating impressive\ngenerative quality metrics with an FID score of 8.34 and an FVD score of 76.39,\nas well as superior performance across various perception tasks. This paves the\nway for more accurate environmental simulations in autonomous driving. Code\nwill be available at our project homepage.\n","authors":["Wei Wu","Xi Guo","Weixuan Tang","Tingxuan Huang","Chiyu Wang","Dongyue Chen","Chenjing Ding"],"pdf_url":"https://arxiv.org/pdf/2409.05463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02919v3","updated":"2024-09-09T09:11:28Z","published":"2024-09-04T17:58:08Z","title":"HiPrompt: Tuning-free Higher-Resolution Generation with Hierarchical\n MLLM Prompts","summary":" The potential for higher-resolution image generation using pretrained\ndiffusion models is immense, yet these models often struggle with issues of\nobject repetition and structural artifacts especially when scaling to 4K\nresolution and higher. We figure out that the problem is caused by that, a\nsingle prompt for the generation of multiple scales provides insufficient\nefficacy. In response, we propose HiPrompt, a new tuning-free solution that\ntackles the above problems by introducing hierarchical prompts. The\nhierarchical prompts offer both global and local guidance. Specifically, the\nglobal guidance comes from the user input that describes the overall content,\nwhile the local guidance utilizes patch-wise descriptions from MLLMs to\nelaborately guide the regional structure and texture generation. Furthermore,\nduring the inverse denoising process, the generated noise is decomposed into\nlow- and high-frequency spatial components. These components are conditioned on\nmultiple prompt levels, including detailed patch-wise descriptions and broader\nimage-level prompts, facilitating prompt-guided denoising under hierarchical\nsemantic guidance. 
It further allows the generation to focus more on local\nspatial regions and ensures the generated images maintain coherent local and\nglobal semantics, structures, and textures with high definition. Extensive\nexperiments demonstrate that HiPrompt outperforms state-of-the-art works in\nhigher-resolution image generation, significantly reducing object repetition\nand enhancing structural quality.\n","authors":["Xinyu Liu","Yingqing He","Lanqing Guo","Xiang Li","Bu Jin","Peng Li","Yan Li","Chi-Min Chan","Qifeng Chen","Wei Xue","Wenhan Luo","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2409.02919v3.pdf","comment":"https://liuxinyv.github.io/HiPrompt/"},{"id":"http://arxiv.org/abs/2409.05442v1","updated":"2024-09-09T08:46:45Z","published":"2024-09-09T08:46:45Z","title":"EndoOmni: Zero-Shot Cross-Dataset Depth Estimation in Endoscopy by\n Robust Self-Learning from Noisy Labels","summary":" Single-image depth estimation is essential for endoscopy tasks such as\nlocalization, reconstruction, and augmented reality. Most existing methods in\nsurgical scenes focus on in-domain depth estimation, limiting their real-world\napplicability. This constraint stems from the scarcity and inferior labeling\nquality of medical data for training. In this work, we present EndoOmni, the\nfirst foundation model for zero-shot cross-domain depth estimation for\nendoscopy. To harness the potential of diverse training data, we refine the\nadvanced self-learning paradigm that employs a teacher model to generate\npseudo-labels, guiding a student model trained on large-scale labeled and\nunlabeled data. To address training disturbance caused by inherent noise in\ndepth labels, we propose a robust training framework that leverages both depth\nlabels and estimated confidence from the teacher model to jointly guide the\nstudent model training. Moreover, we propose a weighted scale-and-shift\ninvariant loss to adaptively adjust learning weights based on label confidence,\nthus imposing learning bias towards cleaner label pixels while reducing the\ninfluence of highly noisy pixels. Experiments on zero-shot relative depth\nestimation show that our EndoOmni improves state-of-the-art methods in medical\nimaging for 41\\% and existing foundation models for 25\\% in terms of absolute\nrelative error on specific dataset. Furthermore, our model provides strong\ninitialization for fine-tuning to metric depth estimation, maintaining superior\nperformance in both in-domain and out-of-domain scenarios. The source code will\nbe publicly available.\n","authors":["Qingyao Tian","Zhen Chen","Huai Liao","Xinyan Huang","Lujie Li","Sebastien Ourselin","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2409.05442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13756v3","updated":"2024-09-09T08:43:09Z","published":"2023-07-25T18:28:19Z","title":"PlaneRecTR++: Unified Query Learning for Joint 3D Planar Reconstruction\n and Pose Estimation","summary":" 3D plane reconstruction from images can usually be divided into several\nsub-tasks of plane detection, segmentation, parameters regression and possibly\ndepth prediction for per-frame, along with plane correspondence and relative\ncamera pose estimation between frames. Previous works tend to divide and\nconquer these sub-tasks with distinct network modules, overall formulated by a\ntwo-stage paradigm. 
With an initial camera pose and per-frame plane predictions\nprovided from the first stage, exclusively designed modules, potentially\nrelying on extra plane correspondence labelling, are applied to merge\nmulti-view plane entities and produce 6DoF camera pose. As none of existing\nworks manage to integrate above closely related sub-tasks into a unified\nframework but treat them separately and sequentially, we suspect it potentially\nas a main source of performance limitation for existing approaches. Motivated\nby this finding and the success of query-based learning in enriching reasoning\namong semantic entities, in this paper, we propose PlaneRecTR++, a\nTransformer-based architecture, which for the first time unifies all sub-tasks\nrelated to multi-view reconstruction and pose estimation with a compact\nsingle-stage model, refraining from initial pose estimation and plane\ncorrespondence supervision. Extensive quantitative and qualitative experiments\ndemonstrate that our proposed unified learning achieves mutual benefits across\nsub-tasks, obtaining a new state-of-the-art performance on public ScanNetv1,\nScanNetv2, NYUv2-Plane, and MatterPort3D datasets.\n","authors":["Jingjia Shi","Shuaifeng Zhi","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2307.13756v3.pdf","comment":"Journal extension of our ICCV 2023 paper \"PlaneRecTR\", which expands\n from single view reconstruction to simultaneous multi-view reconstruction and\n camera pose estimation. Note that the ICCV23 PlaneRecTR paper could be found\n in the previous arxiv version [v2](arXiv:2307.13756v2)"},{"id":"http://arxiv.org/abs/2407.06064v2","updated":"2024-09-09T08:34:03Z","published":"2024-07-08T16:05:56Z","title":"Pan-denoising: Guided Hyperspectral Image Denoising via Weighted\n Represent Coefficient Total Variation","summary":" This paper introduces a novel paradigm for hyperspectral image (HSI)\ndenoising, which is termed \\textit{pan-denoising}. In a given scene,\npanchromatic (PAN) images capture similar structures and textures to HSIs but\nwith less noise. This enables the utilization of PAN images to guide the HSI\ndenoising process. Consequently, pan-denoising, which incorporates an\nadditional prior, has the potential to uncover underlying structures and\ndetails beyond the internal information modeling of traditional HSI denoising\nmethods. However, the proper modeling of this additional prior poses a\nsignificant challenge. To alleviate this issue, the paper proposes a novel\nregularization term, Panchromatic Weighted Representation Coefficient Total\nVariation (PWRCTV). It employs the gradient maps of PAN images to automatically\nassign different weights of TV regularization for each pixel, resulting in\nlarger weights for smooth areas and smaller weights for edges. This\nregularization forms the basis of a pan-denoising model, which is solved using\nthe Alternating Direction Method of Multipliers. Extensive experiments on\nsynthetic and real-world datasets demonstrate that PWRCTV outperforms several\nstate-of-the-art methods in terms of metrics and visual quality. Furthermore,\nan HSI classification experiment confirms that PWRCTV, as a preprocessing\nmethod, can enhance the performance of downstream classification tasks. 
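One simple way to realize gradient-driven per-pixel TV weighting of this kind is sketched below: weights stay near 1 in smooth regions of the PAN image and shrink at edges. The exact weighting function and normalization used by PWRCTV may differ; this is only an illustrative sketch.

```python
import numpy as np

def tv_weights_from_pan(pan, eps=1e-3):
    """Illustrative per-pixel TV weights derived from a panchromatic image.

    Smooth regions (small gradient magnitude) get weights near 1,
    edges (large gradient magnitude) get much smaller weights.
    """
    gy, gx = np.gradient(pan.astype(float))
    grad_mag = np.sqrt(gx ** 2 + gy ** 2)
    return 1.0 / (1.0 + grad_mag / (grad_mag.mean() + eps))

pan = np.zeros((64, 64))
pan[:, 32:] = 1.0                      # a single vertical edge
w = tv_weights_from_pan(pan)
print(w[10, 10], w[10, 31])            # ~1.0 in the flat area, much smaller at the edge
```

These weights would then multiply the per-pixel TV penalty inside the pan-denoising objective solved by ADMM.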
The\ncode and data are available at https://github.com/shuangxu96/PWRCTV.\n","authors":["Shuang Xu","Qiao Ke","Jiangjun Peng","Xiangyong Cao","Zixiang Zhao"],"pdf_url":"https://arxiv.org/pdf/2407.06064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15588v3","updated":"2024-09-09T08:28:43Z","published":"2023-07-28T14:43:27Z","title":"OAFuser: Towards Omni-Aperture Fusion for Light Field Semantic\n Segmentation","summary":" Light field cameras are capable of capturing intricate angular and spatial\ndetails. This allows for acquiring complex light patterns and details from\nmultiple angles, significantly enhancing the precision of image semantic\nsegmentation. However, two significant issues arise: (1) The extensive angular\ninformation of light field cameras contains a large amount of redundant data,\nwhich is overwhelming for the limited hardware resources of intelligent agents.\n(2) A relative displacement difference exists in the data collected by\ndifferent micro-lenses. To address these issues, we propose an Omni-Aperture\nFusion model (OAFuser) that leverages dense context from the central view and\nextracts the angular information from sub-aperture images to generate\nsemantically consistent results. To simultaneously streamline the redundant\ninformation from the light field cameras and avoid feature loss during network\npropagation, we present a simple yet very effective Sub-Aperture Fusion Module\n(SAFM). This module efficiently embeds sub-aperture images in angular features,\nallowing the network to process each sub-aperture image with a minimal\ncomputational demand of only (around 1GFlops). Furthermore, to address the\nmismatched spatial information across viewpoints, we present a Center Angular\nRectification Module (CARM) to realize feature resorting and prevent feature\nocclusion caused by misalignment. The proposed OAFuser achieves\nstate-of-the-art performance on four UrbanLF datasets in terms of all\nevaluation metrics and sets a new record of 84.93% in mIoU on the UrbanLF-Real\nExtended dataset, with a gain of +3.69%. The source code for OAFuser is\navailable at https://github.com/FeiBryantkit/OAFuser.\n","authors":["Fei Teng","Jiaming Zhang","Kunyu Peng","Yaonan Wang","Rainer Stiefelhagen","Kailun Yang"],"pdf_url":"https://arxiv.org/pdf/2307.15588v3.pdf","comment":"Accepted to IEEE Transactions on Artificial Intelligence (TAI). The\n source code is available at https://github.com/FeiBryantkit/OAFuser"},{"id":"http://arxiv.org/abs/2409.05427v1","updated":"2024-09-09T08:26:47Z","published":"2024-09-09T08:26:47Z","title":"TextToucher: Fine-Grained Text-to-Touch Generation","summary":" Tactile sensation plays a crucial role in the development of multi-modal\nlarge models and embodied intelligence. To collect tactile data with minimal\ncost as possible, a series of studies have attempted to generate tactile images\nby vision-to-touch image translation. However, compared to text modality,\nvisual modality-driven tactile generation cannot accurately depict human\ntactile sensation. In this work, we analyze the characteristics of tactile\nimages in detail from two granularities: object-level (tactile texture, tactile\nshape), and sensor-level (gel status). 
We model these granularities of\ninformation through text descriptions and propose a fine-grained Text-to-Touch\ngeneration method (TextToucher) to generate high-quality tactile samples.\nSpecifically, we introduce a multimodal large language model to build the text\nsentences about object-level tactile information and employ a set of learnable\ntext prompts to represent the sensor-level tactile information. To better guide\nthe tactile generation process with the built text information, we fuse the\ndual grains of text information and explore various dual-grain text\nconditioning methods within the diffusion transformer architecture.\nFurthermore, we propose a Contrastive Text-Touch Pre-training (CTTP) metric to\nprecisely evaluate the quality of text-driven generated tactile data. Extensive\nexperiments demonstrate the superiority of our TextToucher method. The source\ncodes will be available at \\url{https://github.com/TtuHamg/TextToucher}.\n","authors":["Jiahang Tu","Hao Fu","Fengyu Yang","Hanbin Zhao","Chao Zhang","Hui Qian"],"pdf_url":"https://arxiv.org/pdf/2409.05427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05425v1","updated":"2024-09-09T08:26:11Z","published":"2024-09-09T08:26:11Z","title":"Distribution Discrepancy and Feature Heterogeneity for Active 3D Object\n Detection","summary":" LiDAR-based 3D object detection is a critical technology for the development\nof autonomous driving and robotics. However, the high cost of data annotation\nlimits its advancement. We propose a novel and effective active learning (AL)\nmethod called Distribution Discrepancy and Feature Heterogeneity (DDFH), which\nsimultaneously considers geometric features and model embeddings, assessing\ninformation from both the instance-level and frame-level perspectives.\nDistribution Discrepancy evaluates the difference and novelty of instances\nwithin the unlabeled and labeled distributions, enabling the model to learn\nefficiently with limited data. Feature Heterogeneity ensures the heterogeneity\nof intra-frame instance features, maintaining feature diversity while avoiding\nredundant or similar instances, thus minimizing annotation costs. Finally,\nmultiple indicators are efficiently aggregated using Quantile Transform,\nproviding a unified measure of informativeness. Extensive experiments\ndemonstrate that DDFH outperforms the current state-of-the-art (SOTA) methods\non the KITTI and Waymo datasets, effectively reducing the bounding box\nannotation cost by 56.3% and showing robustness when working with both\none-stage and two-stage models.\n","authors":["Huang-Yu Chen","Jia-Fong Yeh","Jia-Wei Liao","Pin-Hsuan Peng","Winston H. Hsu"],"pdf_url":"https://arxiv.org/pdf/2409.05425v1.pdf","comment":"Accepted to CoRL 2024"},{"id":"http://arxiv.org/abs/2409.05420v1","updated":"2024-09-09T08:21:17Z","published":"2024-09-09T08:21:17Z","title":"AD-Net: Attention-based dilated convolutional residual network with\n guided decoder for robust skin lesion segmentation","summary":" In computer-aided diagnosis tools employed for skin cancer treatment and\nearly diagnosis, skin lesion segmentation is important. However, achieving\nprecise segmentation is challenging due to inherent variations in appearance,\ncontrast, texture, and blurry lesion boundaries. This research presents a\nrobust approach utilizing a dilated convolutional residual network, which\nincorporates an attention-based spatial feature enhancement block (ASFEB) and\nemploys a guided decoder strategy. 
In each dilated convolutional residual\nblock, dilated convolution is employed to broaden the receptive field with\nvarying dilation rates. To improve the spatial feature information of the\nencoder, we employed an attention-based spatial feature enhancement block in\nthe skip connections. The ASFEB in our proposed method combines feature maps\nobtained from average and maximum-pooling operations. These combined features\nare then weighted using the active outcome of global average pooling and\nconvolution operations. Additionally, we have incorporated a guided decoder\nstrategy, where each decoder block is optimized using an individual loss\nfunction to enhance the feature learning process in the proposed AD-Net. The\nproposed AD-Net presents a significant benefit by necessitating fewer model\nparameters compared to its peer methods. This reduction in parameters directly\nimpacts the number of labeled data required for training, facilitating faster\nconvergence during the training process. The effectiveness of the proposed\nAD-Net was evaluated using four public benchmark datasets. We conducted a\nWilcoxon signed-rank test to verify the efficiency of the AD-Net. The outcomes\nsuggest that our method surpasses other cutting-edge methods in performance,\neven without the implementation of data augmentation strategies.\n","authors":["Asim Naveed","Syed S. Naqvi","Tariq M. Khan","Shahzaib Iqbal","M. Yaqoob Wani","Haroon Ahmed Khan"],"pdf_url":"https://arxiv.org/pdf/2409.05420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05414v1","updated":"2024-09-09T08:16:17Z","published":"2024-09-09T08:16:17Z","title":"CipherDM: Secure Three-Party Inference for Diffusion Model Sampling","summary":" Diffusion Models (DMs) achieve state-of-the-art synthesis results in image\ngeneration and have been applied to various fields. However, DMs sometimes\nseriously violate user privacy during usage, making the protection of privacy\nan urgent issue. Using traditional privacy computing schemes like Secure\nMulti-Party Computation (MPC) directly in DMs faces significant computation and\ncommunication challenges. To address these issues, we propose CipherDM, the\nfirst novel, versatile and universal framework applying MPC technology to DMs\nfor secure sampling, which can be widely implemented on multiple DM based\ntasks. We thoroughly analyze sampling latency breakdown, find time-consuming\nparts and design corresponding secure MPC protocols for computing nonlinear\nactivations including SoftMax, SiLU and Mish. CipherDM is evaluated on popular\narchitectures (DDPM, DDIM) using MNIST dataset and on SD deployed by diffusers.\nCompared to direct implementation on SPU, our approach improves running time by\napproximately 1.084\\times \\sim 2.328\\times, and reduces communication costs by\napproximately 1.212\\times \\sim 1.791\\times.\n","authors":["Xin Zhao","Xiaojun Chen","Xudong Chen","He Li","Tingyu Fan","Zhendong Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.05414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05413v1","updated":"2024-09-09T08:15:39Z","published":"2024-09-09T08:15:39Z","title":"From Words to Poses: Enhancing Novel Object Pose Estimation with Vision\n Language Models","summary":" Robots are increasingly envisioned to interact in real-world scenarios, where\nthey must continuously adapt to new situations. 
To detect and grasp novel\nobjects, zero-shot pose estimators determine poses without prior knowledge.\nRecently, vision language models (VLMs) have shown considerable advances in\nrobotics applications by establishing an understanding between language input\nand image input. In our work, we take advantage of VLMs zero-shot capabilities\nand translate this ability to 6D object pose estimation. We propose a novel\nframework for promptable zero-shot 6D object pose estimation using language\nembeddings. The idea is to derive a coarse location of an object based on the\nrelevancy map of a language-embedded NeRF reconstruction and to compute the\npose estimate with a point cloud registration method. Additionally, we provide\nan analysis of LERF's suitability for open-set object pose estimation. We\nexamine hyperparameters, such as activation thresholds for relevancy maps and\ninvestigate the zero-shot capabilities on an instance- and category-level.\nFurthermore, we plan to conduct robotic grasping experiments in a real-world\nsetting.\n","authors":["Tessa Pulli","Stefan Thalhammer","Simon Schwaiger","Markus Vincze"],"pdf_url":"https://arxiv.org/pdf/2409.05413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05407v1","updated":"2024-09-09T08:08:05Z","published":"2024-09-09T08:08:05Z","title":"KRONC: Keypoint-based Robust Camera Optimization for 3D Car\n Reconstruction","summary":" The three-dimensional representation of objects or scenes starting from a set\nof images has been a widely discussed topic for years and has gained additional\nattention after the diffusion of NeRF-based approaches. However, an\nunderestimated prerequisite is the knowledge of camera poses or, more\nspecifically, the estimation of the extrinsic calibration parameters. Although\nexcellent general-purpose Structure-from-Motion methods are available as a\npre-processing step, their computational load is high and they require a lot of\nframes to guarantee sufficient overlapping among the views. This paper\nintroduces KRONC, a novel approach aimed at inferring view poses by leveraging\nprior knowledge about the object to reconstruct and its representation through\nsemantic keypoints. With a focus on vehicle scenes, KRONC is able to estimate\nthe position of the views as a solution to a light optimization problem\ntargeting the convergence of keypoints' back-projections to a singular point.\nTo validate the method, a specific dataset of real-world car scenes has been\ncollected. Experiments confirm KRONC's ability to generate excellent estimates\nof camera poses starting from very coarse initialization. Results are\ncomparable with Structure-from-Motion methods with huge savings in computation.\nCode and data will be made publicly available.\n","authors":["Davide Di Nucci","Alessandro Simoni","Matteo Tomei","Luca Ciuffreda","Roberto Vezzani","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2409.05407v1.pdf","comment":"Accepted at ECCVW"},{"id":"http://arxiv.org/abs/2409.05405v1","updated":"2024-09-09T08:06:50Z","published":"2024-09-09T08:06:50Z","title":"A Survey of Multimodal Composite Editing and Retrieval","summary":" In the real world, where information is abundant and diverse across different\nmodalities, understanding and utilizing various data types to improve retrieval\nsystems is a key focus of research. Multimodal composite retrieval integrates\ndiverse modalities such as text, image and audio, etc. to provide more\naccurate, personalized, and contextually relevant results. 
To facilitate a\ndeeper understanding of this promising direction, this survey explores\nmultimodal composite editing and retrieval in depth, covering image-text\ncomposite editing, image-text composite retrieval, and other multimodal\ncomposite retrieval. In this survey, we systematically organize the application\nscenarios, methods, benchmarks, experiments, and future directions. Multimodal\nlearning is a hot topic in large model era, and have also witnessed some\nsurveys in multimodal learning and vision-language models with transformers\npublished in the PAMI journal. To the best of our knowledge, this survey is the\nfirst comprehensive review of the literature on multimodal composite retrieval,\nwhich is a timely complement of multimodal fusion to existing reviews. To help\nreaders' quickly track this field, we build the project page for this survey,\nwhich can be found at\nhttps://github.com/fuxianghuang1/Multimodal-Composite-Editing-and-Retrieval.\n","authors":["Suyan Li","Fuxiang Huang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05405v1.pdf","comment":"22 pages, 3 figures, and 11 tables"},{"id":"http://arxiv.org/abs/2409.05399v1","updated":"2024-09-09T07:55:59Z","published":"2024-09-09T07:55:59Z","title":"Sequential Posterior Sampling with Diffusion Models","summary":" Diffusion models have quickly risen in popularity for their ability to model\ncomplex distributions and perform effective posterior sampling. Unfortunately,\nthe iterative nature of these generative models makes them computationally\nexpensive and unsuitable for real-time sequential inverse problems such as\nultrasound imaging. Considering the strong temporal structure across sequences\nof frames, we propose a novel approach that models the transition dynamics to\nimprove the efficiency of sequential diffusion posterior sampling in\nconditional image synthesis. Through modeling sequence data using a video\nvision transformer (ViViT) transition model based on previous diffusion\noutputs, we can initialize the reverse diffusion trajectory at a lower noise\nscale, greatly reducing the number of iterations required for convergence. We\ndemonstrate the effectiveness of our approach on a real-world dataset of high\nframe rate cardiac ultrasound images and show that it achieves the same\nperformance as a full diffusion trajectory while accelerating inference\n25$\\times$, enabling real-time posterior sampling. Furthermore, we show that\nthe addition of a transition model improves the PSNR up to 8\\% in cases with\nsevere motion. Our method opens up new possibilities for real-time applications\nof diffusion models in imaging and other domains requiring real-time inference.\n","authors":["Tristan S. W. Stevens","Oisín Nolan","Jean-Luc Robert","Ruud J. G. van Sloun"],"pdf_url":"https://arxiv.org/pdf/2409.05399v1.pdf","comment":"5 pages, 4 figures, preprint"},{"id":"http://arxiv.org/abs/2409.05396v1","updated":"2024-09-09T07:49:13Z","published":"2024-09-09T07:49:13Z","title":"FacialFlowNet: Advancing Facial Optical Flow Estimation with a Diverse\n Dataset and a Decomposed Model","summary":" Facial movements play a crucial role in conveying altitude and intentions,\nand facial optical flow provides a dynamic and detailed representation of it.\nHowever, the scarcity of datasets and a modern baseline hinders the progress in\nfacial optical flow research. 
This paper proposes FacialFlowNet (FFN), a novel\nlarge-scale facial optical flow dataset, and the Decomposed Facial Flow Model\n(DecFlow), the first method capable of decomposing facial flow. FFN comprises\n9,635 identities and 105,970 image pairs, offering unprecedented diversity for\ndetailed facial and head motion analysis. DecFlow features a facial\nsemantic-aware encoder and a decomposed flow decoder, excelling in accurately\nestimating and decomposing facial flow into head and expression components.\nComprehensive experiments demonstrate that FFN significantly enhances the\naccuracy of facial flow estimation across various optical flow methods,\nachieving up to an 11% reduction in Endpoint Error (EPE) (from 3.91 to 3.48).\nMoreover, DecFlow, when coupled with FFN, outperforms existing methods in both\nsynthetic and real-world scenarios, enhancing facial expression analysis. The\ndecomposed expression flow achieves a substantial accuracy improvement of 18%\n(from 69.1% to 82.1%) in micro-expressions recognition. These contributions\nrepresent a significant advancement in facial motion analysis and optical flow\nestimation. Codes and datasets can be found.\n","authors":["Jianzhi Lu","Ruian He","Shili Zhou","Weimin Tan","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2409.05396v1.pdf","comment":"ACMMM2024"},{"id":"http://arxiv.org/abs/2409.05395v1","updated":"2024-09-09T07:49:09Z","published":"2024-09-09T07:49:09Z","title":"Shaking Up VLMs: Comparing Transformers and Structured State Space\n Models for Vision & Language Modeling","summary":" This study explores replacing Transformers in Visual Language Models (VLMs)\nwith Mamba, a recent structured state space model (SSM) that demonstrates\npromising performance in sequence modeling. We test models up to 3B parameters\nunder controlled conditions, showing that Mamba-based VLMs outperforms\nTransformers-based VLMs in captioning, question answering, and reading\ncomprehension. However, we find that Transformers achieve greater performance\nin visual grounding and the performance gap widens with scale. We explore two\nhypotheses to explain this phenomenon: 1) the effect of task-agnostic visual\nencoding on the updates of the hidden states, and 2) the difficulty in\nperforming visual grounding from the perspective of in-context multimodal\nretrieval. Our results indicate that a task-aware encoding yields minimal\nperformance gains on grounding, however, Transformers significantly outperform\nMamba at in-context multimodal retrieval. Overall, Mamba shows promising\nperformance on tasks where the correct output relies on a summary of the image\nbut struggles when retrieval of explicit information from the context is\nrequired.\n","authors":["Georgios Pantazopoulos","Malvina Nikandrou","Alessandro Suglia","Oliver Lemon","Arash Eshghi"],"pdf_url":"https://arxiv.org/pdf/2409.05395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05393v1","updated":"2024-09-09T07:43:58Z","published":"2024-09-09T07:43:58Z","title":"TAVP: Task-Adaptive Visual Prompt for Cross-domain Few-shot Segmentation","summary":" Under the backdrop of large-scale pre-training, large visual models (LVM)\nhave demonstrated significant potential in image understanding. The recent\nemergence of the Segment Anything Model (SAM) has brought a qualitative shift\nin the field of image segmentation, supporting flexible interactive cues and\nstrong learning capabilities. However, its performance often falls short in\ncross-domain and few-shot applications. 
Transferring prior knowledge from\nfoundation models to new applications while preserving learning capabilities is\nworth exploring. This work proposes a task-adaptive prompt framework based on\nSAM, a new paradigm for Cross-dominan few-shot segmentation (CD-FSS). First, a\nMulti-level Feature Fusion (MFF) was used for integrated feature extraction.\nBesides, an additional Class Domain Task-Adaptive Auto-Prompt (CDTAP) module\nwas combined with the segmentation branch for class-domain agnostic feature\nextraction and high-quality learnable prompt production. This significant\nadvancement uses a unique generative approach to prompts alongside a\ncomprehensive model structure and specialized prototype computation. While\nensuring that the prior knowledge of SAM is not discarded, the new branch\ndisentangles category and domain information through prototypes, guiding it in\nadapting the CD-FSS. We have achieved the best results on three benchmarks\ncompared to the recent state-of-the-art (SOTA) methods. Comprehensive\nexperiments showed that after task-specific and weighted guidance, the abundant\nfeature information of SAM can be better learned for CD-FSS.\n","authors":["Jiaqi Yang","Ye Huang","Xiangjian He","Linlin Shen","Guoping Qiu"],"pdf_url":"https://arxiv.org/pdf/2409.05393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05389v1","updated":"2024-09-09T07:34:08Z","published":"2024-09-09T07:34:08Z","title":"A Novel Representation of Periodic Pattern and Its Application to\n Untrained Anomaly Detection","summary":" There are a variety of industrial products that possess periodic textures or\nsurfaces, such as carbon fiber textiles and display panels. Traditional\nimage-based quality inspection methods for these products require identifying\nthe periodic patterns from normal images (without anomaly and noise) and\nsubsequently detecting anomaly pixels with inconsistent appearances. However,\nit remains challenging to accurately extract the periodic pattern from a single\nimage in the presence of unknown anomalies and measurement noise. To deal with\nthis challenge, this paper proposes a novel self-representation of the periodic\nimage defined on a set of continuous parameters. In this way, periodic pattern\nlearning can be embedded into a joint optimization framework, which is named\nperiodic-sparse decomposition, with simultaneously modeling the sparse\nanomalies and Gaussian noise. Finally, for the real-world industrial images\nthat may not strictly satisfy the periodic assumption, we propose a novel\npixel-level anomaly scoring strategy to enhance the performance of anomaly\ndetection. Both simulated and real-world case studies demonstrate the\neffectiveness of the proposed methodology for periodic pattern learning and\nanomaly detection.\n","authors":["Peng Ye","Chengyu Tao","Juan Du"],"pdf_url":"https://arxiv.org/pdf/2409.05389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05387v1","updated":"2024-09-09T07:33:14Z","published":"2024-09-09T07:33:14Z","title":"Decoupling Contact for Fine-Grained Motion Style Transfer","summary":" Motion style transfer changes the style of a motion while retaining its\ncontent and is useful in computer animations and games. Contact is an essential\ncomponent of motion style transfer that should be controlled explicitly in\norder to express the style vividly while enhancing motion naturalness and\nquality. However, it is unknown how to decouple and control contact to achieve\nfine-grained control in motion style transfer. 
In this paper, we present a\nnovel style transfer method for fine-grained control over contacts while\nachieving both motion naturalness and spatial-temporal variations of style.\nBased on our empirical evidence, we propose controlling contact indirectly\nthrough the hip velocity, which can be further decomposed into the trajectory\nand contact timing, respectively. To this end, we propose a new model that\nexplicitly models the correlations between motions and trajectory/contact\ntiming/style, allowing us to decouple and control each separately. Our approach\nis built around a motion manifold, where hip controls can be easily integrated\ninto a Transformer-based decoder. It is versatile in that it can generate\nmotions directly as well as be used as post-processing for existing methods to\nimprove quality and contact controllability. In addition, we propose a new\nmetric that measures a correlation pattern of motions based on our empirical\nevidence, aligning well with human perception in terms of motion naturalness.\nBased on extensive evaluation, our method outperforms existing methods in terms\nof style expressivity and motion quality.\n","authors":["Xiangjun Tang","Linjun Wu","He Wang","Yiqian Wu","Bo Hu","Songnan Li","Xu Gong","Yuchen Liao","Qilong Kou","Xiaogang Jin"],"pdf_url":"https://arxiv.org/pdf/2409.05387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05384v1","updated":"2024-09-09T07:32:18Z","published":"2024-09-09T07:32:18Z","title":"Look One and More: Distilling Hybrid Order Relational Knowledge for\n Cross-Resolution Image Recognition","summary":" In spite of great success in many image recognition tasks achieved by recent\ndeep models, directly applying them to recognize low-resolution images may\nsuffer from low accuracy due to the missing of informative details during\nresolution degradation. However, these images are still recognizable for\nsubjects who are familiar with the corresponding high-resolution ones. Inspired\nby that, we propose a teacher-student learning approach to facilitate\nlow-resolution image recognition via hybrid order relational knowledge\ndistillation. The approach refers to three streams: the teacher stream is\npretrained to recognize high-resolution images in high accuracy, the student\nstream is learned to identify low-resolution images by mimicking the teacher's\nbehaviors, and the extra assistant stream is introduced as bridge to help\nknowledge transfer across the teacher to the student. To extract sufficient\nknowledge for reducing the loss in accuracy, the learning of student is\nsupervised with multiple losses, which preserves the similarities in various\norder relational structures. In this way, the capability of recovering missing\ndetails of familiar low-resolution images can be effectively enhanced, leading\nto a better knowledge transfer. Extensive experiments on metric learning,\nlow-resolution image classification and low-resolution face recognition tasks\nshow the effectiveness of our approach, while taking reduced models.\n","authors":["Shiming Ge","Kangkai Zhang","Haolin Liu","Yingying Hua","Shengwei Zhao","Xin Jin","Hao Wen"],"pdf_url":"https://arxiv.org/pdf/2409.05384v1.pdf","comment":"Accepted by AAAI 2020"},{"id":"http://arxiv.org/abs/2409.05383v1","updated":"2024-09-09T07:31:16Z","published":"2024-09-09T07:31:16Z","title":"Deep Learning for Video Anomaly Detection: A Review","summary":" Video anomaly detection (VAD) aims to discover behaviors or events deviating\nfrom the normality in videos. 
As a long-standing task in the field of computer\nvision, VAD has witnessed much good progress. In the era of deep learning, with\nthe explosion of architectures of continuously growing capability and capacity,\na great variety of deep learning based methods are constantly emerging for the\nVAD task, greatly improving the generalization ability of detection algorithms\nand broadening the application scenarios. Therefore, such a multitude of\nmethods and a large body of literature make a comprehensive survey a pressing\nnecessity. In this paper, we present an extensive and comprehensive research\nreview, covering the spectrum of five different categories, namely,\nsemi-supervised, weakly supervised, fully supervised, unsupervised and open-set\nsupervised VAD, and we also delve into the latest VAD works based on\npre-trained large models, remedying the limitations of past reviews in terms of\nonly focusing on semi-supervised VAD and small model based methods. For the VAD\ntask with different levels of supervision, we construct a well-organized\ntaxonomy, profoundly discuss the characteristics of different types of methods,\nand show their performance comparisons. In addition, this review involves the\npublic datasets, open-source codes, and evaluation metrics covering all the\naforementioned VAD tasks. Finally, we provide several important research\ndirections for the VAD community.\n","authors":["Peng Wu","Chengyu Pan","Yuting Yan","Guansong Pang","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05383v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2409.05381v1","updated":"2024-09-09T07:26:21Z","published":"2024-09-09T07:26:21Z","title":"Boosting CLIP Adaptation for Image Quality Assessment via Meta-Prompt\n Learning and Gradient Regularization","summary":" Image Quality Assessment (IQA) remains an unresolved challenge in the field\nof computer vision, due to complex distortion conditions, diverse image\ncontent, and limited data availability. The existing Blind IQA (BIQA) methods\nheavily rely on extensive human annotations to train models, which is both\nlabor-intensive and costly due to the demanding nature of creating IQA\ndatasets. To mitigate the dependence on labeled samples, this paper introduces\na novel Gradient-Regulated Meta-Prompt IQA Framework (GRMP-IQA). This framework\naims to fast adapt the powerful visual-language pre-trained model, CLIP, to\ndownstream IQA tasks, significantly improving accuracy in scenarios with\nlimited data. Specifically, the GRMP-IQA comprises two key modules: Meta-Prompt\nPre-training Module and Quality-Aware Gradient Regularization. The Meta Prompt\nPre-training Module leverages a meta-learning paradigm to pre-train soft\nprompts with shared meta-knowledge across different distortions, enabling rapid\nadaptation to various IQA tasks. On the other hand, the Quality-Aware Gradient\nRegularization is designed to adjust the update gradients during fine-tuning,\nfocusing the model's attention on quality-relevant features and preventing\noverfitting to semantic information. Extensive experiments on five standard\nBIQA datasets demonstrate the superior performance to the state-of-the-art BIQA\nmethods under limited data setting, i.e., achieving SRCC values of 0.836 (vs.\n0.760 on LIVEC) and 0.853 (vs. 0.812 on KonIQ). 
Notably, utilizing just 20\\% of\nthe training data, our GRMP-IQA outperforms most existing fully supervised BIQA\nmethods.\n","authors":["Xudong Li","Zihao Huang","Runze Hu","Yan Zhang","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2409.05381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05380v1","updated":"2024-09-09T07:25:47Z","published":"2024-09-09T07:25:47Z","title":"Prim2Room: Layout-Controllable Room Mesh Generation from Primitives","summary":" We propose Prim2Room, a novel framework for controllable room mesh generation\nleveraging 2D layout conditions and 3D primitive retrieval to facilitate\nprecise 3D layout specification. Diverging from existing methods that lack\ncontrol and precision, our approach allows for detailed customization of\nroom-scale environments. To overcome the limitations of previous methods, we\nintroduce an adaptive viewpoint selection algorithm that allows the system to\ngenerate the furniture texture and geometry from more favorable views than\npredefined camera trajectories. Additionally, we employ non-rigid depth\nregistration to ensure alignment between generated objects and their\ncorresponding primitive while allowing for shape variations to maintain\ndiversity. Our method not only enhances the accuracy and aesthetic appeal of\ngenerated 3D scenes but also provides a user-friendly platform for detailed\nroom design.\n","authors":["Chengzeng Feng","Jiacheng Wei","Cheng Chen","Yang Li","Pan Ji","Fayao Liu","Hongdong Li","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2409.05380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07487v3","updated":"2024-09-09T07:23:36Z","published":"2024-06-11T17:27:23Z","title":"GLAD: Towards Better Reconstruction with Global and Local Adaptive\n Diffusion Models for Unsupervised Anomaly Detection","summary":" Diffusion models have shown superior performance on unsupervised anomaly\ndetection tasks. Since trained with normal data only, diffusion models tend to\nreconstruct normal counterparts of test images with certain noises added.\nHowever, these methods treat all potential anomalies equally, which may cause\ntwo main problems. From the global perspective, the difficulty of\nreconstructing images with different anomalies is uneven. Therefore, instead of\nutilizing the same setting for all samples, we propose to predict a particular\ndenoising step for each sample by evaluating the difference between image\ncontents and the priors extracted from diffusion models. From the local\nperspective, reconstructing abnormal regions differs from normal areas even in\nthe same image. Theoretically, the diffusion model predicts a noise for each\nstep, typically following a standard Gaussian distribution. However, due to the\ndifference between the anomaly and its potential normal counterpart, the\npredicted noise in abnormal regions will inevitably deviate from the standard\nGaussian distribution. To this end, we propose introducing synthetic abnormal\nsamples in training to encourage the diffusion models to break through the\nlimitation of standard Gaussian distribution, and a spatial-adaptive feature\nfusion scheme is utilized during inference. With the above modifications, we\npropose a global and local adaptive diffusion model (abbreviated to GLAD) for\nunsupervised anomaly detection, which introduces appealing flexibility and\nachieves anomaly-free reconstruction while retaining as much normal information\nas possible. 
Extensive experiments are conducted on three commonly used anomaly\ndetection datasets (MVTec-AD, MPDD, and VisA) and a printed circuit board\ndataset (PCB-Bank) we integrated, showing the effectiveness of the proposed\nmethod.\n","authors":["Hang Yao","Ming Liu","Haolin Wang","Zhicun Yin","Zifei Yan","Xiaopeng Hong","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2406.07487v3.pdf","comment":"Accepted by ECCV 2024, code and models:\n https://github.com/hyao1/GLAD. Due to the limitation \"The abstract field\n cannot be longer than 1,920 characters\", the abstract here is shorter than\n that in the PDF file"},{"id":"http://arxiv.org/abs/2409.05379v1","updated":"2024-09-09T07:23:28Z","published":"2024-09-09T07:23:28Z","title":"PersonaTalk: Bring Attention to Your Persona in Visual Dubbing","summary":" For audio-driven visual dubbing, it remains a considerable challenge to\nuphold and highlight speaker's persona while synthesizing accurate lip\nsynchronization. Existing methods fall short of capturing speaker's unique\nspeaking style or preserving facial details. In this paper, we present\nPersonaTalk, an attention-based two-stage framework, including geometry\nconstruction and face rendering, for high-fidelity and personalized visual\ndubbing. In the first stage, we propose a style-aware audio encoding module\nthat injects speaking style into audio features through a cross-attention\nlayer. The stylized audio features are then used to drive speaker's template\ngeometry to obtain lip-synced geometries. In the second stage, a dual-attention\nface renderer is introduced to render textures for the target geometries. It\nconsists of two parallel cross-attention layers, namely Lip-Attention and\nFace-Attention, which respectively sample textures from different reference\nframes to render the entire face. With our innovative design, intricate facial\ndetails can be well preserved. Comprehensive experiments and user studies\ndemonstrate our advantages over other state-of-the-art methods in terms of\nvisual quality, lip-sync accuracy and persona preservation. Furthermore, as a\nperson-generic framework, PersonaTalk can achieve competitive performance as\nstate-of-the-art person-specific methods. Project Page:\nhttps://grisoon.github.io/PersonaTalk/.\n","authors":["Longhao Zhang","Shuang Liang","Zhipeng Ge","Tianshu Hu"],"pdf_url":"https://arxiv.org/pdf/2409.05379v1.pdf","comment":"Accepted at SIGGRAPH Asia 2024 (Conference Track)"},{"id":"http://arxiv.org/abs/2311.17460v6","updated":"2024-09-09T07:19:08Z","published":"2023-11-29T09:02:07Z","title":"W-HMR: Monocular Human Mesh Recovery in World Space with Weak-Supervised\n Calibration","summary":" Previous methods for 3D human motion recovery from monocular images often\nfall short due to reliance on camera coordinates, leading to inaccuracies in\nreal-world applications. The limited availability and diversity of focal length\nlabels further exacerbate misalignment issues in reconstructed 3D human bodies.\nTo address these challenges, we introduce W-HMR, a weak-supervised calibration\nmethod that predicts \"reasonable\" focal lengths based on body distortion\ninformation, eliminating the need for precise focal length labels. Our approach\nenhances 2D supervision precision and recovery accuracy. Additionally, we\npresent the OrientCorrect module, which corrects body orientation for plausible\nreconstructions in world space, avoiding the error accumulation associated with\ninaccurate camera rotation predictions. 
Our contributions include a novel\nweak-supervised camera calibration technique, an effective orientation\ncorrection module, and a decoupling strategy that significantly improves the\ngeneralizability and accuracy of human motion recovery in both camera and world\ncoordinates. The robustness of W-HMR is validated through extensive experiments\non various datasets, showcasing its superiority over existing methods. Codes\nand demos have been made available on the project page\nhttps://yw0208.github.io/w-hmr/.\n","authors":["Wei Yao","Hongwen Zhang","Yunlian Sun","Yebin Liu","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2311.17460v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05378v1","updated":"2024-09-09T07:18:09Z","published":"2024-09-09T07:18:09Z","title":"Memoryless Multimodal Anomaly Detection via Student-Teacher Network and\n Signed Distance Learning","summary":" Unsupervised anomaly detection is a challenging computer vision task, in\nwhich 2D-based anomaly detection methods have been extensively studied.\nHowever, multimodal anomaly detection based on RGB images and 3D point clouds\nrequires further investigation. The existing methods are mainly inspired by\nmemory bank based methods commonly used in 2D-based anomaly detection, which\nmay cost extra memory for storing mutimodal features. In present study, a novel\nmemoryless method MDSS is proposed for multimodal anomaly detection, which\nemploys a light-weighted student-teacher network and a signed distance function\nto learn from RGB images and 3D point clouds respectively, and complements the\nanomaly information from the two modalities. Specifically, a student-teacher\nnetwork is trained with normal RGB images and masks generated from point clouds\nby a dynamic loss, and the anomaly score map could be obtained from the\ndiscrepancy between the output of student and teacher. Furthermore, the signed\ndistance function learns from normal point clouds to predict the signed\ndistances between points and surface, and the obtained signed distances are\nused to generate anomaly score map. Subsequently, the anomaly score maps are\naligned to generate the final anomaly score map for detection. The experimental\nresults indicate that MDSS is comparable but more stable than the SOTA memory\nbank based method Shape-guided, and furthermore performs better than other\nbaseline methods.\n","authors":["Zhongbin Sun","Xiaolong Li","Yiran Li","Yue Ma"],"pdf_url":"https://arxiv.org/pdf/2409.05378v1.pdf","comment":"14 pages, 4 figures, 2 tables, to be published in PRCV-2024"},{"id":"http://arxiv.org/abs/2409.05370v1","updated":"2024-09-09T06:57:22Z","published":"2024-09-09T06:57:22Z","title":"KARGEN: Knowledge-enhanced Automated Radiology Report Generation Using\n Large Language Models","summary":" Harnessing the robust capabilities of Large Language Models (LLMs) for\nnarrative generation, logical reasoning, and common-sense knowledge\nintegration, this study delves into utilizing LLMs to enhance automated\nradiology report generation (R2Gen). Despite the wealth of knowledge within\nLLMs, efficiently triggering relevant knowledge within these large models for\nspecific tasks like R2Gen poses a critical research challenge. This paper\npresents KARGEN, a Knowledge-enhanced Automated radiology Report GENeration\nframework based on LLMs. 
Utilizing a frozen LLM to generate reports, the\nframework integrates a knowledge graph to unlock chest disease-related\nknowledge within the LLM to enhance the clinical utility of generated reports.\nThis is achieved by leveraging the knowledge graph to distill disease-related\nfeatures in a designed way. Since a radiology report encompasses both normal\nand disease-related findings, the extracted graph-enhanced disease-related\nfeatures are integrated with regional image features, attending to both\naspects. We explore two fusion methods to automatically prioritize and select\nthe most relevant features. The fused features are employed by LLM to generate\nreports that are more sensitive to diseases and of improved quality. Our\napproach demonstrates promising results on the MIMIC-CXR and IU-Xray datasets.\n","authors":["Yingshu Li","Zhanyu Wang","Yunyi Liu","Lei Wang","Lingqiao Liu","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.05370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05847v2","updated":"2024-09-09T06:54:53Z","published":"2024-03-09T09:15:37Z","title":"iBA: Backdoor Attack on 3D Point Cloud via Reconstructing Itself","summary":" The widespread deployment of Deep Neural Networks (DNNs) for 3D point cloud\nprocessing starkly contrasts with their susceptibility to security breaches,\nnotably backdoor attacks. These attacks hijack DNNs during training, embedding\ntriggers in the data that, once activated, cause the network to make\npredetermined errors while maintaining normal performance on unaltered data.\nThis vulnerability poses significant risks, especially given the insufficient\nresearch on robust defense mechanisms for 3D point cloud networks against such\nsophisticated threats. Existing attacks either struggle to resist basic point\ncloud pre-processing methods, or rely on delicate manual design. Exploring\nsimple, effective, imperceptible, and difficult-to-defend triggers in 3D point\nclouds is still challenging.To address these challenges, we introduce\nMirrorAttack, a novel effective 3D backdoor attack method, which implants the\ntrigger by simply reconstructing a clean point cloud with an auto-encoder. The\ndata-driven nature of the MirrorAttack obviates the need for complex manual\ndesign. Minimizing the reconstruction loss automatically improves\nimperceptibility. Simultaneously, the reconstruction network endows the trigger\nwith pronounced nonlinearity and sample specificity, rendering traditional\npreprocessing techniques ineffective in eliminating it. A trigger smoothing\nmodule based on spherical harmonic transformation is also attached to regulate\nthe intensity of the attack.Both quantitive and qualitative results verify the\neffectiveness of our method. We achieve state-of-the-art ASR on different types\nof victim models with the intervention of defensive techniques. Moreover, the\nminimal perturbation introduced by our trigger, as assessed by various metrics,\nattests to the method's stealth, ensuring its imperceptibility.\n","authors":["Yuhao Bian","Shengjing Tian","Xiuping Liu"],"pdf_url":"https://arxiv.org/pdf/2403.05847v2.pdf","comment":"16 pages. 
in IEEE Transactions on Information Forensics and Security\n (2024)"},{"id":"http://arxiv.org/abs/2408.14846v2","updated":"2024-09-09T06:50:18Z","published":"2024-08-27T07:57:58Z","title":"Diffusion-Occ: 3D Point Cloud Completion via Occupancy Diffusion","summary":" Point clouds are crucial for capturing three-dimensional data but often\nsuffer from incompleteness due to limitations such as resolution and occlusion.\nTraditional methods typically rely on point-based approaches within\ndiscriminative frameworks for point cloud completion. In this paper, we\nintroduce \\textbf{Diffusion-Occ}, a novel framework for Diffusion Point Cloud\nCompletion. Diffusion-Occ utilizes a two-stage coarse-to-fine approach. In the\nfirst stage, the Coarse Density Voxel Prediction Network (CDNet) processes\npartial points to predict coarse density voxels, streamlining global feature\nextraction through voxel classification, as opposed to previous\nregression-based methods. In the second stage, we introduce the Occupancy\nGeneration Network (OccGen), a conditional occupancy diffusion model based on a\ntransformer architecture and enhanced by our Point-Voxel Fuse (PVF) block. This\nblock integrates coarse density voxels with partial points to leverage both\nglobal and local features for comprehensive completion. By thresholding the\noccupancy field, we convert it into a complete point cloud. Additionally, our\nmethod employs diverse training mixtures and efficient diffusion\nparameterization to enable effective one-step sampling during both training and\ninference. Experimental results demonstrate that Diffusion-Occ outperforms\nexisting discriminative and generative methods.\n","authors":["Guoqing Zhang","Jian Liu"],"pdf_url":"https://arxiv.org/pdf/2408.14846v2.pdf","comment":"After a closer examination of our work, we've determined that our\n experiments are not thorough and robust enough, possibly impacting the\n accuracy of our conclusions. Hence, we've decided to withdraw our article\n and, after refining our experiments, intend to resubmit the paper once\n significant improvements have been made"},{"id":"http://arxiv.org/abs/2409.05359v1","updated":"2024-09-09T06:42:17Z","published":"2024-09-09T06:42:17Z","title":"FedBrain-Distill: Communication-Efficient Federated Brain Tumor\n Classification Using Ensemble Knowledge Distillation on Non-IID Data","summary":" The brain is one of the most complex organs in the human body. Due to its\ncomplexity, classification of brain tumors still poses a significant challenge,\nmaking them a particularly serious medical issue. Techniques such as\nMachine Learning (ML) coupled with Magnetic Resonance Imaging (MRI) have paved\nthe way for doctors and medical institutions to classify different types of\ntumors. However, these techniques suffer from limitations that violate patients'\nprivacy. Federated Learning (FL) has recently been introduced to solve such an\nissue, but the FL itself suffers from limitations like communication costs and\ndependencies on model architecture, forcing all models to have identical\narchitectures. In this paper, we propose FedBrain-Distill, an approach that\nleverages Knowledge Distillation (KD) in an FL setting that maintains the users'\nprivacy and ensures the independence of FL clients in terms of model\narchitecture. FedBrain-Distill uses an ensemble of teachers that distill their\nknowledge to a simple student model. 
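For intuition, a minimal NumPy sketch of ensemble-teacher distillation targets is given below: temperature-softened teacher outputs are averaged and the student is trained with a soft cross-entropy against that aggregate. FedBrain-Distill's actual aggregation scheme, temperature, and loss are not specified here, so those choices are assumptions.

```python
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def ensemble_soft_targets(teacher_logits_list, temperature=3.0):
    """Average the temperature-softened predictions of several teachers.

    teacher_logits_list: list of (batch, classes) arrays, one per teacher.
    Returns the aggregated soft targets the student is trained to match.
    """
    probs = [softmax(t / temperature) for t in teacher_logits_list]
    return np.mean(probs, axis=0)

def distillation_loss(student_logits, soft_targets, temperature=3.0):
    # Soft cross-entropy between aggregated teacher targets and student predictions.
    log_p = np.log(softmax(student_logits / temperature) + 1e-12)
    return -(soft_targets * log_p).sum(axis=1).mean()

teachers = [np.random.randn(8, 4) for _ in range(3)]   # 3 teachers, 4 tumor classes
student = np.random.randn(8, 4)
targets = ensemble_soft_targets(teachers)
print(distillation_loss(student, targets))
```

In a federated setting, only the aggregated soft targets (not the teacher weights) would need to be communicated, which is what keeps the scheme communication-efficient.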
The evaluation of FedBrain-Distill\ndemonstrated high-accuracy results for both Independent and Identically\nDistributed (IID) and non-IID data with substantial low communication costs on\nthe real-world Figshare brain tumor dataset. It is worth mentioning that we\nused Dirichlet distribution to partition the data into IID and non-IID data.\nAll the implementation details are accessible through our Github repository.\n","authors":["Rasoul Jafari Gohari","Laya Aliahmadipour","Ezat Valipour"],"pdf_url":"https://arxiv.org/pdf/2409.05359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11398v2","updated":"2024-09-09T06:21:21Z","published":"2024-07-16T05:35:57Z","title":"Animate3D: Animating Any 3D Model with Multi-view Video Diffusion","summary":" Recent advances in 4D generation mainly focus on generating 4D content by\ndistilling pre-trained text or single-view image-conditioned models. It is\ninconvenient for them to take advantage of various off-the-shelf 3D assets with\nmulti-view attributes, and their results suffer from spatiotemporal\ninconsistency owing to the inherent ambiguity in the supervision signals. In\nthis work, we present Animate3D, a novel framework for animating any static 3D\nmodel. The core idea is two-fold: 1) We propose a novel multi-view video\ndiffusion model (MV-VDM) conditioned on multi-view renderings of the static 3D\nobject, which is trained on our presented large-scale multi-view video dataset\n(MV-Video). 2) Based on MV-VDM, we introduce a framework combining\nreconstruction and 4D Score Distillation Sampling (4D-SDS) to leverage the\nmulti-view video diffusion priors for animating 3D objects. Specifically, for\nMV-VDM, we design a new spatiotemporal attention module to enhance spatial and\ntemporal consistency by integrating 3D and video diffusion models.\nAdditionally, we leverage the static 3D model's multi-view renderings as\nconditions to preserve its identity. For animating 3D models, an effective\ntwo-stage pipeline is proposed: we first reconstruct motions directly from\ngenerated multi-view videos, followed by the introduced 4D-SDS to refine both\nappearance and motion. Benefiting from accurate motion learning, we could\nachieve straightforward mesh animation. Qualitative and quantitative\nexperiments demonstrate that Animate3D significantly outperforms previous\napproaches. Data, code, and models will be open-released.\n","authors":["Yanqin Jiang","Chaohui Yu","Chenjie Cao","Fan Wang","Weiming Hu","Jin Gao"],"pdf_url":"https://arxiv.org/pdf/2407.11398v2.pdf","comment":"Project Page: https://animate3d.github.io/"},{"id":"http://arxiv.org/abs/2409.05352v1","updated":"2024-09-09T06:17:46Z","published":"2024-09-09T06:17:46Z","title":"Driving with Prior Maps: Unified Vector Prior Encoding for Autonomous\n Vehicle Mapping","summary":" High-Definition Maps (HD maps) are essential for the precise navigation and\ndecision-making of autonomous vehicles, yet their creation and upkeep present\nsignificant cost and timeliness challenges. The online construction of HD maps\nusing on-board sensors has emerged as a promising solution; however, these\nmethods can be impeded by incomplete data due to occlusions and inclement\nweather. This paper proposes the PriorDrive framework to addresses these\nlimitations by harnessing the power of prior maps, significantly enhancing the\nrobustness and accuracy of online HD map construction. 
Our approach integrates\na variety of prior maps, such as OpenStreetMap's Standard Definition Maps (SD\nmaps), outdated HD maps from vendors, and locally constructed maps from\nhistorical vehicle data. To effectively encode this prior information into\nonline mapping models, we introduce a Hybrid Prior Representation (HPQuery)\nthat standardizes the representation of diverse map elements. At the core of\nPriorDrive is the Unified Vector Encoder (UVE), which employs a dual encoding\nmechanism to process vector data. The intra-vector encoder captures\nfine-grained local features, while the inter-vector encoder integrates global\ncontext. Furthermore, we propose a segment-level and point-level pre-training\nstrategy that enables the UVE to learn the prior distribution of vector data,\nthereby improving the encoder's generalizability and performance. Through\nextensive testing on the nuScenes dataset, we demonstrate that PriorDrive is\nhighly compatible with various online mapping models and substantially improves\nmap prediction capabilities. The integration of prior maps through the\nPriorDrive framework offers a robust solution to the challenges of\nsingle-perception data, paving the way for more reliable autonomous vehicle\nnavigation.\n","authors":["Shuang Zeng","Xinyuan Chang","Xinran Liu","Zheng Pan","Xing Wei"],"pdf_url":"https://arxiv.org/pdf/2409.05352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11802v2","updated":"2024-09-09T06:17:15Z","published":"2024-07-16T14:53:35Z","title":"Invariant Causal Knowledge Distillation in Neural Networks","summary":" Knowledge distillation (KD) involves transferring the knowledge from one\nneural network to another, often from a larger, well-trained model (teacher) to\na smaller, more efficient model (student). Traditional KD methods minimize the\nKullback-Leibler (KL) divergence between the probabilistic outputs of the\nteacher and student networks. However, this approach often overlooks crucial\nstructural knowledge embedded within the teacher's network. In this paper, we\nintroduce Invariant Consistency Distillation (ICD), a novel methodology\ndesigned to enhance KD by ensuring that the student model's representations are\nboth discriminative and invariant with respect to the teacher's outputs. Our\napproach is based on causal inference principles and combines contrastive\nlearning with an explicit invariance penalty, capturing significantly more\ninformation from the teacher's representation. ICD uses an efficient,\nparameter-free approach for flexible teacher-student alignment. We provide a\ntheoretical foundation for ICD and demonstrate its effectiveness through\nextensive experiments. Our results on CIFAR-100 and ImageNet ILSVRC-2012 show\nthat ICD outperforms traditional KD techniques and surpasses state-of-the-art\nmethods. In some cases, the student model even exceeds the teacher model in\nterms of accuracy. Furthermore, we successfully apply our method to other\ndatasets, such as Tiny ImageNet and STL-10, demonstrating superior\ncross-dataset generalization. Code is available at\nhttps://github.com/giakoumoglou/distillers.\n","authors":["Nikolaos Giakoumoglou","Tania Stathaki"],"pdf_url":"https://arxiv.org/pdf/2407.11802v2.pdf","comment":"8 pages, 2 figures, 4 tables. 
The paper's title has been changed to\n better emphasize its theoretical foundation"},{"id":"http://arxiv.org/abs/2407.12073v3","updated":"2024-09-09T06:13:48Z","published":"2024-07-16T14:56:13Z","title":"Relational Representation Distillation","summary":" Knowledge distillation (KD) is an effective method for transferring knowledge\nfrom a large, well-trained teacher model to a smaller, more efficient student\nmodel. Despite its success, one of the main challenges in KD is ensuring the\nefficient transfer of complex knowledge while maintaining the student's\ncomputational efficiency. Unlike previous works that applied contrastive\nobjectives promoting explicit negative instances with little attention to the\nrelationships between them, we introduce Relational Representation Distillation\n(RRD). Our approach leverages pairwise similarities to explore and reinforce\nthe relationships between the teacher and student models. Inspired by\nself-supervised learning principles, it uses a relaxed contrastive loss that\nfocuses on similarity rather than exact replication. This method aligns the\noutput distributions of teacher samples in a large memory buffer, improving the\nrobustness and performance of the student model without the need for strict\nnegative instance differentiation. Our approach demonstrates superior\nperformance on CIFAR-100 and ImageNet ILSVRC-2012, outperforming traditional KD\nand sometimes even outperforms the teacher network when combined with KD. It\nalso transfers successfully to other datasets like Tiny ImageNet and STL-10.\nCode is available at https://github.com/giakoumoglou/distillers.\n","authors":["Nikolaos Giakoumoglou","Tania Stathaki"],"pdf_url":"https://arxiv.org/pdf/2407.12073v3.pdf","comment":"8 pages, 4 figures, 4 tables. Updated experiments on ImageNet\n ILSVRC-2012. arXiv admin note: text overlap with arXiv:2407.11802"},{"id":"http://arxiv.org/abs/2409.03777v2","updated":"2024-09-09T05:58:29Z","published":"2024-08-22T03:59:57Z","title":"A Greedy Hierarchical Approach to Whole-Network Filter-Pruning in CNNs","summary":" Deep convolutional neural networks (CNNs) have achieved impressive\nperformance in many computer vision tasks. However, their large model sizes\nrequire heavy computational resources, making pruning redundant filters from\nexisting pre-trained CNNs an essential task in developing efficient models for\nresource-constrained devices. Whole-network filter pruning algorithms prune\nvarying fractions of filters from each layer, hence providing greater\nflexibility. Current whole-network pruning methods are either computationally\nexpensive due to the need to calculate the loss for each pruned filter using a\ntraining dataset, or use various heuristic / learned criteria for determining\nthe pruning fractions for each layer. This paper proposes a two-level\nhierarchical approach for whole-network filter pruning which is efficient and\nuses the classification loss as the final criterion. The lower-level algorithm\n(called filter-pruning) uses a sparse-approximation formulation based on linear\napproximation of filter weights. We explore two algorithms: orthogonal matching\npursuit-based greedy selection and a greedy backward pruning approach. The\nbackward pruning algorithm uses a novel closed-form error criterion for\nefficiently selecting the optimal filter at each stage, thus making the whole\nalgorithm much faster. 
The higher-level algorithm (called layer-selection)\ngreedily selects the best-pruned layer (pruning using the filter-selection\nalgorithm) using a global pruning criterion. We propose algorithms for two\ndifferent global-pruning criteria: (1) layer-wise relative error (HBGS), and\n(2) final classification error (HBGTS). Our suite of algorithms outperforms\nstate-of-the-art pruning methods on ResNet18, ResNet32, ResNet56, VGG16, and\nResNext101. Our method reduces the RAM requirement for ResNext101 from 7.6 GB\nto 1.5 GB and achieves a 94% reduction in FLOPS without losing accuracy on\nCIFAR-10.\n","authors":["Kiran Purohit","Anurag Reddy Parvathgari","Sourangshu Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2409.03777v2.pdf","comment":"Accepted in TMLR 2024"},{"id":"http://arxiv.org/abs/2409.03420v2","updated":"2024-09-09T05:36:27Z","published":"2024-09-05T11:09:00Z","title":"mPLUG-DocOwl2: High-resolution Compressing for OCR-free Multi-page\n Document Understanding","summary":" Multimodel Large Language Models(MLLMs) have achieved promising OCR-free\nDocument Understanding performance by increasing the supported resolution of\ndocument images. However, this comes at the cost of generating thousands of\nvisual tokens for a single document image, leading to excessive GPU memory and\nslower inference times, particularly in multi-page document comprehension. In\nthis work, to address these challenges, we propose a High-resolution\nDocCompressor module to compress each high-resolution document image into 324\ntokens, guided by low-resolution global visual features. With this compression\nmodule, to strengthen multi-page document comprehension ability and balance\nboth token efficiency and question-answering performance, we develop the\nDocOwl2 under a three-stage training framework: Single-image Pretraining,\nMulti-image Continue-pretraining, and Multi-task Finetuning. DocOwl2 sets a new\nstate-of-the-art across multi-page document understanding benchmarks and\nreduces first token latency by more than 50%, demonstrating advanced\ncapabilities in multi-page questioning answering, explanation with evidence\npages, and cross-page structure understanding. Additionally, compared to\nsingle-image MLLMs trained on similar data, our DocOwl2 achieves comparable\nsingle-page understanding performance with less than 20% of the visual tokens.\nOur codes, models, and data are publicly available at\nhttps://github.com/X-PLUG/mPLUG-DocOwl/tree/main/DocOwl2.\n","authors":["Anwen Hu","Haiyang Xu","Liang Zhang","Jiabo Ye","Ming Yan","Ji Zhang","Qin Jin","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.03420v2.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.05336v1","updated":"2024-09-09T05:29:38Z","published":"2024-09-09T05:29:38Z","title":"Early-exit Convolutional Neural Networks","summary":" This paper is aimed at developing a method that reduces the computational\ncost of convolutional neural networks (CNN) during inference. Conventionally,\nthe input data pass through a fixed neural network architecture. However, easy\nexamples can be classified at early stages of processing and conventional\nnetworks do not take this into account. In this paper, we introduce 'Early-exit\nCNNs', EENets for short, which adapt their computational cost based on the\ninput by stopping the inference process at certain exit locations. In EENets,\nthere are a number of exit blocks each of which consists of a confidence branch\nand a softmax branch. 
The confidence branch computes the confidence score of\nexiting (i.e. stopping the inference process) at that location; while the\nsoftmax branch outputs a classification probability vector. Both branches are\nlearnable and their parameters are separate. During training of EENets, in\naddition to the classical classification loss, the computational cost of\ninference is taken into account as well. As a result, the network adapts its\nmany confidence branches to the inputs so that less computation is spent for\neasy examples. Inference works as in conventional feed-forward networks,\nhowever, when the output of a confidence branch is larger than a certain\nthreshold, the inference stops for that specific example. The idea of EENets is\napplicable to available CNN architectures such as ResNets. Through\ncomprehensive experiments on MNIST, SVHN, CIFAR10 and Tiny-ImageNet datasets,\nwe show that early-exit (EE) ResNets achieve similar accuracy with their non-EE\nversions while reducing the computational cost to 20% of the original. Code is\navailable at https://github.com/eksuas/eenets.pytorch\n","authors":["Edanur Demir","Emre Akbas"],"pdf_url":"https://arxiv.org/pdf/2409.05336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05335v1","updated":"2024-09-09T05:26:33Z","published":"2024-09-09T05:26:33Z","title":"A Multi-Modal Deep Learning Based Approach for House Price Prediction","summary":" Accurate prediction of house price, a vital aspect of the residential real\nestate sector, is of substantial interest for a wide range of stakeholders.\nHowever, predicting house prices is a complex task due to the significant\nvariability influenced by factors such as house features, location,\nneighborhood, and many others. Despite numerous attempts utilizing a wide array\nof algorithms, including recent deep learning techniques, to predict house\nprices accurately, existing approaches have fallen short of considering a wide\nrange of factors such as textual and visual features. This paper addresses this\ngap by comprehensively incorporating attributes, such as features, textual\ndescriptions, geo-spatial neighborhood, and house images, typically showcased\nin real estate listings in a house price prediction system. Specifically, we\npropose a multi-modal deep learning approach that leverages different types of\ndata to learn more accurate representation of the house. In particular, we\nlearn a joint embedding of raw house attributes, geo-spatial neighborhood, and\nmost importantly from textual description and images representing the house;\nand finally use a downstream regression model to predict the house price from\nthis jointly learned embedding vector. Our experimental results with a\nreal-world dataset show that the text embedding of the house advertisement\ndescription and image embedding of the house pictures in addition to raw\nattributes and geo-spatial embedding, can significantly improve the house price\nprediction accuracy. 
The relevant source code and dataset are publicly\naccessible at the following URL: https://github.com/4P0N/mhpp\n","authors":["Md Hasebul Hasan","Md Abid Jahan","Mohammed Eunus Ali","Yuan-Fang Li","Timos Sellis"],"pdf_url":"https://arxiv.org/pdf/2409.05335v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2409.05334v1","updated":"2024-09-09T05:25:15Z","published":"2024-09-09T05:25:15Z","title":"Lagrangian Hashing for Compressed Neural Field Representations","summary":" We present Lagrangian Hashing, a representation for neural fields combining\nthe characteristics of fast training NeRF methods that rely on Eulerian grids\n(i.e.~InstantNGP), with those that employ points equipped with features as a\nway to represent information (e.g. 3D Gaussian Splatting or PointNeRF). We\nachieve this by incorporating a point-based representation into the\nhigh-resolution layers of the hierarchical hash tables of an InstantNGP\nrepresentation. As our points are equipped with a field of influence, our\nrepresentation can be interpreted as a mixture of Gaussians stored within the\nhash table. We propose a loss that encourages the movement of our Gaussians\ntowards regions that require more representation budget to be sufficiently well\nrepresented. Our main finding is that our representation allows the\nreconstruction of signals using a more compact representation without\ncompromising quality.\n","authors":["Shrisudhan Govindarajan","Zeno Sambugaro"," Akhmedkhan"," Shabanov","Towaki Takikawa","Daniel Rebain","Weiwei Sun","Nicola Conci","Kwang Moo Yi","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2409.05334v1.pdf","comment":"Project page: https://theialab.github.io/laghashes/"},{"id":"http://arxiv.org/abs/2408.01688v2","updated":"2024-09-09T05:24:27Z","published":"2024-08-03T07:02:01Z","title":"SiamMo: Siamese Motion-Centric 3D Object Tracking","summary":" Current 3D single object tracking methods primarily rely on the Siamese\nmatching-based paradigm, which struggles with textureless and incomplete LiDAR\npoint clouds. Conversely, the motion-centric paradigm avoids appearance\nmatching, thus overcoming these issues. However, its complex multi-stage\npipeline and the limited temporal modeling capability of a single-stream\narchitecture constrain its potential. In this paper, we introduce SiamMo, a\nnovel and simple Siamese motion-centric tracking approach. Unlike the\ntraditional single-stream architecture, we employ Siamese feature extraction\nfor motion-centric tracking. This decouples feature extraction from temporal\nfusion, significantly enhancing tracking performance. Additionally, we design a\nSpatio-Temporal Feature Aggregation module to integrate Siamese features at\nmultiple scales, capturing motion information effectively. We also introduce a\nBox-aware Feature Encoding module to encode object size priors into motion\nestimation. SiamMo is a purely motion-centric tracker that eliminates the need\nfor additional processes like segmentation and box refinement. Without whistles\nand bells, SiamMo not only surpasses state-of-the-art methods across multiple\nbenchmarks but also demonstrates exceptional robustness in challenging\nscenarios. SiamMo sets a new record on the KITTI tracking benchmark with 90.1\\%\nprecision while maintaining a high inference speed of 108 FPS. 
The code will be\nreleased at https://github.com/HDU-VRLab/SiamMo.\n","authors":["Yuxiang Yang","Yingqi Deng","Jing Zhang","Hongjie Gu","Zhekang Dong"],"pdf_url":"https://arxiv.org/pdf/2408.01688v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05330v1","updated":"2024-09-09T05:20:02Z","published":"2024-09-09T05:20:02Z","title":"KAN-Based Fusion of Dual-Domain for Audio-Driven Facial Landmarks\n Generation","summary":" Audio-driven talking face generation is a widely researched topic due to its\nhigh applicability. Reconstructing a talking face using audio significantly\ncontributes to fields such as education, healthcare, online conversations,\nvirtual assistants, and virtual reality. Early studies often focused solely on\nchanging the mouth movements, which resulted in outcomes with limited practical\napplications. Recently, researchers have proposed a new approach of\nconstructing the entire face, including face pose, neck, and shoulders. To\nachieve this, they need to generate through landmarks. However, creating stable\nlandmarks that align well with the audio is a challenge. In this paper, we\npropose the KFusion of Dual-Domain model, a robust model that generates\nlandmarks from audio. We separate the audio into two distinct domains to learn\nemotional information and facial context, then use a fusion mechanism based on\nthe KAN model. Our model demonstrates high efficiency compared to recent\nmodels. This will lay the groundwork for the development of the audio-driven\ntalking face generation problem in the future.\n","authors":["Hoang-Son Vo-Thanh","Quang-Vinh Nguyen","Soo-Hyung Kim"],"pdf_url":"https://arxiv.org/pdf/2409.05330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02647v3","updated":"2024-09-09T05:19:48Z","published":"2023-12-05T10:39:37Z","title":"TPA3D: Triplane Attention for Fast Text-to-3D Generation","summary":" Due to the lack of large-scale text-3D correspondence data, recent text-to-3D\ngeneration works mainly rely on utilizing 2D diffusion models for synthesizing\n3D data. Since diffusion-based methods typically require significant\noptimization time for both training and inference, the use of GAN-based models\nwould still be desirable for fast 3D generation. In this work, we propose\nTriplane Attention for text-guided 3D generation (TPA3D), an end-to-end\ntrainable GAN-based deep learning model for fast text-to-3D generation. With\nonly 3D shape data and their rendered 2D images observed during training, our\nTPA3D is designed to retrieve detailed visual descriptions for synthesizing the\ncorresponding 3D mesh data. This is achieved by the proposed attention\nmechanisms on the extracted sentence and word-level text features. 
In our\nexperiments, we show that TPA3D generates high-quality 3D textured shapes\naligned with fine-grained descriptions, while impressive computation efficiency\ncan be observed.\n","authors":["Bin-Shih Wu","Hong-En Chen","Sheng-Yu Huang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02647v3.pdf","comment":"ECCV2024, Project Page: https://redxouls.github.io/TPA3D/"},{"id":"http://arxiv.org/abs/2409.05327v1","updated":"2024-09-09T04:42:57Z","published":"2024-09-09T04:42:57Z","title":"ICPR 2024 Competition on Safe Segmentation of Drive Scenes in\n Unstructured Traffic and Adverse Weather Conditions","summary":" The ICPR 2024 Competition on Safe Segmentation of Drive Scenes in\nUnstructured Traffic and Adverse Weather Conditions served as a rigorous\nplatform to evaluate and benchmark state-of-the-art semantic segmentation\nmodels under challenging conditions for autonomous driving. Over several\nmonths, participants were provided with the IDD-AW dataset, consisting of 5000\nhigh-quality RGB-NIR image pairs, each annotated at the pixel level and\ncaptured under adverse weather conditions such as rain, fog, low light, and\nsnow. A key aspect of the competition was the use and improvement of the Safe\nmean Intersection over Union (Safe mIoU) metric, designed to penalize unsafe\nincorrect predictions that could be overlooked by traditional mIoU. This\ninnovative metric emphasized the importance of safety in developing autonomous\ndriving systems. The competition showed significant advancements in the field,\nwith participants demonstrating models that excelled in semantic segmentation\nand prioritized safety and robustness in unstructured and adverse conditions.\nThe results of the competition set new benchmarks in the domain, highlighting\nthe critical role of safety in deploying autonomous vehicles in real-world\nscenarios. The contributions from this competition are expected to drive\nfurther innovation in autonomous driving technology, addressing the critical\nchallenges of operating in diverse and unpredictable environments.\n","authors":["Furqan Ahmed Shaik","Sandeep Nagar","Aiswarya Maturi","Harshit Kumar Sankhla","Dibyendu Ghosh","Anshuman Majumdar","Srikanth Vidapanakal","Kunal Chaudhary","Sunny Manchanda","Girish Varma"],"pdf_url":"https://arxiv.org/pdf/2409.05327v1.pdf","comment":"15 pages, 7 figures, ICPR Competition Paper"},{"id":"http://arxiv.org/abs/2409.05324v1","updated":"2024-09-09T04:34:47Z","published":"2024-09-09T04:34:47Z","title":"FIF-UNet: An Efficient UNet Using Feature Interaction and Fusion for\n Medical Image Segmentation","summary":" Nowadays, pre-trained encoders are widely used in medical image segmentation\nbecause of their ability to capture complex feature representations. However,\nthe existing models fail to effectively utilize the rich features obtained by\nthe pre-trained encoder, resulting in suboptimal segmentation results. In this\nwork, a novel U-shaped model, called FIF-UNet, is proposed to address the above\nissue, including three plug-and-play modules. A channel spatial interaction\nmodule (CSI) is proposed to obtain informative features by establishing the\ninteraction between encoder stages and corresponding decoder stages. A cascaded\nconv-SE module (CoSE) is designed to enhance the representation of critical\nfeatures by adaptively assigning importance weights on different feature\nchannels. 
A multi-level fusion module (MLF) is proposed to fuse the multi-scale\nfeatures from the decoder stages, ensuring accurate and robust final\nsegmentation. Comprehensive experiments on the Synapse and ACDC datasets\ndemonstrate that the proposed FIF-UNet outperforms existing state-of-the-art\nmethods, which achieves the highest average DICE of 86.05% and 92.58%,\nrespectively.\n","authors":["Xiaolin Gou","Chuanlin Liao","Jizhe Zhou","Fengshuo Ye","Yi Lin"],"pdf_url":"https://arxiv.org/pdf/2409.05324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18880v2","updated":"2024-09-09T04:17:17Z","published":"2024-05-29T08:39:31Z","title":"EventZoom: A Progressive Approach to Event-Based Data Augmentation for\n Enhanced Neuromorphic Vision","summary":" Dynamic Vision Sensors (DVS) capture event data with high temporal resolution\nand low power consumption, presenting a more efficient solution for visual\nprocessing in dynamic and real-time scenarios compared to conventional video\ncapture methods. Event data augmentation serve as an essential method for\novercoming the limitation of scale and diversity in event datasets. Our\ncomparative experiments demonstrate that the two factors, spatial integrity and\ntemporal continuity, can significantly affect the capacity of event data\naugmentation, which are guarantee for maintaining the sparsity and high dynamic\nrange characteristics unique to event data. However, existing augmentation\nmethods often neglect the preservation of spatial integrity and temporal\ncontinuity. To address this, we developed a novel event data augmentation\nstrategy EventZoom, which employs a temporal progressive strategy, embedding\ntransformed samples into the original samples through progressive scaling and\nshifting. The scaling process avoids the spatial information loss associated\nwith cropping, while the progressive strategy prevents interruptions or abrupt\nchanges in temporal information. We validated EventZoom across various\nsupervised learning frameworks. The experimental results show that EventZoom\nconsistently outperforms existing event data augmentation methods with SOTA\nperformance. For the first time, we have concurrently employed Semi-supervised\nand Unsupervised learning to verify feasibility on event augmentation\nalgorithms, demonstrating the applicability and effectiveness of EventZoom as a\npowerful event-based data augmentation tool in handling real-world scenes with\nhigh dynamics and variability environments.\n","authors":["Yiting Dong","Xiang He","Guobin Shen","Dongcheng Zhao","Yang Li","Yi Zeng"],"pdf_url":"https://arxiv.org/pdf/2405.18880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05312v1","updated":"2024-09-09T03:53:03Z","published":"2024-09-09T03:53:03Z","title":"Open-World Dynamic Prompt and Continual Visual Representation Learning","summary":" The open world is inherently dynamic, characterized by ever-evolving concepts\nand distributions. Continual learning (CL) in this dynamic open-world\nenvironment presents a significant challenge in effectively generalizing to\nunseen test-time classes. To address this challenge, we introduce a new\npractical CL setting tailored for open-world visual representation learning. In\nthis setting, subsequent data streams systematically introduce novel classes\nthat are disjoint from those seen in previous training phases, while also\nremaining distinct from the unseen test classes. 
In response, we present\nDynamic Prompt and Representation Learner (DPaRL), a simple yet effective\nPrompt-based CL (PCL) method. Our DPaRL learns to generate dynamic prompts for\ninference, as opposed to relying on a static prompt pool in previous PCL\nmethods. In addition, DPaRL jointly learns dynamic prompt generation and\ndiscriminative representation at each training stage whereas prior PCL methods\nonly refine the prompt learning throughout the process. Our experimental\nresults demonstrate the superiority of our approach, surpassing\nstate-of-the-art methods on well-established open-world image retrieval\nbenchmarks by an average of 4.7\\% improvement in Recall@1 performance.\n","authors":["Youngeun Kim","Jun Fang","Qin Zhang","Zhaowei Cai","Yantao Shen","Rahul Duggal","Dripta S. Raychaudhuri","Zhuowen Tu","Yifan Xing","Onkar Dabeer"],"pdf_url":"https://arxiv.org/pdf/2409.05312v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2409.05311v1","updated":"2024-09-09T03:50:41Z","published":"2024-09-09T03:50:41Z","title":"Fitting Skeletal Models via Graph-based Learning","summary":" Skeletonization is a popular shape analysis technique that models an object's\ninterior as opposed to just its boundary. Fitting template-based skeletal\nmodels is a time-consuming process requiring much manual parameter tuning.\nRecently, machine learning-based methods have shown promise for generating\ns-reps from object boundaries. In this work, we propose a new skeletonization\nmethod which leverages graph convolutional networks to produce skeletal\nrepresentations (s-reps) from dense segmentation masks. The method is evaluated\non both synthetic data and real hippocampus segmentations, achieving promising\nresults and fast inference.\n","authors":["Nicolás Gaggion","Enzo Ferrante","Beatriz Paniagua","Jared Vicory"],"pdf_url":"https://arxiv.org/pdf/2409.05311v1.pdf","comment":"This paper was presented at the 2024 IEEE International Symposium on\n Biomedical Imaging (ISBI)"},{"id":"http://arxiv.org/abs/2409.05310v1","updated":"2024-09-09T03:42:12Z","published":"2024-09-09T03:42:12Z","title":"Neural Surface Reconstruction and Rendering for LiDAR-Visual Systems","summary":" This paper presents a unified surface reconstruction and rendering framework\nfor LiDAR-visual systems, integrating Neural Radiance Fields (NeRF) and Neural\nDistance Fields (NDF) to recover both appearance and structural information\nfrom posed images and point clouds. We address the structural visible gap\nbetween NeRF and NDF by utilizing a visible-aware occupancy map to classify\nspace into the free, occupied, visible unknown, and background regions. This\nclassification facilitates the recovery of a complete appearance and structure\nof the scene. We unify the training of the NDF and NeRF using a spatial-varying\nscale SDF-to-density transformation for levels of detail for both structure and\nappearance. The proposed method leverages the learned NDF for structure-aware\nNeRF training by an adaptive sphere tracing sampling strategy for accurate\nstructure rendering. In return, NeRF further refines structural in recovering\nmissing or fuzzy structures in the NDF. Extensive experiments demonstrate the\nsuperior quality and versatility of the proposed method across various\nscenarios. 
To benefit the community, the codes will be released at\n\\url{https://github.com/hku-mars/M2Mapping}.\n","authors":["Jianheng Liu","Chunran Zheng","Yunfei Wan","Bowen Wang","Yixi Cai","Fu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05307v1","updated":"2024-09-09T03:34:05Z","published":"2024-09-09T03:34:05Z","title":"RAL:Redundancy-Aware Lipreading Model Based on Differential Learning\n with Symmetric Views","summary":" Lip reading involves interpreting a speaker's speech by analyzing sequences\nof lip movements. Currently, most models regard the left and right halves of\nthe lips as a symmetrical whole, lacking a thorough investigation of their\ndifferences. However, the left and right halves of the lips are not always\nsymmetrical, and the subtle differences between them contain rich semantic\ninformation. In this paper, we propose a differential learning strategy with\nsymmetric views (DLSV) to address this issue. Additionally, input images often\ncontain a lot of redundant information unrelated to recognition results, which\ncan degrade the model's performance. We present a redundancy-aware operation\n(RAO) to reduce it. Finally, to leverage the relational information between\nsymmetric views and within each view, we further design an adaptive cross-view\ninteraction module (ACVI). Experiments on LRW and LRW-1000 datasets fully\ndemonstrate the effectiveness of our approach.\n","authors":["Zejun gu","Junxia jiang"],"pdf_url":"https://arxiv.org/pdf/2409.05307v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.03431v3","updated":"2024-09-09T03:22:43Z","published":"2024-09-05T11:23:41Z","title":"UV-Mamba: A DCN-Enhanced State Space Model for Urban Village Boundary\n Identification in High-Resolution Remote Sensing Images","summary":" Due to the diverse geographical environments, intricate landscapes, and\nhigh-density settlements, the automatic identification of urban village\nboundaries using remote sensing images remains a highly challenging task. This\npaper proposes a novel and efficient neural network model called UV-Mamba for\naccurate boundary detection in high-resolution remote sensing images. UV-Mamba\nmitigates the memory loss problem in lengthy sequence modeling, which arises in\nstate space models with increasing image size, by incorporating deformable\nconvolutions. Its architecture utilizes an encoder-decoder framework and\nincludes an encoder with four deformable state space augmentation blocks for\nefficient multi-level semantic extraction and a decoder to integrate the\nextracted semantic information. We conducted experiments on two large datasets\nshowing that UV-Mamba achieves state-of-the-art performance. Specifically, our\nmodel achieves 73.3% and 78.1% IoU on the Beijing and Xi'an datasets,\nrespectively, representing improvements of 1.2% and 3.4% IoU over the previous\nbest model while also being 6x faster in inference speed and 40x smaller in\nparameter count. 
Source code and pre-trained models are available at\nhttps://github.com/Devin-Egber/UV-Mamba.\n","authors":["Lulin Li","Ben Chen","Xuechao Zou","Junliang Xing","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2409.03431v3.pdf","comment":"5 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2408.09460v3","updated":"2024-09-09T03:21:53Z","published":"2024-08-18T12:48:48Z","title":"Fine-Grained Building Function Recognition from Street-View Images via\n Geometry-Aware Semi-Supervised Learning","summary":" In this work, we propose a geometry-aware semi-supervised framework for\nfine-grained building function recognition, utilizing geometric relationships\namong multi-source data to enhance pseudo-label accuracy in semi-supervised\nlearning, broadening its applicability to various building function\ncategorization systems. Firstly, we design an online semi-supervised\npre-training stage, which facilitates the precise acquisition of building\nfacade location information in street-view images. In the second stage, we\npropose a geometry-aware coarse annotation generation module. This module\neffectively combines GIS data and street-view data based on the geometric\nrelationships, improving the accuracy of pseudo annotations. In the third\nstage, we combine the newly generated coarse annotations with the existing\nlabeled dataset to achieve fine-grained functional recognition of buildings\nacross multiple cities at a large scale. Extensive experiments demonstrate that\nour proposed framework exhibits superior performance in fine-grained functional\nrecognition of buildings. Within the same categorization system, it achieves\nimprovements of 7.6\\% and 4.8\\% compared to fully-supervised methods and\nstate-of-the-art semi-supervised methods, respectively. Additionally, our\nmethod also performs well in cross-city scenarios, i.e., extending the model\ntrained on OmniCity (New York) to new cities (i.e., Los Angeles and Boston)\nwith different building function categorization systems. This study offers a\nnew solution for large-scale multi-city applications with minimal annotation\nrequirements, facilitating more efficient data updates and resource allocation\nin urban management.\n","authors":["Weijia Li","Jinhua Yu","Dairong Chen","Yi Lin","Runmin Dong","Xiang Zhang","Conghui He","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2408.09460v3.pdf","comment":"This paper is currently under review"},{"id":"http://arxiv.org/abs/2304.10985v3","updated":"2024-09-09T03:19:10Z","published":"2023-04-21T14:35:47Z","title":"INK: Inheritable Natural Backdoor Attack Against Model Distillation","summary":" Deep learning models are vulnerable to backdoor attacks, where attackers\ninject malicious behavior through data poisoning and later exploit triggers to\nmanipulate deployed models. To improve the stealth and effectiveness of\nbackdoors, prior studies have introduced various imperceptible attack methods\ntargeting both defense mechanisms and manual inspection. However, all\npoisoning-based attacks still rely on privileged access to the training\ndataset. Consequently, model distillation using a trusted dataset has emerged\nas an effective defense against these attacks. To bridge this gap, we introduce\nINK, an inheritable natural backdoor attack that targets model distillation.\nThe key insight behind INK is the use of naturally occurring statistical\nfeatures in all datasets, allowing attackers to leverage them as backdoor\ntriggers without direct access to the training data. 
Specifically, INK employs\nimage variance as a backdoor trigger and enables both clean-image and\nclean-label attacks by manipulating the labels and image variance in an\nunauthenticated dataset. Once the backdoor is embedded, it transfers from the\nteacher model to the student model, even when defenders use a trusted dataset\nfor distillation. Theoretical analysis and experimental results demonstrate the\nrobustness of INK against transformation-based, search-based, and\ndistillation-based defenses. For instance, INK maintains an attack success rate\nof over 98\\% post-distillation, compared to an average success rate of 1.4\\%\nfor existing methods.\n","authors":["Xiaolei Liu","Ming Yi","Kangyi Ding","Bangzhou Xin","Yixiao Xu","Li Yan","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2304.10985v3.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2202.06198v3","updated":"2024-09-09T03:11:17Z","published":"2022-02-13T04:09:21Z","title":"Data standardization for robust lip sync","summary":" Lip sync is a fundamental audio-visual task. However, existing lip sync\nmethods fall short of being robust in the wild. One important cause could be\ndistracting factors on the visual input side, making extracting lip motion\ninformation difficult. To address these issues, this paper proposes a data\nstandardization pipeline to standardize the visual input for lip sync. Based on\nrecent advances in 3D face reconstruction, we first create a model that can\nconsistently disentangle lip motion information from the raw images. Then,\nstandardized images are synthesized with disentangled lip motion information,\nwith all other attributes related to distracting factors set to predefined\nvalues independent of the input, to reduce their effects. Using synthesized\nimages, existing lip sync methods improve their data efficiency and robustness,\nand they achieve competitive performance for the active speaker detection task.\n","authors":["Chun Wang"],"pdf_url":"https://arxiv.org/pdf/2202.06198v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14066v3","updated":"2024-09-09T03:06:21Z","published":"2024-07-19T06:50:24Z","title":"360VFI: A Dataset and Benchmark for Omnidirectional Video Frame\n Interpolation","summary":" Head-mounted 360{\\deg} displays and portable 360{\\deg} cameras have\nsignificantly progressed, providing viewers a realistic and immersive\nexperience. However, many omnidirectional videos have low frame rates that can\nlead to visual fatigue, and the prevailing plane frame interpolation\nmethodologies are unsuitable for omnidirectional video interpolation because\nthey are designed solely for traditional videos. This paper introduces the\nbenchmark dataset, 360VFI, for Omnidirectional Video Frame Interpolation. We\npresent a practical implementation that introduces a distortion prior from\nomnidirectional video into the network to modulate distortions. Specifically,\nwe propose a pyramid distortion-sensitive feature extractor that uses the\nunique characteristics of equirectangular projection (ERP) format as prior\ninformation. Moreover, we devise a decoder that uses an affine transformation\nto further facilitate the synthesis of intermediate frames. 360VFI is the first\ndataset and benchmark that explores the challenge of Omnidirectional Video\nFrame Interpolation. Through our benchmark analysis, we present four different\ndistortion condition scenes in the proposed 360VFI dataset to evaluate the\nchallenges triggered by distortion during interpolation. 
Besides, experimental\nresults demonstrate that Omnidirectional Video Interpolation can be effectively\nimproved by modeling for omnidirectional distortion.\n","authors":["Wenxuan Lu","Mengshun Hu","Yansheng Qiu","Liang Liao","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2407.14066v3.pdf","comment":"This is a preprint version"},{"id":"http://arxiv.org/abs/2312.08873v2","updated":"2024-09-09T02:48:03Z","published":"2023-12-12T00:53:56Z","title":"Diffusion Cocktail: Mixing Domain-Specific Diffusion Models for\n Diversified Image Generations","summary":" Diffusion models, capable of high-quality image generation, receive\nunparalleled popularity for their ease of extension. Active users have created\na massive collection of domain-specific diffusion models by fine-tuning base\nmodels on self-collected datasets. Recent work has focused on improving a\nsingle diffusion model by uncovering semantic and visual information encoded in\nvarious architecture components. However, those methods overlook the vastly\navailable set of fine-tuned diffusion models and, therefore, miss the\nopportunity to utilize their combined capacity for novel generation. In this\nwork, we propose Diffusion Cocktail (Ditail), a training-free method that\ntransfers style and content information between multiple diffusion models. This\nallows us to perform diversified generations using a set of diffusion models,\nresulting in novel images unobtainable by a single model. Ditail also offers\nfine-grained control of the generation process, which enables flexible\nmanipulations of styles and contents. With these properties, Ditail excels in\nnumerous applications, including style transfer guided by diffusion models,\nnovel-style image generation, and image manipulation via prompts or collage\ninputs.\n","authors":["Haoming Liu","Yuanhe Guo","Shengjie Wang","Hongyi Wen"],"pdf_url":"https://arxiv.org/pdf/2312.08873v2.pdf","comment":"Project Page: https://maps-research.github.io/Ditail/"},{"id":"http://arxiv.org/abs/2409.04214v2","updated":"2024-09-09T02:46:34Z","published":"2024-09-06T12:11:06Z","title":"Diagram Formalization Enhanced Multi-Modal Geometry Problem Solver","summary":" Mathematical reasoning remains an ongoing challenge for AI models, especially\nfor geometry problems that require both linguistic and visual signals. As the\nvision encoders of most MLLMs are trained on natural scenes, they often\nstruggle to understand geometric diagrams, performing no better in geometry\nproblem solving than LLMs that only process text. This limitation is amplified\nby the lack of effective methods for representing geometric relationships. To\naddress these issues, we introduce the Diagram Formalization Enhanced Geometry\nProblem Solver (DFE-GPS), a new framework that integrates visual features,\ngeometric formal language, and natural language representations. We propose a\nnovel synthetic data approach and create a large-scale geometric dataset,\nSynthGeo228K, annotated with both formal and natural language captions,\ndesigned to enhance the vision encoder for a better understanding of geometric\nstructures. 
Our framework improves MLLMs' ability to process geometric diagrams\nand extends their application to open-ended tasks on the formalgeo7k dataset.\n","authors":["Zeren Zhang","Jo-Ku Cheng","Jingyang Deng","Lu Tian","Jinwen Ma","Ziran Qin","Xiaokai Zhang","Na Zhu","Tuo Leng"],"pdf_url":"https://arxiv.org/pdf/2409.04214v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05280v1","updated":"2024-09-09T02:18:50Z","published":"2024-09-09T02:18:50Z","title":"RotCAtt-TransUNet++: Novel Deep Neural Network for Sophisticated Cardiac\n Segmentation","summary":" Cardiovascular disease is a major global health concern, contributing\nsignificantly to global mortality. Accurately segmenting cardiac medical\nimaging data is crucial for reducing fatality rates associated with these\nconditions. However, current state-of-the-art (SOTA) neural networks, including\nCNN-based and Transformer-based approaches, face challenges in capturing both\ninter-slice connections and intra-slice details, especially in datasets\nfeaturing intricate, long-range details along the z-axis like coronary\narteries. Existing methods also struggle with differentiating non-cardiac\ncomponents from the myocardium, resulting in segmentation inaccuracies and the\n\"spraying\" phenomenon. To address these issues, we introduce\nRotCAtt-TransUNet++, a novel architecture designed for robust segmentation of\nintricate cardiac structures. Our approach enhances global context modeling\nthrough multiscale feature aggregation and nested skip connections in the\nencoder. Transformer layers facilitate capturing intra-slice interactions,\nwhile a rotatory attention mechanism handles inter-slice connectivity. A\nchannel-wise cross-attention gate integrates multiscale information and decoder\nfeatures, effectively bridging semantic gaps. Experimental results across\nmultiple datasets demonstrate superior performance over current methods,\nachieving near-perfect annotation of coronary arteries and myocardium. Ablation\nstudies confirm that our rotatory attention mechanism significantly improves\nsegmentation accuracy by transforming embedded vectorized patches in semantic\ndimensional space.\n","authors":["Quoc-Bao Nguyen-Le","Tuan-Hy Le","Anh-Triet Do","Quoc-Huy Trinh"],"pdf_url":"https://arxiv.org/pdf/2409.05280v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2407.11633v3","updated":"2024-09-09T02:17:12Z","published":"2024-07-16T11:55:23Z","title":"Scaling Diffusion Transformers to 16 Billion Parameters","summary":" In this paper, we present DiT-MoE, a sparse version of the diffusion\nTransformer, that is scalable and competitive with dense networks while\nexhibiting highly optimized inference. The DiT-MoE includes two simple designs:\nshared expert routing and expert-level balance loss, thereby capturing common\nknowledge and reducing redundancy among the different routed experts. When\napplied to conditional image generation, a deep analysis of experts\nspecialization gains some interesting observations: (i) Expert selection shows\npreference with spatial position and denoising time step, while insensitive\nwith different class-conditional information; (ii) As the MoE layers go deeper,\nthe selection of experts gradually shifts from specific spacial position to\ndispersion and balance. (iii) Expert specialization tends to be more\nconcentrated at the early time step and then gradually uniform after half. 
We\nattribute it to the diffusion process that first models the low-frequency\nspatial information and then high-frequency complex information. Based on the\nabove guidance, a series of DiT-MoE experimentally achieves performance on par\nwith dense networks yet requires much less computational load during inference.\nMore encouragingly, we demonstrate the potential of DiT-MoE with synthesized\nimage data, scaling diffusion model at a 16.5B parameter that attains a new\nSoTA FID-50K score of 1.80 in 512$\\times$512 resolution settings. The project\npage: https://github.com/feizc/DiT-MoE.\n","authors":["Zhengcong Fei","Mingyuan Fan","Changqian Yu","Debang Li","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2407.11633v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05279v1","updated":"2024-09-09T02:14:23Z","published":"2024-09-09T02:14:23Z","title":"BrainDecoder: Style-Based Visual Decoding of EEG Signals","summary":" Decoding neural representations of visual stimuli from electroencephalography\n(EEG) offers valuable insights into brain activity and cognition. Recent\nadvancements in deep learning have significantly enhanced the field of visual\ndecoding of EEG, primarily focusing on reconstructing the semantic content of\nvisual stimuli. In this paper, we present a novel visual decoding pipeline\nthat, in addition to recovering the content, emphasizes the reconstruction of\nthe style, such as color and texture, of images viewed by the subject. Unlike\nprevious methods, this ``style-based'' approach learns in the CLIP spaces of\nimage and text separately, facilitating a more nuanced extraction of\ninformation from EEG signals. We also use captions for text alignment simpler\nthan previously employed, which we find work better. Both quantitative and\nqualitative evaluations show that our method better preserves the style of\nvisual stimuli and extracts more fine-grained semantic information from neural\nsignals. Notably, it achieves significant improvements in quantitative results\nand sets a new state-of-the-art on the popular Brain2Image dataset.\n","authors":["Minsuk Choi","Hiroshi Ishikawa"],"pdf_url":"https://arxiv.org/pdf/2409.05279v1.pdf","comment":"5 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.05277v1","updated":"2024-09-09T02:09:49Z","published":"2024-09-09T02:09:49Z","title":"Disentangled Representations for Short-Term and Long-Term Person\n Re-Identification","summary":" We address the problem of person re-identification (reID), that is,\nretrieving person images from a large dataset, given a query image of the\nperson of interest. A key challenge is to learn person representations robust\nto intra-class variations, as different persons could have the same attribute,\nand persons' appearances look different, e.g., with viewpoint changes. Recent\nreID methods focus on learning person features discriminative only for a\nparticular factor of variations (e.g., human pose), which also requires\ncorresponding supervisory signals (e.g., pose annotations). To tackle this\nproblem, we propose to factorize person images into identity-related and\nunrelated features. Identity-related features contain information useful for\nspecifying a particular person (e.g., clothing), while identity-unrelated ones\nhold other factors (e.g., human pose). To this end, we propose a new generative\nadversarial network, dubbed identity shuffle GAN (IS-GAN). 
It disentangles\nidentity-related and unrelated features from person images through an\nidentity-shuffling technique that exploits identification labels alone without\nany auxiliary supervisory signals. We restrict the distribution of\nidentity-unrelated features or encourage the identity-related and unrelated\nfeatures to be uncorrelated, facilitating the disentanglement process.\nExperimental results validate the effectiveness of IS-GAN, showing\nstate-of-the-art performance on standard reID benchmarks, including\nMarket-1501, CUHK03, and DukeMTMC-reID. We further demonstrate the advantages\nof disentangling person representations on a long-term reID task, setting a new\nstate of the art on a Celeb-reID dataset.\n","authors":["Chanho Eom","Wonkyung Lee","Geon Lee","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2409.05277v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:1910.12003"},{"id":"http://arxiv.org/abs/2409.05274v1","updated":"2024-09-09T01:50:01Z","published":"2024-09-09T01:50:01Z","title":"Rethinking the Atmospheric Scattering-driven Attention via Channel and\n Gamma Correction Priors for Low-Light Image Enhancement","summary":" Low-light image enhancement remains a critical challenge in computer vision,\nas does the lightweight design for edge devices with the computational burden\nfor deep learning models. In this article, we introduce an extended version of\nChannel-Prior and Gamma-Estimation Network (CPGA-Net), termed CPGA-Net+, which\nincorporates an attention mechanism driven by a reformulated Atmospheric\nScattering Model and effectively addresses both global and local image\nprocessing through Plug-in Attention with gamma correction. These innovations\nenable CPGA-Net+ to achieve superior performance on image enhancement tasks,\nsurpassing lightweight state-of-the-art methods with high efficiency. Our\nresults demonstrate the model's effectiveness and show the potential\napplications in resource-constrained environments.\n","authors":["Shyang-En Weng","Cheng-Yen Hsiao","Shaou-Gang Miaou"],"pdf_url":"https://arxiv.org/pdf/2409.05274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05260v1","updated":"2024-09-09T01:11:47Z","published":"2024-09-09T01:11:47Z","title":"Scalable Frame Sampling for Video Classification: A Semi-Optimal Policy\n Approach with Reduced Search Space","summary":" Given a video with $T$ frames, frame sampling is a task to select $N \\ll T$\nframes, so as to maximize the performance of a fixed video classifier. Not just\nbrute-force search, but most existing methods suffer from its vast search space\nof $\\binom{T}{N}$, especially when $N$ gets large. To address this challenge,\nwe introduce a novel perspective of reducing the search space from $O(T^N)$ to\n$O(T)$. Instead of exploring the entire $O(T^N)$ space, our proposed\nsemi-optimal policy selects the top $N$ frames based on the independently\nestimated value of each frame using per-frame confidence, significantly\nreducing the computational complexity. We verify that our semi-optimal policy\ncan efficiently approximate the optimal policy, particularly under practical\nsettings. 
Additionally, through extensive experiments on various datasets and\nmodel architectures, we demonstrate that learning our semi-optimal policy\nensures stable and high performance regardless of the size of $N$ and $T$.\n","authors":["Junho Lee","Jeongwoo Shin","Seung Woo Ko","Seongsu Ha","Joonseok Lee"],"pdf_url":"https://arxiv.org/pdf/2409.05260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05258v1","updated":"2024-09-09T00:47:30Z","published":"2024-09-09T00:47:30Z","title":"Towards Automated Machine Learning Research","summary":" This paper explores a top-down approach to automating incremental advances in\nmachine learning research through component-level innovation, facilitated by\nLarge Language Models (LLMs). Our framework systematically generates novel\ncomponents, validates their feasibility, and evaluates their performance\nagainst existing baselines. A key distinction of this approach lies in how\nthese novel components are generated. Unlike traditional AutoML and NAS\nmethods, which often rely on a bottom-up combinatorial search over predefined,\nhardcoded base components, our method leverages the cross-domain knowledge\nembedded in LLMs to propose new components that may not be confined to any\nhard-coded predefined set. By incorporating a reward model to prioritize\npromising hypotheses, we aim to improve the efficiency of the hypothesis\ngeneration and evaluation process. We hope this approach offers a new avenue\nfor exploration and contributes to the ongoing dialogue in the field.\n","authors":["Shervin Ardeshir"],"pdf_url":"https://arxiv.org/pdf/2409.05258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05255v1","updated":"2024-09-09T00:18:48Z","published":"2024-09-09T00:18:48Z","title":"Label-free evaluation of lung and heart transplant biopsies using\n virtual staining","summary":" Organ transplantation serves as the primary therapeutic strategy for\nend-stage organ failures. However, allograft rejection is a common complication\nof organ transplantation. Histological assessment is essential for the timely\ndetection and diagnosis of transplant rejection and remains the gold standard.\nNevertheless, the traditional histochemical staining process is time-consuming,\ncostly, and labor-intensive. Here, we present a panel of virtual staining\nneural networks for lung and heart transplant biopsies, which digitally convert\nautofluorescence microscopic images of label-free tissue sections into their\nbrightfield histologically stained counterparts, bypassing the traditional\nhistochemical staining process. Specifically, we virtually generated\nHematoxylin and Eosin (H&E), Masson's Trichrome (MT), and Elastic Verhoeff-Van\nGieson (EVG) stains for label-free transplant lung tissue, along with H&E and\nMT stains for label-free transplant heart tissue. Subsequent blind evaluations\nconducted by three board-certified pathologists have confirmed that the virtual\nstaining networks consistently produce high-quality histology images with high\ncolor uniformity, closely resembling their well-stained histochemical\ncounterparts across various tissue features. 
The use of virtually stained\nimages for the evaluation of transplant biopsies achieved comparable diagnostic\noutcomes to those obtained via traditional histochemical staining, with a\nconcordance rate of 82.4% for lung samples and 91.7% for heart samples.\nMoreover, virtual staining models create multiple stains from the same\nautofluorescence input, eliminating structural mismatches observed between\nadjacent sections stained in the traditional workflow, while also saving\ntissue, expert time, and staining costs.\n","authors":["Yuzhu Li","Nir Pillar","Tairan Liu","Guangdong Ma","Yuxuan Qi","Kevin de Haan","Yijie Zhang","Xilin Yang","Adrian J. Correa","Guangqian Xiao","Kuang-Yu Jen","Kenneth A. Iczkowski","Yulun Wu","William Dean Wallace","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2409.05255v1.pdf","comment":"21 Pages, 5 Figures"},{"id":"http://arxiv.org/abs/2407.05266v2","updated":"2024-09-09T00:08:36Z","published":"2024-07-07T05:39:25Z","title":"CLAMP-ViT: Contrastive Data-Free Learning for Adaptive Post-Training\n Quantization of ViTs","summary":" We present CLAMP-ViT, a data-free post-training quantization method for\nvision transformers (ViTs). We identify the limitations of recent techniques,\nnotably their inability to leverage meaningful inter-patch relationships,\nleading to the generation of simplistic and semantically vague data, impacting\nquantization accuracy. CLAMP-ViT employs a two-stage approach, cyclically\nadapting between data generation and model quantization. Specifically, we\nincorporate a patch-level contrastive learning scheme to generate richer,\nsemantically meaningful data. Furthermore, we leverage contrastive learning in\nlayer-wise evolutionary search for fixed- and mixed-precision quantization to\nidentify optimal quantization parameters while mitigating the effects of a\nnon-smooth loss landscape. Extensive evaluations across various vision tasks\ndemonstrate the superiority of CLAMP-ViT, with performance improvements of up\nto 3% in top-1 accuracy for classification, 0.6 mAP for object detection, and\n1.5 mIoU for segmentation at similar or better compression ratio over existing\nalternatives. Code is available at\nhttps://github.com/georgia-tech-synergy-lab/CLAMP-ViT.git\n","authors":["Akshat Ramachandran","Souvik Kundu","Tushar Krishna"],"pdf_url":"https://arxiv.org/pdf/2407.05266v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2409.05250v1","updated":"2024-09-09T00:01:48Z","published":"2024-09-09T00:01:48Z","title":"MRStyle: A Unified Framework for Color Style Transfer with\n Multi-Modality Reference","summary":" In this paper, we introduce MRStyle, a comprehensive framework that enables\ncolor style transfer using multi-modality reference, including image and text.\nTo achieve a unified style feature space for both modalities, we first develop\na neural network called IRStyle, which generates stylized 3D lookup tables for\nimage reference. This is accomplished by integrating an interaction\ndual-mapping network with a combined supervised learning pipeline, resulting in\nthree key benefits: elimination of visual artifacts, efficient handling of\nhigh-resolution images with low memory usage, and maintenance of style\nconsistency even in situations with significant color style variations. For\ntext reference, we align the text feature of stable diffusion priors with the\nstyle feature of our IRStyle to perform text-guided color style transfer\n(TRStyle). 
Our TRStyle method is highly efficient in both training and\ninference, producing notable open-set text-guided transfer results. Extensive\nexperiments in both image and text settings demonstrate that our proposed\nmethod outperforms the state-of-the-art in both qualitative and quantitative\nevaluations.\n","authors":["Jiancheng Huang","Yu Gao","Zequn Jie","Yujie Zhong","Xintong Han","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2409.05250v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.05806v1","updated":"2024-09-09T17:11:51Z","published":"2024-09-09T17:11:51Z","title":"Benchmarking Chinese Knowledge Rectification in Large Language Models","summary":" While Large Language Models (LLMs) exhibit remarkable generative\ncapabilities, they are not without flaws, particularly in the form of\nhallucinations. This issue is even more pronounced when LLMs are applied to\nspecific languages and domains. For example, LLMs may generate nonsense\ninformation when handling Chinese ancient poetry, proverbs, or idioms, owing to\nthe lack of specific knowledge. To this end, this paper introduces a benchmark\nfor rectifying Chinese knowledge in LLMs via knowledge editing. Specifically,\nwe introduce a new Chinese dataset, CKnowEdit, by collecting seven types of\nknowledge from various sources, including classical texts, idioms, and content\nfrom Baidu Tieba Ruozhiba, thereby accounting for the unique polyphony,\nantithesis, and logical constructs inherent in the Chinese language. Through\nthe analysis of this dataset, we uncover the challenges faced by current LLMs\nin mastering Chinese. Furthermore, our evaluation of state-of-the-art knowledge\nediting techniques on this dataset unveils the substantial scope for advancement\nin the rectification of Chinese knowledge. Code and dataset are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Tianhe Lu","Jizhan Fang","Yunzhi Yao","Xin Xu","Ningyu Zhang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.05806v1.pdf","comment":"Ongoing work; code and dataset are available at\n https://github.com/zjunlp/EasyEdit"},{"id":"http://arxiv.org/abs/2409.05692v1","updated":"2024-09-09T15:05:27Z","published":"2024-09-09T15:05:27Z","title":"Extracting the U.S. building types from OpenStreetMap data","summary":" Building type information is crucial for population estimation, traffic\nplanning, urban planning, and emergency response applications. Although\nessential, such data is often not readily available. To alleviate this problem,\nthis work creates a comprehensive dataset by providing\nresidential/non-residential building classification covering the entire United\nStates. We propose and utilize an unsupervised machine learning method to\nclassify building types based on building footprints and available\nOpenStreetMap information. The classification result is validated using\nauthoritative ground truth data for select counties in the U.S. The validation\nshows a high precision for non-residential building classification and a high\nrecall for residential buildings. We identified various approaches to improving\nthe quality of the classification, such as removing sheds and garages from the\ndataset. Furthermore, analyzing the misclassifications revealed that they are\nmainly due to missing and scarce metadata in OSM. A major result of this work\nis the resulting dataset of classifying 67,705,475 buildings. 
We hope that this\ndata is of value to the scientific community, including urban and\ntransportation planners.\n","authors":["Henrique F. de Arruda","Sandro M. Reia","Shiyang Ruan","Kuldip S. Atwal","Hamdi Kavak","Taylor Anderson","Dieter Pfoser"],"pdf_url":"https://arxiv.org/pdf/2409.05692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05677v1","updated":"2024-09-09T14:44:19Z","published":"2024-09-09T14:44:19Z","title":"RegNLP in Action: Facilitating Compliance Through Automated Information\n Retrieval and Answer Generation","summary":" Regulatory documents, issued by governmental regulatory bodies, establish\nrules, guidelines, and standards that organizations must adhere to for legal\ncompliance. These documents, characterized by their length, complexity and\nfrequent updates, are challenging to interpret, requiring significant\nallocation of time and expertise on the part of organizations to ensure ongoing\ncompliance. Regulatory Natural Language Processing (RegNLP) is a\nmultidisciplinary subfield aimed at simplifying access to and interpretation of\nregulatory rules and obligations. We define an Automated Question-Passage\nGeneration task for RegNLP, create the ObliQA dataset containing 27,869\nquestions derived from the Abu Dhabi Global Markets (ADGM) financial regulation\ndocument collection, design a baseline Regulatory Information Retrieval and\nAnswer Generation system, and evaluate it with RePASs, a novel evaluation\nmetric that tests whether generated answers accurately capture all relevant\nobligations and avoid contradictions.\n","authors":["Tuba Gokhan","Kexin Wang","Iryna Gurevych","Ted Briscoe"],"pdf_url":"https://arxiv.org/pdf/2409.05677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05633v1","updated":"2024-09-09T14:04:17Z","published":"2024-09-09T14:04:17Z","title":"Enhancing Graph Contrastive Learning with Reliable and Informative\n Augmentation for Recommendation","summary":" Graph neural network (GNN) has been a powerful approach in collaborative\nfiltering (CF) due to its ability to model high-order user-item relationships.\nRecently, to alleviate the data sparsity and enhance representation learning,\nmany efforts have been conducted to integrate contrastive learning (CL) with\nGNNs. Despite the promising improvements, the contrastive view generation based\non structure and representation perturbations in existing methods potentially\ndisrupts the collaborative information in contrastive views, resulting in\nlimited effectiveness of positive alignment. To overcome this issue, we propose\nCoGCL, a novel framework that aims to enhance graph contrastive learning by\nconstructing contrastive views with stronger collaborative information via\ndiscrete codes. The core idea is to map users and items into discrete codes\nrich in collaborative information for reliable and informative contrastive view\ngeneration. To this end, we initially introduce a multi-level vector quantizer\nin an end-to-end manner to quantize user and item representations into discrete\ncodes. Based on these discrete codes, we enhance the collaborative information\nof contrastive views by considering neighborhood structure and semantic\nrelevance respectively. 
For neighborhood structure, we propose virtual neighbor\naugmentation by treating discrete codes as virtual neighbors, which expands an\nobserved user-item interaction into multiple edges involving discrete codes.\nRegarding semantic relevance, we identify similar users/items based on shared\ndiscrete codes and interaction targets to generate the semantically relevant\nview. Through these strategies, we construct contrastive views with stronger\ncollaborative information and develop a triple-view graph contrastive learning\napproach. Extensive experiments on four public datasets demonstrate the\neffectiveness of our proposed approach.\n","authors":["Bowen Zheng","Junjie Zhang","Hongyu Lu","Yu Chen","Ming Chen","Wayne Xin Zhao","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2409.05633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05570v1","updated":"2024-09-09T12:53:06Z","published":"2024-09-09T12:53:06Z","title":"Rs4rs: Semantically Find Recent Publications from Top Recommendation\n System-Related Venues","summary":" Rs4rs is a web application designed to perform semantic search on recent\npapers from top conferences and journals related to Recommender Systems.\nCurrent scholarly search engine tools like Google Scholar, Semantic Scholar,\nand ResearchGate often yield broad results that fail to target the most\nrelevant high-quality publications. Moreover, manually visiting individual\nconference and journal websites is a time-consuming process that primarily\nsupports only syntactic searches. Rs4rs addresses these issues by providing a\nuser-friendly platform where researchers can input their topic of interest and\nreceive a list of recent, relevant papers from top Recommender Systems venues.\nUtilizing semantic search techniques, Rs4rs ensures that the search results are\nnot only precise and relevant but also comprehensive, capturing papers\nregardless of variations in wording. This tool significantly enhances research\nefficiency and accuracy, thereby benefitting the research community and public\nby facilitating access to high-quality, pertinent academic resources in the\nfield of Recommender Systems. Rs4rs is available at https://rs4rs.com.\n","authors":["Tri Kurniawan Wijaya","Edoardo D'Amico","Gabor Fodor","Manuel V. Loureiro"],"pdf_url":"https://arxiv.org/pdf/2409.05570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05546v1","updated":"2024-09-09T12:11:53Z","published":"2024-09-09T12:11:53Z","title":"End-to-End Learnable Item Tokenization for Generative Recommendation","summary":" Recently, generative recommendation has emerged as a promising new paradigm\nthat directly generates item identifiers for recommendation. However, a key\nchallenge lies in how to effectively construct item identifiers that are\nsuitable for recommender systems. Existing methods typically decouple item\ntokenization from subsequent generative recommendation training, likely\nresulting in suboptimal performance. To address this limitation, we propose\nETEGRec, a novel End-To-End Generative Recommender by seamlessly integrating\nitem tokenization and generative recommendation. Our framework is developed\nbased on the dual encoder-decoder architecture, which consists of an item\ntokenizer and a generative recommender. In order to achieve mutual enhancement\nbetween the two components, we propose a recommendation-oriented alignment\napproach by devising two specific optimization objectives: sequence-item\nalignment and preference-semantic alignment. 
These two alignment objectives can\neffectively couple the learning of item tokenizer and generative recommender,\nthereby fostering the mutual enhancement between the two components. Finally,\nwe further devise an alternating optimization method, to facilitate stable and\neffective end-to-end learning of the entire framework. Extensive experiments\ndemonstrate the effectiveness of our proposed framework compared to a series of\ntraditional sequential recommendation models and generative recommendation\nbaselines.\n","authors":["Enze Liu","Bowen Zheng","Cheng Ling","Lantao Hu","Han Li","Wayne Xin Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.05546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05526v1","updated":"2024-09-09T11:35:35Z","published":"2024-09-09T11:35:35Z","title":"RBoard: A Unified Platform for Reproducible and Reusable Recommender\n System Benchmarks","summary":" Recommender systems research lacks standardized benchmarks for\nreproducibility and algorithm comparisons. We introduce RBoard, a novel\nframework addressing these challenges by providing a comprehensive platform for\nbenchmarking diverse recommendation tasks, including CTR prediction, Top-N\nrecommendation, and others. RBoard's primary objective is to enable fully\nreproducible and reusable experiments across these scenarios. The framework\nevaluates algorithms across multiple datasets within each task, aggregating\nresults for a holistic performance assessment. It implements standardized\nevaluation protocols, ensuring consistency and comparability. To facilitate\nreproducibility, all user-provided code can be easily downloaded and executed,\nallowing researchers to reliably replicate studies and build upon previous\nwork. By offering a unified platform for rigorous, reproducible evaluation\nacross various recommendation scenarios, RBoard aims to accelerate progress in\nthe field and establish a new standard for recommender systems benchmarking in\nboth academia and industry. The platform is available at https://rboard.org and\nthe demo video can be found at https://bit.ly/rboard-demo.\n","authors":["Tri Kurniawan Wijaya","Edoardo D'Amico","Gabor Fodor","Manuel V. Loureiro"],"pdf_url":"https://arxiv.org/pdf/2409.05526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05512v1","updated":"2024-09-09T11:10:45Z","published":"2024-09-09T11:10:45Z","title":"DatAasee -- A Metadata-Lake as Metadata Catalog for a Virtual Data-Lake","summary":" Metadata management for distributed data sources is a long-standing but\never-growing problem. To counter this challenge in a research-data and\nlibrary-oriented setting, this work constructs a data architecture, derived\nfrom the data-lake: the metadata-lake. A proof-of-concept implementation of\nthis proposed metadata system is presented and evaluated as well.\n","authors":["Christian Himpe"],"pdf_url":"https://arxiv.org/pdf/2409.05512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03753v2","updated":"2024-09-09T10:04:00Z","published":"2024-09-05T17:59:15Z","title":"WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild","summary":" The increasing availability of real-world conversation data offers exciting\nopportunities for researchers to study user-chatbot interactions. However, the\nsheer volume of this data makes manually examining individual conversations\nimpractical. 
To overcome this challenge, we introduce WildVis, an interactive\ntool that enables fast, versatile, and large-scale conversation analysis.\nWildVis provides search and visualization capabilities in the text and\nembedding spaces based on a list of criteria. To manage million-scale datasets,\nwe implemented optimizations including search index construction, embedding\nprecomputation and compression, and caching to ensure responsive user\ninteractions within seconds. We demonstrate WildVis' utility through three case\nstudies: facilitating chatbot misuse research, visualizing and comparing topic\ndistributions across datasets, and characterizing user-specific conversation\npatterns. WildVis is open-source and designed to be extendable, supporting\nadditional datasets and customized search and visualization functionalities.\n","authors":["Yuntian Deng","Wenting Zhao","Jack Hessel","Xiang Ren","Claire Cardie","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2409.03753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05462v1","updated":"2024-09-09T09:42:46Z","published":"2024-09-09T09:42:46Z","title":"Federated Transfer Learning Based Cooperative Wideband Spectrum Sensing\n with Model Pruning","summary":" For ultra-wideband and high-rate wireless communication systems, wideband\nspectrum sensing (WSS) is critical, since it empowers secondary users (SUs) to\ncapture the spectrum holes for opportunistic transmission. However, WSS\nencounters challenges such as excessive costs of hardware and computation due\nto the high sampling rate, as well as robustness issues arising from scenario\nmismatch. In this paper, a WSS neural network (WSSNet) is proposed by\nexploiting multicoset preprocessing to enable the sub-Nyquist sampling, with\nthe two dimensional convolution design specifically tailored to work with the\npreprocessed samples. A federated transfer learning (FTL) based framework\nmobilizing multiple SUs is further developed to achieve a robust model\nadaptable to various scenarios, which is paved by the selective weight pruning\nfor the fast model adaptation and inference. Simulation results demonstrate\nthat the proposed FTL-WSSNet achieves the fairly good performance in different\ntarget scenarios even without local adaptation samples.\n","authors":["Jibin Jia","Peihao Dong","Fuhui Zhou","Qihui Wu"],"pdf_url":"https://arxiv.org/pdf/2409.05462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05461v1","updated":"2024-09-09T09:42:31Z","published":"2024-09-09T09:42:31Z","title":"Recommender Systems Algorithm Selection for Ranking Prediction on\n Implicit Feedback Datasets","summary":" The recommender systems algorithm selection problem for ranking prediction on\nimplicit feedback datasets is under-explored. Traditional approaches in\nrecommender systems algorithm selection focus predominantly on rating\nprediction on explicit feedback datasets, leaving a research gap for ranking\nprediction on implicit feedback datasets. Algorithm selection is a critical\nchallenge for nearly every practitioner in recommender systems. In this work,\nwe take the first steps toward addressing this research gap. We evaluate the\nNDCG@10 of 24 recommender systems algorithms, each with two hyperparameter\nconfigurations, on 72 recommender systems datasets. We train four optimized\nmachine-learning meta-models and one automated machine-learning meta-model with\nthree different settings on the resulting meta-dataset. 
Our results show that\nthe predictions of all tested meta-models exhibit a median Spearman correlation\nranging from 0.857 to 0.918 with the ground truth. We show that the median\nSpearman correlation between meta-model predictions and the ground truth\nincreases by an average of 0.124 when the meta-model is optimized to predict\nthe ranking of algorithms instead of their performance. Furthermore, in terms\nof predicting the best algorithm for an unknown dataset, we demonstrate that\nthe best optimized traditional meta-model, e.g., XGBoost, achieves a recall of\n48.6%, outperforming the best tested automated machine learning meta-model,\ne.g., AutoGluon, which achieves a recall of 47.2%.\n","authors":["Lukas Wegmeth","Tobias Vente","Joeran Beel"],"pdf_url":"https://arxiv.org/pdf/2409.05461v1.pdf","comment":"Accepted for presentation at the 18th ACM Conference on Recommender\n Systems in the Late-Breaking Results Track"},{"id":"http://arxiv.org/abs/2409.05417v1","updated":"2024-09-09T08:19:43Z","published":"2024-09-09T08:19:43Z","title":"Replicability Measures for Longitudinal Information Retrieval Evaluation","summary":" Information Retrieval (IR) systems are exposed to constant changes in most\ncomponents. Documents are created, updated, or deleted, the information needs\nare changing, and even relevance might not be static. While it is generally\nexpected that the IR systems retain a consistent utility for the users, test\ncollection evaluations rely on a fixed experimental setup. Based on the\nLongEval shared task and test collection, this work explores how the\neffectiveness measured in evolving experiments can be assessed. Specifically,\nthe persistency of effectiveness is investigated as a replicability task. It is\nobserved how the effectiveness progressively deteriorates over time compared to\nthe initial measurement. Employing adapted replicability measures provides\nfurther insight into the persistence of effectiveness. The ranking of systems\nvaries across retrieval measures and time. In conclusion, it was found that the\nmost effective systems are not necessarily the ones with the most persistent\nperformance.\n","authors":["Jüri Keller","Timo Breuer","Philipp Schaer"],"pdf_url":"https://arxiv.org/pdf/2409.05417v1.pdf","comment":"Experimental IR Meets Multilinguality, Multimodality, and Interaction\n - 15th International Conference of the CLEF Association, CLEF 2024, Grenoble,\n France, September 9-12, 2024, Proceedings. arXiv admin note: text overlap\n with arXiv:2308.10549"},{"id":"http://arxiv.org/abs/2409.05405v1","updated":"2024-09-09T08:06:50Z","published":"2024-09-09T08:06:50Z","title":"A Survey of Multimodal Composite Editing and Retrieval","summary":" In the real world, where information is abundant and diverse across different\nmodalities, understanding and utilizing various data types to improve retrieval\nsystems is a key focus of research. Multimodal composite retrieval integrates\ndiverse modalities such as text, image and audio, etc. to provide more\naccurate, personalized, and contextually relevant results. To facilitate a\ndeeper understanding of this promising direction, this survey explores\nmultimodal composite editing and retrieval in depth, covering image-text\ncomposite editing, image-text composite retrieval, and other multimodal\ncomposite retrieval. In this survey, we systematically organize the application\nscenarios, methods, benchmarks, experiments, and future directions. 
Multimodal\nlearning is a hot topic in the large model era, and has also witnessed some\nsurveys in multimodal learning and vision-language models with transformers\npublished in the PAMI journal. To the best of our knowledge, this survey is the\nfirst comprehensive review of the literature on multimodal composite retrieval,\nwhich is a timely complement of multimodal fusion to existing reviews. To help\nreaders quickly track this field, we build the project page for this survey,\nwhich can be found at\nhttps://github.com/fuxianghuang1/Multimodal-Composite-Editing-and-Retrieval.\n","authors":["Suyan Li","Fuxiang Huang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05405v1.pdf","comment":"22 pages, 3 figures, and 11 tables"},{"id":"http://arxiv.org/abs/2409.05401v1","updated":"2024-09-09T07:57:43Z","published":"2024-09-09T07:57:43Z","title":"NLLB-E5: A Scalable Multilingual Retrieval Model","summary":" Despite significant progress in multilingual information retrieval, the lack\nof models capable of effectively supporting multiple languages, particularly\nlow-resource like Indic languages, remains a critical challenge. This paper\npresents NLLB-E5: A Scalable Multilingual Retrieval Model. NLLB-E5 leverages\nthe in-built multilingual capabilities in the NLLB encoder for translation\ntasks. It proposes a distillation approach from multilingual retriever E5 to\nprovide a zero-shot retrieval approach handling multiple languages, including\nall major Indic languages, without requiring multilingual training data. We\nevaluate the model on a comprehensive suite of existing benchmarks, including\nHindi-BEIR, highlighting its robust performance across diverse languages and\ntasks. Our findings uncover task and domain-specific challenges, providing\nvaluable insights into the retrieval performance, especially for low-resource\nlanguages. NLLB-E5 addresses the urgent need for an inclusive, scalable, and\nlanguage-agnostic text retrieval model, advancing the field of multilingual\ninformation access and promoting digital inclusivity for millions of users\nglobally.\n","authors":["Arkadeep Acharya","Rudra Murthy","Vishwajeet Kumar","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2409.05401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03893v2","updated":"2024-09-09T07:47:58Z","published":"2024-09-05T19:59:42Z","title":"Understanding Fairness in Recommender Systems: A Healthcare Perspective","summary":" Fairness in AI-driven decision-making systems has become a critical concern,\nespecially when these systems directly affect human lives. This paper explores\nthe public's comprehension of fairness in healthcare recommendations. We\nconducted a survey where participants selected from four fairness metrics --\nDemographic Parity, Equal Accuracy, Equalized Odds, and Positive Predictive\nValue -- across different healthcare scenarios to assess their understanding of\nthese concepts. Our findings reveal that fairness is a complex and often\nmisunderstood concept, with a generally low level of public understanding\nregarding fairness metrics in recommender systems. This study highlights the\nneed for enhanced information and education on algorithmic fairness to support\ninformed decision-making in using these systems. 
Furthermore, the results\nsuggest that a one-size-fits-all approach to fairness may be insufficient,\npointing to the importance of context-sensitive designs in developing equitable\nAI systems.\n","authors":["Veronica Kecki","Alan Said"],"pdf_url":"https://arxiv.org/pdf/2409.03893v2.pdf","comment":"Accepted to the 18th ACM Conference on Recommender Systems"},{"id":"http://arxiv.org/abs/2407.14482v2","updated":"2024-09-09T06:19:07Z","published":"2024-07-19T17:35:47Z","title":"ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG\n Capabilities","summary":" In this work, we introduce ChatQA 2, a Llama 3.0-based model with a 128K\ncontext window, designed to bridge the gap between open-source LLMs and leading\nproprietary models (e.g., GPT-4-Turbo) in long-context understanding and\nretrieval-augmented generation (RAG) capabilities. These two capabilities are\nessential for LLMs to process large volumes of information that cannot fit into\na single prompt and are complementary to each other, depending on the\ndownstream tasks and computational budgets. We present a detailed continued\ntraining recipe to extend the context window of Llama3-70B-base from 8K to 128K\ntokens, along with a three-stage instruction tuning process to enhance the\nmodel's instruction-following, RAG performance, and long-context understanding\ncapabilities. Our results demonstrate that the Llama3-ChatQA-2-70B model\noutperforms most existing state-of-the-art models, including\nGPT-4-Turbo-2024-04-09, Qwen2-72B-Instruct, and Llama3.1-70B-Instruct, on\nultra-long tasks beyond 100K tokens, as well as on the RAG benchmark using only\na 4K context window, showing the strong long context capability across varying\nsequence lengths. We further provide extensive comparisons between direct\nlong-context and RAG solutions using the same state-of-the-art long-context\nLLMs. Interestingly, we find that the performance of strong long-context LLMs\nusing RAG improves when retrieving a larger number of chunks. With a large set\nof top-k chunks, RAG consistently outperforms direct long-context solution\nusing the same state-of-the-art long-context models (e.g., Llama3-ChatQA-2-70B\nand Qwen2-72B-Instruct) on both 32K benchmarks and real-world 128K tasks. To\nadvance research in this field, we open-sourced the model weights, training\ndata, and the evaluation setup for the community:\nhttps://chatqa2-project.github.io/\n","authors":["Peng Xu","Wei Ping","Xianchao Wu","Chejian Xu","Zihan Liu","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2407.14482v2.pdf","comment":"v2: major update with significantly improved results"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.05866v1","updated":"2024-09-09T17:59:54Z","published":"2024-09-09T17:59:54Z","title":"A Framework for Evaluating PM2.5 Forecasts from the Perspective of\n Individual Decision Making","summary":" Wildfire frequency is increasing as the climate changes, and the resulting\nair pollution poses health risks. Just as people routinely use weather\nforecasts to plan their activities around precipitation, reliable air quality\nforecasts could help individuals reduce their exposure to air pollution. In the\npresent work, we evaluate several existing forecasts of fine particulate matter\n(PM2.5) within the continental United States in the context of individual\ndecision-making. 
Our comparison suggests there is meaningful room for\nimprovement in air pollution forecasting, which might be realized by\nincorporating more data sources and using machine learning tools. To facilitate\nfuture machine learning development and benchmarking, we set up a framework to\nevaluate and compare air pollution forecasts for individual decision making. We\nintroduce a new loss to capture decisions about when to use mitigation\nmeasures. We highlight the importance of visualizations when comparing\nforecasts. Finally, we provide code to download and compare archived forecast\npredictions.\n","authors":["Renato Berlinghieri","David R. Burt","Paolo Giani","Arlene M. Fiore","Tamara Broderick"],"pdf_url":"https://arxiv.org/pdf/2409.05866v1.pdf","comment":"22 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.05865v1","updated":"2024-09-09T17:59:50Z","published":"2024-09-09T17:59:50Z","title":"Robot Utility Models: General Policies for Zero-Shot Deployment in New\n Environments","summary":" Robot models, particularly those trained with large amounts of data, have\nrecently shown a plethora of real-world manipulation and navigation\ncapabilities. Several independent efforts have shown that given sufficient\ntraining data in an environment, robot policies can generalize to demonstrated\nvariations in that environment. However, needing to finetune robot models to\nevery new environment stands in stark contrast to models in language or vision\nthat can be deployed zero-shot for open-world problems. In this work, we\npresent Robot Utility Models (RUMs), a framework for training and deploying\nzero-shot robot policies that can directly generalize to new environments\nwithout any finetuning. To create RUMs efficiently, we develop new tools to\nquickly collect data for mobile manipulation tasks, integrate such data into a\npolicy with multi-modal imitation learning, and deploy policies on-device on\nHello Robot Stretch, a cheap commodity robot, with an external mLLM verifier\nfor retrying. We train five such utility models for opening cabinet doors,\nopening drawers, picking up napkins, picking up paper bags, and reorienting\nfallen objects. Our system, on average, achieves 90% success rate in unseen,\nnovel environments interacting with unseen objects. Moreover, the utility\nmodels can also succeed in different robot and camera set-ups with no further\ndata, training, or fine-tuning. Primary among our lessons are the importance of\ntraining data over training algorithm and policy class, guidance about data\nscaling, necessity for diverse yet high-quality demonstrations, and a recipe\nfor robot introspection and retrying to improve performance on individual\nenvironments. Our code, data, models, hardware designs, as well as our\nexperiment and deployment videos are open sourced and can be found on our\nproject website: https://robotutilitymodels.com\n","authors":["Haritheja Etukuru","Norihito Naka","Zijin Hu","Seungjae Lee","Julian Mehu","Aaron Edsinger","Chris Paxton","Soumith Chintala","Lerrel Pinto","Nur Muhammad Mahi Shafiullah"],"pdf_url":"https://arxiv.org/pdf/2409.05865v1.pdf","comment":"Project website https://robotutilitymodels.com"},{"id":"http://arxiv.org/abs/2409.05864v1","updated":"2024-09-09T17:59:45Z","published":"2024-09-09T17:59:45Z","title":"Neural MP: A Generalist Neural Motion Planner","summary":" The current paradigm for motion planning generates solutions from scratch for\nevery new problem, which consumes significant amounts of time and computational\nresources. 
For complex, cluttered scenes, motion planning approaches can often\ntake minutes to produce a solution, while humans are able to accurately and\nsafely reach any goal in seconds by leveraging their prior experience. We seek\nto do the same by applying data-driven learning at scale to the problem of\nmotion planning. Our approach builds a large number of complex scenes in\nsimulation, collects expert data from a motion planner, then distills it into a\nreactive generalist policy. We then combine this with lightweight optimization\nto obtain a safe path for real world deployment. We perform a thorough\nevaluation of our method on 64 motion planning tasks across four diverse\nenvironments with randomized poses, scenes and obstacles, in the real world,\ndemonstrating an improvement of 23%, 17% and 79% motion planning success rate\nover state of the art sampling, optimization and learning based planning\nmethods. Video results available at mihdalal.github.io/neuralmotionplanner\n","authors":["Murtaza Dalal","Jiahui Yang","Russell Mendonca","Youssef Khaky","Ruslan Salakhutdinov","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2409.05864v1.pdf","comment":"Website at mihdalal.github.io/neuralmotionplanner. Main paper: 7\n pages, 4 figures, 2 tables. Appendix: 9 pages, 5 figures, 6 tables"},{"id":"http://arxiv.org/abs/2408.08381v3","updated":"2024-09-09T17:58:42Z","published":"2024-08-15T18:54:31Z","title":"Pre-processing and Compression: Understanding Hidden Representation\n Refinement Across Imaging Domains via Intrinsic Dimension","summary":" In recent years, there has been interest in how geometric properties such as\nintrinsic dimension (ID) of a neural network's hidden representations change\nthrough its layers, and how such properties are predictive of important model\nbehavior such as generalization ability. However, evidence has begun to emerge\nthat such behavior can change significantly depending on the domain of the\nnetwork's training data, such as natural versus medical images. Here, we\nfurther this inquiry by exploring how the ID of a network's learned\nrepresentations changes through its layers, in essence, characterizing how the\nnetwork successively refines the information content of input data to be used\nfor predictions. Analyzing eleven natural and medical image datasets across six\nnetwork architectures, we find that how ID changes through the network differs\nnoticeably between natural and medical image models. Specifically, medical\nimage models peak in representation ID earlier in the network, implying a\ndifference in the image features and their abstractness that are typically used\nfor downstream tasks in these domains. Additionally, we discover a strong\ncorrelation of this peak representation ID with the ID of the data in its input\nspace, implying that the intrinsic information content of a model's learned\nrepresentations is guided by that of the data it was trained on. Overall, our\nfindings emphasize notable discrepancies in network behavior between natural\nand non-natural imaging domains regarding hidden representation information\ncontent, and provide further insights into how a network's learned features are\nshaped by its training data.\n","authors":["Nicholas Konz","Maciej A. 
Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2408.08381v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05846v1","updated":"2024-09-09T17:45:37Z","published":"2024-09-09T17:45:37Z","title":"An Introduction to Quantum Reinforcement Learning (QRL)","summary":" Recent advancements in quantum computing (QC) and machine learning (ML) have\nsparked considerable interest in the integration of these two cutting-edge\nfields. Among the various ML techniques, reinforcement learning (RL) stands out\nfor its ability to address complex sequential decision-making problems. RL has\nalready demonstrated substantial success in the classical ML community. Now,\nthe emerging field of Quantum Reinforcement Learning (QRL) seeks to enhance RL\nalgorithms by incorporating principles from quantum computing. This paper\noffers an introduction to this exciting area for the broader AI and ML\ncommunity.\n","authors":["Samuel Yen-Chi Chen"],"pdf_url":"https://arxiv.org/pdf/2409.05846v1.pdf","comment":"Accepted by The 15th International Conference on ICT Convergence -\n ICTC 2024"},{"id":"http://arxiv.org/abs/2405.20611v2","updated":"2024-09-09T17:35:58Z","published":"2024-05-31T03:57:19Z","title":"Bi-Directional Transformers vs. word2vec: Discovering Vulnerabilities in\n Lifted Compiled Code","summary":" Detecting vulnerabilities within compiled binaries is challenging due to lost\nhigh-level code structures and other factors such as architectural\ndependencies, compilers, and optimization options. To address these obstacles,\nthis research explores vulnerability detection using natural language\nprocessing (NLP) embedding techniques with word2vec, BERT, and RoBERTa to learn\nsemantics from intermediate representation (LLVM IR) code. Long short-term\nmemory (LSTM) neural networks were trained on embeddings from encoders created\nusing approximately 48k LLVM functions from the Juliet dataset. This study is\npioneering in its comparison of word2vec models with multiple bidirectional\ntransformer (BERT, RoBERTa) embeddings built using LLVM code to train neural\nnetworks to detect vulnerabilities in compiled binaries. word2vec Skip-Gram\nmodels achieved 92% validation accuracy in detecting vulnerabilities,\noutperforming word2vec Continuous Bag of Words (CBOW), BERT, and RoBERTa. This\nsuggests that complex contextual embeddings may not provide advantages over\nsimpler word2vec models for this task when a limited number (e.g. 48K) of data\nsamples are used to train the bidirectional transformer-based models. The\ncomparative results provide novel insights into selecting optimal embeddings\nfor learning compiler-independent semantic code representations to advance\nmachine learning detection of vulnerabilities in compiled binaries.\n","authors":["Gary A. McCully","John D. Hastings","Shengjie Xu","Adam Fortier"],"pdf_url":"https://arxiv.org/pdf/2405.20611v2.pdf","comment":"Updated with improvements\""},{"id":"http://arxiv.org/abs/2409.05816v1","updated":"2024-09-09T17:23:29Z","published":"2024-09-09T17:23:29Z","title":"Improving Pretraining Data Using Perplexity Correlations","summary":" Quality pretraining data is often seen as the key to high-performance\nlanguage models. However, progress in understanding pretraining data has been\nslow due to the costly pretraining runs required for data selection\nexperiments. We present a framework that avoids these costs and selects\nhigh-quality pretraining data without any LLM training of our own. 
Our work is\nbased on a simple observation: LLM losses on many pretraining texts are\ncorrelated with downstream benchmark performance, and selecting\nhigh-correlation documents is an effective pretraining data selection method.\nWe build a new statistical framework for data selection centered around\nestimates of perplexity-benchmark correlations and perform data selection using\na sample of 90 LLMs taken from the Open LLM Leaderboard on texts from tens of\nthousands of web domains. In controlled pretraining experiments at the 160M\nparameter scale on 8 benchmarks, our approach outperforms DSIR on every\nbenchmark, while matching the best data selector found in DataComp-LM, a\nhand-engineered bigram classifier.\n","authors":["Tristan Thrush","Christopher Potts","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2409.05816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.05832v2","updated":"2024-09-09T17:20:01Z","published":"2024-06-09T15:50:35Z","title":"Improving Antibody Design with Force-Guided Sampling in Diffusion Models","summary":" Antibodies, crucial for immune defense, primarily rely on\ncomplementarity-determining regions (CDRs) to bind and neutralize antigens,\nsuch as viruses. The design of these CDRs determines the antibody's affinity\nand specificity towards its target. Generative models, particularly denoising\ndiffusion probabilistic models (DDPMs), have shown potential to advance the\nstructure-based design of CDR regions. However, only a limited dataset of bound\nantibody-antigen structures is available, and generalization to\nout-of-distribution interfaces remains a challenge. Physics based force-fields,\nwhich approximate atomic interactions, offer a coarse but universal source of\ninformation to better mold designs to target interfaces. Integrating this\nfoundational information into diffusion models is, therefore, highly desirable.\nHere, we propose a novel approach to enhance the sampling process of diffusion\nmodels by integrating force field energy-based feedback. Our model, DiffForce,\nemploys forces to guide the diffusion sampling process, effectively blending\nthe two distributions. Through extensive experiments, we demonstrate that our\nmethod guides the model to sample CDRs with lower energy, enhancing both the\nstructure and sequence of the generated antibodies.\n","authors":["Paulina Kulytė","Francisco Vargas","Simon Valentin Mathis","Yu Guang Wang","José Miguel Hernández-Lobato","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2406.05832v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05806v1","updated":"2024-09-09T17:11:51Z","published":"2024-09-09T17:11:51Z","title":"Benchmarking Chinese Knowledge Rectification in Large Language Models","summary":" While Large Language Models (LLMs) exhibit remarkable generative\ncapabilities, they are not without flaws, particularly in the form of\nhallucinations. This issue is even more pronounced when LLMs are applied to\nspecific languages and domains. For example, LLMs may generate nonsense\ninformation when handling Chinese ancient poetry, proverbs, or idioms, owing to\nthe lack of specific knowledge. To this end, this paper introduces a benchmark\nfor rectifying Chinese knowledge in LLMs via knowledge editing. 
Specifically,\nwe introduce a new Chinese dataset, CKnowEdit, by collecting seven types of\nknowledge from various sources, including classical texts, idioms, and content\nfrom Baidu Tieba Ruozhiba, thereby accounting for the unique polyphony,\nantithesis, and logical constructs inherent in the Chinese language. Through\nthe analysis of this dataset, we uncover the challenges faced by current LLMs\nin mastering Chinese. Furthermore, our evaluation of state-of-the-art knowledge\nediting techniques on this dataset unveils the substantial scope for advancement\nin the rectification of Chinese knowledge. Code and dataset are available at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Tianhe Lu","Jizhan Fang","Yunzhi Yao","Xin Xu","Ningyu Zhang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2409.05806v1.pdf","comment":"Ongoing work; code and dataset are available at\n https://github.com/zjunlp/EasyEdit"},{"id":"http://arxiv.org/abs/2409.05804v1","updated":"2024-09-09T17:10:36Z","published":"2024-09-09T17:10:36Z","title":"Celcomen: spatial causal disentanglement for single-cell and tissue\n perturbation modeling","summary":" Celcomen leverages a mathematical causality framework to disentangle intra-\nand inter- cellular gene regulation programs in spatial transcriptomics and\nsingle-cell data through a generative graph neural network. It can learn\ngene-gene interactions, as well as generate post-perturbation counterfactual\nspatial transcriptomics, thereby offering access to experimentally inaccessible\nsamples. We validated its disentanglement, identifiability, and counterfactual\nprediction capabilities through simulations and in clinically relevant human\nglioblastoma, human fetal spleen, and mouse lung cancer samples. Celcomen\nprovides the means to model disease and therapy induced changes allowing for\nnew insights into single-cell spatially resolved tissue responses relevant to\nhuman health.\n","authors":["Stathis Megas","Daniel G. Chen","Krzysztof Polanski","Moshe Eliasof","Carola-Bibiane Schonlieb","Sarah A. Teichmann"],"pdf_url":"https://arxiv.org/pdf/2409.05804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09176v2","updated":"2024-09-09T17:09:53Z","published":"2024-05-15T08:33:41Z","title":"Cross-Input Certified Training for Universal Perturbations","summary":" Existing work in trustworthy machine learning primarily focuses on\nsingle-input adversarial perturbations. In many real-world attack scenarios,\ninput-agnostic adversarial attacks, e.g. universal adversarial perturbations\n(UAPs), are much more feasible. Current certified training methods train models\nrobust to single-input perturbations but achieve suboptimal clean and UAP\naccuracy, thereby limiting their applicability in practical applications. We\npropose a novel method, CITRUS, for certified training of networks robust\nagainst UAP attackers. 
We show in an extensive evaluation across different\ndatasets, architectures, and perturbation magnitudes that our method\noutperforms traditional certified training methods on standard accuracy (up to\n10.3\\%) and achieves SOTA performance on the more practical certified UAP\naccuracy metric.\n","authors":["Changming Xu","Gagandeep Singh"],"pdf_url":"https://arxiv.org/pdf/2405.09176v2.pdf","comment":"23 pages, 6 figures, ECCV '24"},{"id":"http://arxiv.org/abs/2409.05800v1","updated":"2024-09-09T17:03:43Z","published":"2024-09-09T17:03:43Z","title":"Input Space Mode Connectivity in Deep Neural Networks","summary":" We extend the concept of loss landscape mode connectivity to the input space\nof deep neural networks. Mode connectivity was originally studied within\nparameter space, where it describes the existence of low-loss paths between\ndifferent solutions (loss minimizers) obtained through gradient descent. We\npresent theoretical and empirical evidence of its presence in the input space\nof deep networks, thereby highlighting the broader nature of the phenomenon. We\nobserve that different input images with similar predictions are generally\nconnected, and for trained models, the path tends to be simple, with only a\nsmall deviation from being a linear path. Our methodology utilizes real,\ninterpolated, and synthetic inputs created using the input optimization\ntechnique for feature visualization. We conjecture that input space mode\nconnectivity in high-dimensional spaces is a geometric effect that takes place\neven in untrained models and can be explained through percolation theory. We\nexploit mode connectivity to obtain new insights about adversarial examples and\ndemonstrate its potential for adversarial detection. Additionally, we discuss\napplications for the interpretability of deep networks.\n","authors":["Jakub Vrabel","Ori Shem-Ur","Yaron Oz","David Krueger"],"pdf_url":"https://arxiv.org/pdf/2409.05800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05798v1","updated":"2024-09-09T17:02:47Z","published":"2024-09-09T17:02:47Z","title":"Enhancing Preference-based Linear Bandits via Human Response Time","summary":" Binary human choice feedback is widely used in interactive preference\nlearning for its simplicity, but it provides limited information about\npreference strength. To overcome this limitation, we leverage human response\ntimes, which inversely correlate with preference strength, as complementary\ninformation. Our work integrates the EZ-diffusion model, which jointly models\nhuman choices and response times, into preference-based linear bandits. We\nintroduce a computationally efficient utility estimator that reformulates the\nutility estimation problem using both choices and response times as a linear\nregression problem. Theoretical and empirical comparisons with traditional\nchoice-only estimators reveal that for queries with strong preferences (\"easy\"\nqueries), choices alone provide limited information, while response times offer\nvaluable complementary information about preference strength. As a result,\nincorporating response times makes easy queries more useful. We demonstrate\nthis advantage in the fixed-budget best-arm identification problem, with\nsimulations based on three real-world datasets, consistently showing\naccelerated learning when response times are incorporated.\n","authors":["Shen Li","Yuyang Zhang","Zhaolin Ren","Claire Liang","Na Li","Julie A. 
Shah"],"pdf_url":"https://arxiv.org/pdf/2409.05798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05790v1","updated":"2024-09-09T16:50:41Z","published":"2024-09-09T16:50:41Z","title":"Predicting Critical Heat Flux with Uncertainty Quantification and Domain\n Generalization Using Conditional Variational Autoencoders and Deep Neural\n Networks","summary":" Deep generative models (DGMs) have proven to be powerful in generating\nrealistic data samples. Their capability to learn the underlying distribution\nof a dataset enable them to generate synthetic data samples that closely\nresemble the original training dataset, thus addressing the challenge of data\nscarcity. In this work, we investigated the capabilities of DGMs by developing\na conditional variational autoencoder (CVAE) model to augment the critical heat\nflux (CHF) measurement data that was used to generate the 2006 Groeneveld\nlookup table. To determine how this approach compared to traditional methods, a\nfine-tuned deep neural network (DNN) regression model was created and evaluated\nwith the same dataset. Both the CVAE and DNN models achieved small mean\nabsolute relative errors, with the CVAE model maintaining more favorable\nresults. To quantify the uncertainty in the model's predictions, uncertainty\nquantification (UQ) was performed with repeated sampling of the CVAE model and\nensembling of the DNN model. Following UQ, the DNN ensemble notably improved\nperformance when compared to the baseline DNN model, while the CVAE model\nachieved similar results to its non-UQ results. The CVAE model was shown to\nhave significantly less variability and a higher confidence after assessment of\nthe prediction-wise relative standard deviations. Evaluating domain\ngeneralization, both models achieved small mean error values when predicting\nboth inside and outside the training domain, with predictions outside the\ntraining domain showing slightly larger errors. Overall, the CVAE model was\ncomparable to the DNN regression model in predicting CHF values but with better\nuncertainty behavior.\n","authors":["Farah Alsafadi","Aidan Furlong","Xu Wu"],"pdf_url":"https://arxiv.org/pdf/2409.05790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05786v1","updated":"2024-09-09T16:48:42Z","published":"2024-09-09T16:48:42Z","title":"Leveraging Object Priors for Point Tracking","summary":" Point tracking is a fundamental problem in computer vision with numerous\napplications in AR and robotics. A common failure mode in long-term point\ntracking occurs when the predicted point leaves the object it belongs to and\nlands on the background or another object. We identify this as the failure to\ncorrectly capture objectness properties in learning to track. To address this\nlimitation of prior work, we propose a novel objectness regularization approach\nthat guides points to be aware of object priors by forcing them to stay inside\nthe the boundaries of object instances. By capturing objectness cues at\ntraining time, we avoid the need to compute object masks during testing. In\naddition, we leverage contextual attention to enhance the feature\nrepresentation for capturing objectness at the feature level more effectively.\nAs a result, our approach achieves state-of-the-art performance on three point\ntracking benchmarks, and we further validate the effectiveness of our\ncomponents via ablation studies. 
The source code is available at:\nhttps://github.com/RehgLab/tracking_objectness\n","authors":["Bikram Boote","Anh Thai","Wenqi Jia","Ozgur Kara","Stefan Stojanov","James M. Rehg","Sangmin Lee"],"pdf_url":"https://arxiv.org/pdf/2409.05786v1.pdf","comment":"ECCV 2024 ILR Workshop"},{"id":"http://arxiv.org/abs/2409.05782v1","updated":"2024-09-09T16:45:26Z","published":"2024-09-09T16:45:26Z","title":"Unified Neural Network Scaling Laws and Scale-time Equivalence","summary":" As neural networks continue to grow in size but datasets might not, it is\nvital to understand how much performance improvement can be expected: is it\nmore important to scale network size or data volume? Thus, neural network\nscaling laws, which characterize how test error varies with network size and\ndata volume, have become increasingly important. However, existing scaling laws\nare often applicable only in limited regimes and often do not incorporate or\npredict well-known phenomena such as double descent. Here, we present a novel\ntheoretical characterization of how three factors -- model size, training time,\nand data volume -- interact to determine the performance of deep neural\nnetworks. We first establish a theoretical and empirical equivalence between\nscaling the size of a neural network and increasing its training time\nproportionally. Scale-time equivalence challenges the current practice, wherein\nlarge models are trained for small durations, and suggests that smaller models\ntrained over extended periods could match their efficacy. It also leads to a\nnovel method for predicting the performance of large-scale networks from\nsmall-scale networks trained for extended epochs, and vice versa. We next\ncombine scale-time equivalence with a linear model analysis of double descent\nto obtain a unified theoretical scaling law, which we confirm with experiments\nacross vision benchmarks and network architectures. These laws explain several\npreviously unexplained phenomena: reduced data requirements for generalization\nin larger models, heightened sensitivity to label noise in overparameterized\nmodels, and instances where increasing model scale does not necessarily enhance\nperformance. Our findings hold significant implications for the practical\ndeployment of neural networks, offering a more accessible and efficient path to\ntraining and fine-tuning large models.\n","authors":["Akhilan Boopathy","Ila Fiete"],"pdf_url":"https://arxiv.org/pdf/2409.05782v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05780v1","updated":"2024-09-09T16:43:09Z","published":"2024-09-09T16:43:09Z","title":"Breaking Neural Network Scaling Laws with Modularity","summary":" Modular neural networks outperform nonmodular neural networks on tasks\nranging from visual question answering to robotics. These performance\nimprovements are thought to be due to modular networks' superior ability to\nmodel the compositional and combinatorial structure of real-world problems.\nHowever, a theoretical explanation of how modularity improves generalizability,\nand how to leverage task modularity while training networks remains elusive.\nUsing recent theoretical progress in explaining neural network generalization,\nwe investigate how the amount of training data required to generalize on a task\nvaries with the intrinsic dimensionality of a task's input. 
We show\ntheoretically that when applied to modularly structured tasks, while nonmodular\nnetworks require an exponential number of samples with task dimensionality,\nmodular networks' sample complexity is independent of task dimensionality:\nmodular networks can generalize in high dimensions. We then develop a novel\nlearning rule for modular networks to exploit this advantage and empirically\nshow the improved generalization of the rule, both in- and out-of-distribution,\non high-dimensional, modular tasks.\n","authors":["Akhilan Boopathy","Sunshine Jiang","William Yue","Jaedong Hwang","Abhiram Iyer","Ila Fiete"],"pdf_url":"https://arxiv.org/pdf/2409.05780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14774v2","updated":"2024-09-09T16:41:36Z","published":"2024-08-27T04:31:58Z","title":"Instruct-SkillMix: A Powerful Pipeline for LLM Instruction Tuning","summary":" We introduce Instruct-SkillMix, an automated approach for creating diverse,\nhigh quality SFT data. The Instruct-SkillMix pipeline involves two stages, each\nleveraging an existing powerful LLM: (1) Skill extraction: uses the LLM to\nextract core \"skills\" for instruction-following, either from existing datasets,\nor by directly prompting the model; (2) Data generation: uses the powerful LLM\nto generate (instruction, response) data that exhibit a randomly chosen pair of\nthese skills. Here, the use of random skill combinations promotes diversity and\ndifficulty.\n Vanilla SFT (i.e., no PPO, DPO, or RL methods) on data generated from\nInstruct-SkillMix leads to strong gains on instruction following benchmarks\nsuch as AlpacaEval 2.0, MT-Bench, and WildBench. With just $4$K examples,\nLLaMA-3-8B-Base achieves 42.76% length-controlled win rate on AlpacaEval 2.0.\nTo our knowledge, this achieves state-of-the-art performance among all models\nthat have only undergone SFT (no RL methods) and competes with proprietary\nmodels such as Claude 3 Opus and LLaMA-3.1-405B-Instruct.\n Ablation studies also suggest plausible reasons for why creating open\ninstruction-tuning datasets via naive crowd-sourcing has proved difficult.\nIntroducing low quality answers (\"shirkers\") in $20\\%$ of Instruct-SkillMix\nexamples causes performance to plummet, sometimes catastrophically.\n The Instruct-SkillMix pipeline is flexible and is adaptable to other\nsettings.\n","authors":["Simran Kaur","Simon Park","Anirudh Goyal","Sanjeev Arora"],"pdf_url":"https://arxiv.org/pdf/2408.14774v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05778v1","updated":"2024-09-09T16:41:04Z","published":"2024-09-09T16:41:04Z","title":"Advanced LSTM Neural Networks for Predicting Directional Changes in\n Sector-Specific ETFs Using Machine Learning Techniques","summary":" Trading and investing in stocks for some is their full-time career, while for\nothers, it's simply a supplementary income stream. Universal among all\ninvestors is the desire to turn a profit. The key to achieving this goal is\ndiversification. Spreading investments across sectors is critical to\nprofitability and maximizing returns. This study aims to gauge the viability of\nmachine learning methods in practicing the principle of diversification to\nmaximize portfolio returns. To test this, the study evaluates the Long-Short\nTerm Memory (LSTM) model across nine different sectors and over 2,200 stocks\nusing Vanguard's sector-based ETFs. The R-squared value across all sectors\nshowed promising results, with an average of 0.8651 and a high of 0.942 for the\nVNQ ETF. 
These findings suggest that the LSTM model is a capable and viable\nmodel for accurately predicting directional changes across various industry\nsectors, helping investors diversify and grow their portfolios.\n","authors":["Rifa Gowani","Zaryab Kanjiani"],"pdf_url":"https://arxiv.org/pdf/2409.05778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03165v3","updated":"2024-09-09T16:40:24Z","published":"2023-10-04T21:17:31Z","title":"Enhancing Accuracy in Deep Learning Using Random Matrix Theory","summary":" We explore the applications of random matrix theory (RMT) in the training of\ndeep neural networks (DNNs), focusing on layer pruning, that is, reducing the\nnumber of DNN parameters (weights). Our numerical results show that this\npruning leads to a drastic reduction of parameters while not reducing the\naccuracy of DNNs and CNNs. Moreover, pruning the fully connected DNNs actually\nincreases the accuracy and decreases the variance for random initializations.\nOur numerics indicate that this enhancement in accuracy is due to the\nsimplification of the loss landscape. We next provide a rigorous mathematical\nunderpinning of these numerical results by proving the RMT-based Pruning\nTheorem. Our results offer valuable insights into the practical application of\nRMT for the creation of more efficient and accurate deep-learning models.\n","authors":["Leonid Berlyand","Etienne Sandier","Yitzchak Shmalo","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.03165v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14417v2","updated":"2024-09-09T16:34:00Z","published":"2024-07-19T15:42:49Z","title":"Mixture of Experts with Mixture of Precisions for Tuning Quality of\n Service","summary":" The increasing demand for deploying large Mixture-of-Experts (MoE) models in\nresource-constrained environments necessitates efficient approaches to address\nthe challenges of their high memory and computational requirements. Moreover, given\nthat tasks come with different user-defined constraints and the available\nresources change over time in multi-tenant environments, it is necessary to\ndesign an approach that provides a flexible configuration space. This paper\npresents an adaptive serving approach for the efficient deployment of MoE\nmodels, capitalizing on partial quantization of the experts. By dynamically\ndetermining the number of quantized experts and their distribution across CPU\nand GPU, our approach explores the Pareto frontier and offers a fine-grained\nrange of configurations for tuning throughput and model quality. Our evaluation\non an NVIDIA A100 GPU using a Mixtral 8x7B MoE model for three language\nmodelling benchmarks demonstrates that the throughput of token generation can\nbe adjusted from 0.63 to 13.00 tokens per second. 
This enhancement comes with a\nmarginal perplexity increase of 3.81 to 4.00, 13.59 to 14.17, and 7.24 to 7.40\nfor the WikiText2, PTB, and C4 datasets, respectively, under maximum quantization.\nThese results highlight the practical applicability of our approach in dynamic\nand accuracy-sensitive applications where both memory usage and output quality\nare important.\n","authors":["HamidReza Imani","Abdolah Amirany","Tarek El-Ghazawi"],"pdf_url":"https://arxiv.org/pdf/2407.14417v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05770v1","updated":"2024-09-09T16:33:00Z","published":"2024-09-09T16:33:00Z","title":"Consensus-based Distributed Quantum Kernel Learning for Speech\n Recognition","summary":" This paper presents a Consensus-based Distributed Quantum Kernel Learning\n(CDQKL) framework aimed at improving speech recognition through distributed\nquantum computing. CDQKL addresses the challenges of scalability and data\nprivacy in centralized quantum kernel learning. It does this by distributing\ncomputational tasks across quantum terminals, which are connected through\nclassical channels. This approach enables the exchange of model parameters\nwithout sharing local training data, thereby maintaining data privacy and\nenhancing computational efficiency. Experimental evaluations on benchmark\nspeech emotion recognition datasets demonstrate that CDQKL achieves competitive\nclassification accuracy and scalability compared to centralized and local\nquantum kernel learning models. The distributed nature of CDQKL offers\nadvantages in privacy preservation and computational efficiency, making it\nsuitable for data-sensitive fields such as telecommunications, automotive, and\nfinance. The findings suggest that CDQKL can effectively leverage distributed\nquantum computing for large-scale machine-learning tasks.\n","authors":["Kuan-Cheng Chen","Wenxuan Ma","Xiaotian Xu"],"pdf_url":"https://arxiv.org/pdf/2409.05770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.07592v3","updated":"2024-09-09T16:18:54Z","published":"2022-02-15T17:28:42Z","title":"Deep Convolutional Autoencoder for Assessment of Drive-Cycle Anomalies\n in Connected Vehicle Sensor Data","summary":" This work investigates a practical and novel method for automated\nunsupervised fault detection in vehicles using a fully convolutional\nautoencoder. The results demonstrate that the algorithm we developed can detect\nanomalies that correspond to powertrain faults by learning patterns in the\nmultivariate time-series data of hybrid-electric vehicle powertrain sensors.\nData was collected by engineers at Ford Motor Company from numerous sensors\nover several drive cycle variations. This study provides evidence of the\nanomaly-detecting capability of our trained autoencoder and investigates the\nsuitability of our autoencoder relative to other unsupervised methods for\nautomatic fault detection in this data set. Preliminary results of testing the\nautoencoder on the powertrain sensor data indicate that the data reconstruction\napproach availed by the autoencoder is a robust technique for identifying\nabnormal sequences in the multivariate series. These results support that\nirregularities in hybrid-electric vehicles' powertrains are conveyed via sensor\nsignals in the embedded electronic communication system, and therefore can be\nidentified mechanistically with a trained algorithm. 
Additional unsupervised\nmethods are tested and show that the autoencoder performs better at fault detection\nthan outlier detectors and other novel deep learning techniques.\n","authors":["Anthony Geglio","Eisa Hedayati","Mark Tascillo","Dyche Anderson","Jonathan Barker","Timothy C. Havens"],"pdf_url":"https://arxiv.org/pdf/2202.07592v3.pdf","comment":"SSCI2022, 7 pages, 3 Tables, 3 Figures"},{"id":"http://arxiv.org/abs/2401.01479v3","updated":"2024-09-09T16:12:01Z","published":"2024-01-03T00:49:51Z","title":"Kernel-U-Net: Multivariate Time Series Forecasting using Custom Kernels","summary":" The time series forecasting task predicts future trends based on historical\ninformation. Transformer-based U-Net architectures, despite their success in\nmedical image segmentation, have limitations in both expressiveness and\ncomputational efficiency in time series forecasting, as evidenced by YFormer. To\ntackle these challenges, we introduce Kernel-U-Net, a flexible and\nkernel-customizable U-shaped neural network architecture. The Kernel-U-Net\nencoder compresses the input series into latent vectors, and its symmetric\ndecoder subsequently expands these vectors into output series. Specifically,\nKernel-U-Net separates the procedure of partitioning the input time series into\npatches from kernel manipulation, thereby providing the convenience of\ncustomized execution kernels. Our method offers two primary advantages: 1)\nflexibility in kernel customization to adapt to specific datasets; and 2)\nenhanced computational efficiency, with the complexity of the Transformer layer\nreduced to linear. Experiments on seven real-world datasets demonstrate that\nKernel-U-Net's performance either exceeds or meets that of the existing\nstate-of-the-art model in the majority of cases in channel-independent\nsettings. The source code for Kernel-U-Net will be made publicly available for\nfurther research and application.\n","authors":["Jiang You","Arben Cela","René Natowicz","Jacob Ouanounou","Patrick Siarry"],"pdf_url":"https://arxiv.org/pdf/2401.01479v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05755v1","updated":"2024-09-09T16:11:07Z","published":"2024-09-09T16:11:07Z","title":"Are Heterophily-Specific GNNs and Homophily Metrics Really Effective?\n Evaluation Pitfalls and New Benchmarks","summary":" Over the past decade, Graph Neural Networks (GNNs) have achieved great\nsuccess on machine learning tasks with relational data. However, recent studies\nhave found that heterophily can cause significant performance degradation of\nGNNs, especially on node-level tasks. Numerous heterophilic benchmark datasets\nhave been put forward to validate the efficacy of heterophily-specific GNNs, and\nvarious homophily metrics have been designed to help people recognize these\nmalignant datasets. Nevertheless, there still exist multiple pitfalls that\nseverely hinder the proper evaluation of new models and metrics. In this paper,\nwe point out the three most serious pitfalls: 1) a lack of hyperparameter tuning;\n2) insufficient model evaluation on the truly challenging heterophilic datasets;\n3) a missing quantitative evaluation benchmark for homophily metrics on synthetic\ngraphs. To overcome these challenges, we first train and fine-tune baseline\nmodels on the $27$ most widely used benchmark datasets, categorize them into three\ndistinct groups: malignant, benign and ambiguous heterophilic datasets, and\nidentify the truly challenging subsets of tasks. To the best of our knowledge, we are\nthe first to propose such a taxonomy. 
Then, we re-evaluate $10$\nheterophily-specific state-of-the-art (SOTA) GNNs with fine-tuned\nhyperparameters on different groups of heterophilic datasets. Based on the\nmodel performance, we reassess their effectiveness in addressing the heterophily\nchallenge. Finally, we evaluate $11$ popular homophily metrics on synthetic\ngraphs with three different generation approaches. To compare the metrics\nstrictly, we propose the first quantitative evaluation method based on\nFr\\'echet distance.\n","authors":["Sitao Luan","Qincheng Lu","Chenqing Hua","Xinyu Wang","Jiaqi Zhu","Xiao-Wen Chang","Guy Wolf","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2409.05755v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2407.09618"},{"id":"http://arxiv.org/abs/2409.05746v1","updated":"2024-09-09T16:01:58Z","published":"2024-09-09T16:01:58Z","title":"LLMs Will Always Hallucinate, and We Need to Live With This","summary":" As Large Language Models become more ubiquitous across domains, it becomes\nimportant to examine their inherent limitations critically. This work argues\nthat hallucinations in language models are not just occasional errors but an\ninevitable feature of these systems. We demonstrate that hallucinations stem\nfrom the fundamental mathematical and logical structure of LLMs. It is,\ntherefore, impossible to eliminate them through architectural improvements,\ndataset enhancements, or fact-checking mechanisms. Our analysis draws on\ncomputational theory and Gödel's First Incompleteness Theorem, which references\nthe undecidability of problems like the Halting, Emptiness, and Acceptance\nProblems. We demonstrate that every stage of the LLM process, from training data\ncompilation to fact retrieval, intent classification, and text generation, will\nhave a non-zero probability of producing hallucinations. This work introduces\nthe concept of Structural Hallucination as an intrinsic property of these\nsystems. By establishing the mathematical certainty of hallucinations, we\nchallenge the prevailing notion that they can be fully mitigated.\n","authors":["Sourav Banerjee","Ayushi Agarwal","Saloni Singla"],"pdf_url":"https://arxiv.org/pdf/2409.05746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03887v2","updated":"2024-09-09T15:53:27Z","published":"2024-09-05T19:50:26Z","title":"The Influence of Faulty Labels in Data Sets on Human Pose Estimation","summary":" In this study, we provide empirical evidence demonstrating that the quality of\ntraining data impacts model performance in Human Pose Estimation (HPE).\nInaccurate labels in widely used data sets, ranging from minor errors to severe\nmislabeling, can negatively influence learning and distort performance metrics.\nWe perform an in-depth analysis of popular HPE data sets to show the extent and\nnature of label inaccuracies. Our findings suggest that accounting for the\nimpact of faulty labels will facilitate the development of more robust and\naccurate HPE models for a variety of real-world applications. 
We show improved\nperformance with cleansed data.\n","authors":["Arnold Schwarz","Levente Hernadi","Felix Bießmann","Kristian Hildebrand"],"pdf_url":"https://arxiv.org/pdf/2409.03887v2.pdf","comment":"15 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2408.09237v4","updated":"2024-09-09T15:51:05Z","published":"2024-08-17T16:06:14Z","title":"QEDCartographer: Automating Formal Verification Using Reward-Free\n Reinforcement Learning","summary":" Formal verification is a promising method for producing reliable software,\nbut the difficulty of manually writing verification proofs severely limits its\nutility in practice. Recent methods have automated some proof synthesis by\nguiding a search through the proof space using a theorem prover. Unfortunately,\nthe theorem prover provides only the crudest estimate of progress, resulting in\neffectively undirected search. To address this problem, we create\nQEDCartographer, an automated proof-synthesis tool that combines supervised and\nreinforcement learning to more effectively explore the proof space.\nQEDCartographer incorporates the proofs' branching structure, enabling\nreward-free search and overcoming the sparse reward problem inherent to formal\nverification. We evaluate QEDCartographer using the CoqGym benchmark of 68.5K\ntheorems from 124 open-source Coq projects. QEDCartographer fully automatically\nproves 21.4% of the test-set theorems. Previous search-based proof-synthesis\ntools Tok, Tac, ASTactic, Passport, and Proverbot9001, which rely only on\nsupervised learning, prove 9.6%, 9.8%, 10.9%, 12.5%, and 19.8%, respectively.\nDiva, which combines 62 tools, proves 19.2%. Comparing to the most effective\nprior tool, Proverbot9001, QEDCartographer produces 34% shorter proofs 29%\nfaster, on average over the theorems both tools prove. Together,\nQEDCartographer and non-learning-based CoqHammer prove 30.3% of the theorems,\nwhile CoqHammer alone proves 26.6%. Our work demonstrates that reinforcement\nlearning is a fruitful research direction for improving proof-synthesis tools'\nsearch mechanisms.\n","authors":["Alex Sanchez-Stern","Abhishek Varghese","Zhanna Kaufman","Dylan Zhang","Talia Ringer","Yuriy Brun"],"pdf_url":"https://arxiv.org/pdf/2408.09237v4.pdf","comment":"Published in the International Conference on Software Engineering\n (ICSE) 2025: Alex Sanchez-Stern, Abhishek Varghese, Zhanna Kaufman, Dylan\n Zhang, Talia Ringer, and Yuriy Brun, QEDCartographer: Automating Formal\n Verification Using Reward-Free Reinforcement Learning, in Proceedings of the\n 47th International Conference on Software Engineering (ICSE), 2025"},{"id":"http://arxiv.org/abs/2409.05709v1","updated":"2024-09-09T15:20:24Z","published":"2024-09-09T15:20:24Z","title":"Real-time optimal control of high-dimensional parametrized systems by\n deep learning-based reduced order models","summary":" Steering a system towards a desired target in a very short amount of time is\nchallenging from a computational standpoint. Indeed, the intrinsically\niterative nature of optimal control problems requires multiple simulations of\nthe physical system to be controlled. Moreover, the control action needs to be\nupdated whenever the underlying scenario undergoes variations. Full-order\nmodels based on, e.g., the Finite Element Method, do not meet these\nrequirements due to the computational burden they usually entail. 
On the other\nhand, conventional reduced order modeling techniques such as the Reduced Basis\nmethod, are intrusive, rely on a linear superimposition of modes, and lack of\nefficiency when addressing nonlinear time-dependent dynamics. In this work, we\npropose a non-intrusive Deep Learning-based Reduced Order Modeling (DL-ROM)\ntechnique for the rapid control of systems described in terms of parametrized\nPDEs in multiple scenarios. In particular, optimal full-order snapshots are\ngenerated and properly reduced by either Proper Orthogonal Decomposition or\ndeep autoencoders (or a combination thereof) while feedforward neural networks\nare exploited to learn the map from scenario parameters to reduced optimal\nsolutions. Nonlinear dimensionality reduction therefore allows us to consider\nstate variables and control actions that are both low-dimensional and\ndistributed. After (i) data generation, (ii) dimensionality reduction, and\n(iii) neural networks training in the offline phase, optimal control strategies\ncan be rapidly retrieved in an online phase for any scenario of interest. The\ncomputational speedup and the high accuracy obtained with the proposed approach\nare assessed on different PDE-constrained optimization problems, ranging from\nthe minimization of energy dissipation in incompressible flows modelled through\nNavier-Stokes equations to the thermal active cooling in heat transfer.\n","authors":["Matteo Tomasetto","Andrea Manzoni","Francesco Braghin"],"pdf_url":"https://arxiv.org/pdf/2409.05709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05701v1","updated":"2024-09-09T15:13:56Z","published":"2024-09-09T15:13:56Z","title":"pFedGPA: Diffusion-based Generative Parameter Aggregation for\n Personalized Federated Learning","summary":" Federated Learning (FL) offers a decentralized approach to model training,\nwhere data remains local and only model parameters are shared between the\nclients and the central server. Traditional methods, such as Federated\nAveraging (FedAvg), linearly aggregate these parameters which are usually\ntrained on heterogeneous data distributions, potentially overlooking the\ncomplex, high-dimensional nature of the parameter space. This can result in\ndegraded performance of the aggregated model. While personalized FL approaches\ncan mitigate the heterogeneous data issue to some extent, the limitation of\nlinear aggregation remains unresolved. To alleviate this issue, we investigate\nthe generative approach of diffusion model and propose a novel generative\nparameter aggregation framework for personalized FL, \\texttt{pFedGPA}. In this\nframework, we deploy a diffusion model on the server to integrate the diverse\nparameter distributions and propose a parameter inversion method to efficiently\ngenerate a set of personalized parameters for each client. This inversion\nmethod transforms the uploaded parameters into a latent code, which is then\naggregated through denoising sampling to produce the final personalized\nparameters. By encoding the dependence of a client's model parameters on the\nspecific data distribution using the high-capacity diffusion model,\n\\texttt{pFedGPA} can effectively decouple the complexity of the overall\ndistribution of all clients' model parameters from the complexity of each\nindividual client's parameter distribution. 
Our experimental results\nconsistently demonstrate the superior performance of the proposed method across\nmultiple datasets, surpassing baseline approaches.\n","authors":["Jiahao Lai","Jiaqi Li","Jian Xu","Yanru Wu","Boshi Tang","Siqi Chen","Yongfeng Huang","Wenbo Ding","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2409.05701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05698v1","updated":"2024-09-09T15:12:24Z","published":"2024-09-09T15:12:24Z","title":"MANA-Net: Mitigating Aggregated Sentiment Homogenization with News\n Weighting for Enhanced Market Prediction","summary":" It is widely acknowledged that extracting market sentiments from news data\nbenefits market predictions. However, existing methods of using financial\nsentiments remain simplistic, relying on equal-weight and static aggregation to\nmanage sentiments from multiple news items. This leads to a critical issue\ntermed ``Aggregated Sentiment Homogenization'', which has been explored through\nour analysis of a large financial news dataset from industry practice. This\nphenomenon occurs when aggregating numerous sentiments, causing representations\nto converge towards the mean values of sentiment distributions and thereby\nsmoothing out unique and important information. Consequently, the aggregated\nsentiment representations lose much predictive value of news data. To address\nthis problem, we introduce the Market Attention-weighted News Aggregation\nNetwork (MANA-Net), a novel method that leverages a dynamic market-news\nattention mechanism to aggregate news sentiments for market prediction.\nMANA-Net learns the relevance of news sentiments to price changes and assigns\nvarying weights to individual news items. By integrating the news aggregation\nstep into the networks for market prediction, MANA-Net allows for trainable\nsentiment representations that are optimized directly for prediction. We\nevaluate MANA-Net using the S&P 500 and NASDAQ 100 indices, along with\nfinancial news spanning from 2003 to 2018. Experimental results demonstrate\nthat MANA-Net outperforms various recent market prediction methods, enhancing\nProfit & Loss by 1.1% and the daily Sharpe ratio by 0.252.\n","authors":["Mengyu Wang","Tiejun Ma"],"pdf_url":"https://arxiv.org/pdf/2409.05698v1.pdf","comment":"Accepted by CIKM 24"},{"id":"http://arxiv.org/abs/2409.05697v1","updated":"2024-09-09T15:11:45Z","published":"2024-09-09T15:11:45Z","title":"Segmentation by Factorization: Unsupervised Semantic Segmentation for\n Pathology by Factorizing Foundation Model Features","summary":" We introduce Segmentation by Factorization (F-SEG), an unsupervised\nsegmentation method for pathology that generates segmentation masks from\npre-trained deep learning models. F-SEG allows the use of pre-trained deep\nneural networks, including recently developed pathology foundation models, for\nsemantic segmentation. It achieves this without requiring additional training\nor finetuning, by factorizing the spatial features extracted by the models into\nsegmentation masks and their associated concept features. We create generic\ntissue phenotypes for H&E images by training clustering models for multiple\nnumbers of clusters on features extracted from several deep learning models on\nThe Cancer Genome Atlas Program (TCGA), and then show how the clusters can be\nused for factorizing corresponding segmentation masks using off-the-shelf deep\nlearning models. 
Our results show that F-SEG provides robust unsupervised\nsegmentation capabilities for H&E pathology images, and that the segmentation\nquality is greatly improved by utilizing pathology foundation models. We\ndiscuss and propose methods for evaluating the performance of unsupervised\nsegmentation in pathology.\n","authors":["Jacob Gildenblat","Ofir Hadar"],"pdf_url":"https://arxiv.org/pdf/2409.05697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05692v1","updated":"2024-09-09T15:05:27Z","published":"2024-09-09T15:05:27Z","title":"Extracting the U.S. building types from OpenStreetMap data","summary":" Building type information is crucial for population estimation, traffic\nplanning, urban planning, and emergency response applications. Although\nessential, such data is often not readily available. To alleviate this problem,\nthis work creates a comprehensive dataset by providing\nresidential/non-residential building classification covering the entire United\nStates. We propose and utilize an unsupervised machine learning method to\nclassify building types based on building footprints and available\nOpenStreetMap information. The classification result is validated using\nauthoritative ground truth data for select counties in the U.S. The validation\nshows a high precision for non-residential building classification and a high\nrecall for residential buildings. We identified various approaches to improving\nthe quality of the classification, such as removing sheds and garages from the\ndataset. Furthermore, analyzing the misclassifications revealed that they are\nmainly due to missing and scarce metadata in OSM. A major result of this work\nis the resulting dataset of classifying 67,705,475 buildings. We hope that this\ndata is of value to the scientific community, including urban and\ntransportation planners.\n","authors":["Henrique F. de Arruda","Sandro M. Reia","Shiyang Ruan","Kuldip S. Atwal","Hamdi Kavak","Taylor Anderson","Dieter Pfoser"],"pdf_url":"https://arxiv.org/pdf/2409.05692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03727v3","updated":"2024-09-09T15:04:15Z","published":"2024-05-06T08:09:46Z","title":"Large Language Models Synergize with Automated Machine Learning","summary":" Recently, program synthesis driven by large language models (LLMs) has become\nincreasingly popular. However, program synthesis for machine learning (ML)\ntasks still poses significant challenges. This paper explores a novel form of\nprogram synthesis, targeting ML programs, by combining LLMs and automated\nmachine learning (autoML). Specifically, our goal is to fully automate the\ngeneration and optimization of the code of the entire ML workflow, from data\npreparation to modeling and post-processing, utilizing only textual\ndescriptions of the ML tasks. To manage the length and diversity of ML\nprograms, we propose to break each ML program into smaller, manageable parts.\nEach part is generated separately by the LLM, with careful consideration of\ntheir compatibilities. To ensure compatibilities, we design a testing technique\nfor ML programs. Unlike traditional program synthesis, which typically relies\non binary evaluations (i.e., correct or incorrect), evaluating ML programs\nnecessitates more than just binary judgments. Our approach automates the\nnumerical evaluation and optimization of these programs, selecting the best\ncandidates through autoML techniques. 
In experiments across various ML tasks,\nour method outperforms existing methods in 10 out of 12 tasks for generating ML\nprograms. In addition, autoML significantly improves the performance of the\ngenerated ML programs. In experiments, given the textual task description, our\nmethod, Text-to-ML, generates the complete and optimized ML program in a fully\nautonomous process. The implementation of our method is available at\nhttps://github.com/JLX0/llm-automl.\n","authors":["Jinglue Xu","Jialong Li","Zhen Liu","Nagar Anthel Venkatesh Suryanarayanan","Guoyuan Zhou","Jia Guo","Hitoshi Iba","Kenji Tei"],"pdf_url":"https://arxiv.org/pdf/2405.03727v3.pdf","comment":"published at TMLR"},{"id":"http://arxiv.org/abs/2305.09868v3","updated":"2024-09-09T15:00:32Z","published":"2023-05-17T00:45:41Z","title":"The Principle of Uncertain Maximum Entropy","summary":" The principle of maximum entropy is a well-established technique for choosing\na distribution that matches available information while minimizing bias. It\nfinds broad use across scientific disciplines and in machine learning. However,\nthe principle as classically defined is susceptible to noise and error in observations.\nThis forces real-world practitioners to use relaxed versions of the principle\nin an ad hoc way, negatively impacting interpretation. To address this\nsituation, we present a new principle we call uncertain maximum entropy that\ngeneralizes the classic principle and provides interpretable solutions\nirrespective of the observational methods in use. We introduce a convex\napproximation and an expectation-maximization-based algorithm for finding\nsolutions to our new principle. Finally, we contrast this new technique with\ntwo simpler, generally applicable solutions, and we show both theoretically and experimentally\nthat our technique provides superior accuracy.\n","authors":["Kenneth Bogert","Matthew Kothe"],"pdf_url":"https://arxiv.org/pdf/2305.09868v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.00220v2","updated":"2024-09-09T14:56:49Z","published":"2022-06-01T04:08:57Z","title":"Adaptive Online Learning of Quantum States","summary":" The problem of efficient quantum state learning, also called shadow\ntomography, aims to comprehend an unknown $d$-dimensional quantum state through\nPOVMs. Yet, these states are rarely static; they evolve due to factors such as\nmeasurements, environmental noise, or inherent Hamiltonian state transitions.\nThis paper leverages techniques from adaptive online learning to keep pace with\nsuch state changes.\n The key metrics considered for learning in these mutable environments are\nenhanced notions of regret, specifically adaptive and dynamic regret. We\npresent adaptive and dynamic regret bounds for online shadow tomography, which\nare polynomial in the number of qubits and sublinear in the number of\nmeasurements. To support our theoretical findings, we include numerical\nexperiments that validate our proposed models.\n","authors":["Xinyi Chen","Elad Hazan","Tongyang Li","Zhou Lu","Xinzhao Wang","Rui Yang"],"pdf_url":"https://arxiv.org/pdf/2206.00220v2.pdf","comment":"28 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.05672v1","updated":"2024-09-09T14:41:24Z","published":"2024-09-09T14:41:24Z","title":"Zero-shot Outlier Detection via Prior-data Fitted Networks: Model\n Selection Bygone!","summary":" Outlier detection (OD) has a vast literature as it finds numerous\napplications in environmental monitoring, cybersecurity, finance, and medicine,\nto name a few. 
Being an inherently unsupervised task, model selection is a key\nbottleneck for OD (both algorithm and hyperparameter selection) without label\nsupervision. There is a long list of techniques to choose from -- both\nclassical algorithms and deep neural architectures -- and while several studies\nreport their hyperparameter sensitivity, the literature is quite slim on\nunsupervised model selection -- limiting the effective use of OD in practice.\nIn this paper we present FoMo-0D, for zero/0-shot OD exploring a transformative\nnew direction that bypasses the hurdle of model selection altogether (!), thus\nbreaking new ground. The fundamental idea behind FoMo-0D is the Prior-data\nFitted Networks, recently introduced by Muller et al.(2022), which trains a\nTransformer model on a large body of synthetically generated data from a prior\ndata distribution. In essence, FoMo-0D is a pretrained Foundation Model for\nzero/0-shot OD on tabular data, which can directly predict the (outlier/inlier)\nlabel of any test data at inference time, by merely a single forward pass --\nmaking obsolete the need for choosing an algorithm/architecture, tuning its\nassociated hyperparameters, and even training any model parameters when given a\nnew OD dataset. Extensive experiments on 57 public benchmark datasets against\n26 baseline methods show that FoMo-0D performs statistically no different from\nthe top 2nd baseline, while significantly outperforming the majority of the\nbaselines, with an average inference time of 7.7 ms per test sample.\n","authors":["Yuchen Shen","Haomin Wen","Leman Akoglu"],"pdf_url":"https://arxiv.org/pdf/2409.05672v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2409.05668v1","updated":"2024-09-09T14:38:31Z","published":"2024-09-09T14:38:31Z","title":"Unlearning or Concealment? A Critical Analysis and Evaluation Metrics\n for Unlearning in Diffusion Models","summary":" Recent research has seen significant interest in methods for concept removal\nand targeted forgetting in diffusion models. In this paper, we conduct a\ncomprehensive white-box analysis to expose significant vulnerabilities in\nexisting diffusion model unlearning methods. We show that the objective\nfunctions used for unlearning in the existing methods lead to decoupling of the\ntargeted concepts (meant to be forgotten) for the corresponding prompts. This\nis concealment and not actual unlearning, which was the original goal. The\nineffectiveness of current methods stems primarily from their narrow focus on\nreducing generation probabilities for specific prompt sets, neglecting the\ndiverse modalities of intermediate guidance employed during the inference\nprocess. The paper presents a rigorous theoretical and empirical examination of\nfour commonly used techniques for unlearning in diffusion models. We introduce\ntwo new evaluation metrics: Concept Retrieval Score (CRS) and Concept\nConfidence Score (CCS). These metrics are based on a successful adversarial\nattack setup that can recover forgotten concepts from unlearned diffusion\nmodels. The CRS measures the similarity between the latent representations of\nthe unlearned and fully trained models after unlearning. It reports the extent\nof retrieval of the forgotten concepts with increasing amount of guidance. The\nCCS quantifies the confidence of the model in assigning the target concept to\nthe manipulated data. It reports the probability of the unlearned model's\ngenerations to be aligned with the original domain knowledge with increasing\namount of guidance. 
Evaluating existing unlearning methods with our proposed\nstringent metrics for diffusion models reveals significant shortcomings in\ntheir ability to truly unlearn concepts. Source Code:\nhttps://respailab.github.io/unlearning-or-concealment\n","authors":["Aakash Sen Sharma","Niladri Sarkar","Vikram Chundawat","Ankur A Mali","Murari Mandal"],"pdf_url":"https://arxiv.org/pdf/2409.05668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05665v1","updated":"2024-09-09T14:36:33Z","published":"2024-09-09T14:36:33Z","title":"K-Fold Causal BART for CATE Estimation","summary":" This research aims to propose and evaluate a novel model named K-Fold Causal\nBayesian Additive Regression Trees (K-Fold Causal BART) for improved estimation\nof Average Treatment Effects (ATE) and Conditional Average Treatment Effects\n(CATE). The study employs synthetic and semi-synthetic datasets, including the\nwidely recognized Infant Health and Development Program (IHDP) benchmark\ndataset, to validate the model's performance. Despite promising results in\nsynthetic scenarios, the IHDP dataset reveals that the proposed model is not\nstate-of-the-art for ATE and CATE estimation. Nonetheless, the research\nprovides several novel insights: 1. The ps-BART model is likely the preferred\nchoice for CATE and ATE estimation due to better generalization compared to the\nother benchmark models - including the Bayesian Causal Forest (BCF) model,\nwhich is considered by many the current best model for CATE estimation, 2. The\nBCF model's performance deteriorates significantly with increasing treatment\neffect heterogeneity, while the ps-BART model remains robust, 3. Models tend to\nbe overconfident in CATE uncertainty quantification when treatment effect\nheterogeneity is low, 4. A second K-Fold method is unnecessary for avoiding\noverfitting in CATE estimation, as it adds computational costs without\nimproving performance, 5. Detailed analysis reveals the importance of\nunderstanding dataset characteristics and using nuanced evaluation methods, 6.\nThe conclusion of Curth et al. (2021) that indirect strategies for CATE\nestimation are superior for the IHDP dataset is contradicted by the results of\nthis research. These findings challenge existing assumptions and suggest\ndirections for future research to enhance causal inference methodologies.\n","authors":["Hugo Gobato Souto","Francisco Louzada Neto"],"pdf_url":"https://arxiv.org/pdf/2409.05665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05662v1","updated":"2024-09-09T14:35:23Z","published":"2024-09-09T14:35:23Z","title":"Real-Time Human Action Recognition on Embedded Platforms","summary":" With advancements in computer vision and deep learning, video-based human\naction recognition (HAR) has become practical. However, due to the complexity\nof the computation pipeline, running HAR on live video streams incurs excessive\ndelays on embedded platforms. 
This work tackles the real-time performance\nchallenges of HAR with four contributions: 1) an experimental study identifying\na standard Optical Flow (OF) extraction technique as the latency bottleneck in\na state-of-the-art HAR pipeline, 2) an exploration of the latency-accuracy\ntradeoff between the standard and deep learning approaches to OF extraction,\nwhich highlights the need for a novel, efficient motion feature extractor, 3)\nthe design of Integrated Motion Feature Extractor (IMFE), a novel single-shot\nneural network architecture for motion feature extraction with drastic\nimprovement in latency, 4) the development of RT-HARE, a real-time HAR system\ntailored for embedded platforms. Experimental results on an Nvidia Jetson\nXavier NX platform demonstrated that RT-HARE realizes real-time HAR at a video\nframe rate of 30 frames per second while delivering high levels of recognition\naccuracy.\n","authors":["Ruiqi Wang","Zichen Wang","Peiqi Gao","Mingzhen Li","Jaehwan Jeong","Yihang Xu","Yejin Lee","Lisa Connor","Chenyang Lu"],"pdf_url":"https://arxiv.org/pdf/2409.05662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16528v3","updated":"2024-09-09T14:31:26Z","published":"2024-05-26T11:29:57Z","title":"LoQT: Low-Rank Adapters for Quantized Pre-Training","summary":" Training of large neural networks requires significant computational\nresources. Despite advances using low-rank adapters and quantization,\npretraining of models such as LLMs on consumer hardware has not been possible\nwithout model sharding, offloading during training, or per-layer gradient\nupdates. To address these limitations, we propose LoQT, a method for\nefficiently training quantized models. LoQT uses gradient-based tensor\nfactorization to initialize low-rank trainable weight matrices that are\nperiodically merged into quantized full-rank weight matrices. Our approach is\nsuitable for both pretraining and fine-tuning of models, which we demonstrate\nexperimentally for language modeling and downstream task adaptation. We find\nthat LoQT enables efficient training of models up to 7B parameters on a\nconsumer-grade 24GB GPU. We also demonstrate the feasibility of training a 13B\nparameter model using per-layer gradient updates on the same hardware.\n","authors":["Sebastian Loeschcke","Mads Toftrup","Michael J. Kastoryano","Serge Belongie","Vésteinn Snæbjarnarson"],"pdf_url":"https://arxiv.org/pdf/2405.16528v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04069v2","updated":"2024-09-09T14:28:03Z","published":"2024-09-06T07:20:45Z","title":"Online Residual Learning from Offline Experts for Pedestrian Tracking","summary":" In this paper, we consider the problem of predicting unknown targets from\ndata. We propose Online Residual Learning (ORL), a method that combines online\nadaptation with offline-trained predictions. At a lower level, we employ\nmultiple offline predictions generated before or at the beginning of the\nprediction horizon. We augment every offline prediction by learning their\nrespective residual error concerning the true target state online, using the\nrecursive least squares algorithm. At a higher level, we treat the augmented\nlower-level predictors as experts, adopting the Prediction with Expert Advice\nframework. We utilize an adaptive softmax weighting scheme to form an aggregate\nprediction and provide guarantees for ORL in terms of regret. 
We employ ORL to\nboost performance in the setting of online pedestrian trajectory prediction.\nBased on data from the Stanford Drone Dataset, we show that ORL can demonstrate\nbest-of-both-worlds performance.\n","authors":["Anastasios Vlachos","Anastasios Tsiamis","Aren Karapetyan","Efe C. Balta","John Lygeros"],"pdf_url":"https://arxiv.org/pdf/2409.04069v2.pdf","comment":"Accepted to CDC 2024, v2: fixed certain typos"},{"id":"http://arxiv.org/abs/2312.17336v3","updated":"2024-09-09T14:24:54Z","published":"2023-12-28T19:28:23Z","title":"PINN surrogate of Li-ion battery models for parameter inference. Part\n II: Regularization and application of the pseudo-2D model","summary":" Bayesian parameter inference is useful to improve Li-ion battery diagnostics\nand can help formulate battery aging models. However, it is computationally\nintensive and cannot be easily repeated for multiple cycles, multiple operating\nconditions, or multiple replicate cells. To reduce the computational cost of\nBayesian calibration, numerical solvers for physics-based models can be\nreplaced with faster surrogates. A physics-informed neural network (PINN) is\ndeveloped as a surrogate for the pseudo-2D (P2D) battery model calibration. For\nthe P2D surrogate, additional training regularization was needed as compared to\nthe PINN single-particle model (SPM) developed in Part I. Both the PINN SPM and\nP2D surrogate models are exercised for parameter inference and compared to data\nobtained from a direct numerical solution of the governing equations. A\nparameter inference study highlights the ability to use these PINNs to\ncalibrate scaling parameters for the cathode Li diffusion and the anode\nexchange current density. By realizing computational speed-ups of 2250x for the\nP2D model, as compared to using standard integrating methods, the PINN\nsurrogates enable rapid state-of-health diagnostics. In the low-data\navailability scenario, the testing error was estimated to 2mV for the SPM\nsurrogate and 10mV for the P2D surrogate which could be mitigated with\nadditional data.\n","authors":["Malik Hassanaly","Peter J. Weddle","Ryan N. King","Subhayan De","Alireza Doostan","Corey R. Randall","Eric J. Dufek","Andrew M. Colclasure","Kandler Smith"],"pdf_url":"https://arxiv.org/pdf/2312.17336v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05657v1","updated":"2024-09-09T14:23:19Z","published":"2024-09-09T14:23:19Z","title":"Adversarial Attacks on Data Attribution","summary":" Data attribution aims to quantify the contribution of individual training\ndata points to the outputs of an AI model, which has been used to measure the\nvalue of training data and compensate data providers. Given the impact on\nfinancial decisions and compensation mechanisms, a critical question arises\nconcerning the adversarial robustness of data attribution methods. However,\nthere has been little to no systematic research addressing this issue. In this\nwork, we aim to bridge this gap by detailing a threat model with clear\nassumptions about the adversary's goal and capabilities, and by proposing\nprincipled adversarial attack methods on data attribution. We present two such\nmethods, Shadow Attack and Outlier Attack, both of which generate manipulated\ndatasets to adversarially inflate the compensation. The Shadow Attack leverages\nknowledge about the data distribution in the AI applications, and derives\nadversarial perturbations through \"shadow training\", a technique commonly used\nin membership inference attacks. 
In contrast, the Outlier Attack does not\nassume any knowledge about the data distribution and relies solely on black-box\nqueries to the target model's predictions. It exploits an inductive bias\npresent in many data attribution methods - outlier data points are more likely\nto be influential - and employs adversarial examples to generate manipulated\ndatasets. Empirically, in image classification and text generation tasks, the\nShadow Attack can inflate the data-attribution-based compensation by at least\n200%, while the Outlier Attack achieves compensation inflation ranging from\n185% to as much as 643%.\n","authors":["Xinhe Wang","Pingbang Hu","Junwei Deng","Jiaqi W. Ma"],"pdf_url":"https://arxiv.org/pdf/2409.05657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05655v1","updated":"2024-09-09T14:22:19Z","published":"2024-09-09T14:22:19Z","title":"Interactive incremental learning of generalizable skills with local\n trajectory modulation","summary":" The problem of generalization in learning from demonstration (LfD) has\nreceived considerable attention over the years, particularly within the context\nof movement primitives, where a number of approaches have emerged. Recently,\ntwo important approaches have gained recognition. While one leverages\nvia-points to adapt skills locally by modulating demonstrated trajectories,\nanother relies on so-called task-parameterized models that encode movements\nwith respect to different coordinate systems, using a product of probabilities\nfor generalization. While the former are well-suited to precise, local\nmodulations, the latter aim at generalizing over large regions of the workspace\nand often involve multiple objects. Addressing the quality of generalization by\nleveraging both approaches simultaneously has received little attention. In\nthis work, we propose an interactive imitation learning framework that\nsimultaneously leverages local and global modulations of trajectory\ndistributions. Building on the kernelized movement primitives (KMP) framework,\nwe introduce novel mechanisms for skill modulation from direct human corrective\nfeedback. Our approach particularly exploits the concept of via-points to\nincrementally and interactively 1) improve the model accuracy locally, 2) add\nnew objects to the task during execution and 3) extend the skill into regions\nwhere demonstrations were not provided. We evaluate our method on a bearing\nring-loading task using a torque-controlled, 7-DoF, DLR SARA robot.\n","authors":["Markus Knauer","Alin Albu-Schäffer","Freek Stulp","João Silvério"],"pdf_url":"https://arxiv.org/pdf/2409.05655v1.pdf","comment":"21 pages, 16 figures"},{"id":"http://arxiv.org/abs/2409.05635v1","updated":"2024-09-09T14:05:30Z","published":"2024-09-09T14:05:30Z","title":"Optimal Projections for Classification with Naive Bayes","summary":" In the Naive Bayes classification model the class conditional densities are\nestimated as the products of their marginal densities along the cardinal basis\ndirections. We study the problem of obtaining an alternative basis for this\nfactorisation with the objective of enhancing the discriminatory power of the\nassociated classification model. We formulate the problem as a projection\npursuit to find the optimal linear projection on which to perform\nclassification. Optimality is determined based on the multinomial likelihood\nwithin which probabilities are estimated using the Naive Bayes factorisation of\nthe projected data. 
Projection pursuit offers the added benefits of dimension\nreduction and visualisation. We discuss an intuitive connection with class\nconditional independent components analysis, and show how this is realised\nvisually in practical applications. The performance of the resulting\nclassification models is investigated using a large collection of (162)\npublicly available benchmark data sets and in comparison with relevant\nalternatives. We find that the proposed approach substantially outperforms\nother popular probabilistic discriminant analysis models and is highly\ncompetitive with Support Vector Machines.\n","authors":["David P. Hofmeyr","Francois Kamper","Michail M. Melonas"],"pdf_url":"https://arxiv.org/pdf/2409.05635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05622v1","updated":"2024-09-09T13:56:03Z","published":"2024-09-09T13:56:03Z","title":"Forward KL Regularized Preference Optimization for Aligning Diffusion\n Policies","summary":" Diffusion models have achieved remarkable success in sequential\ndecision-making by leveraging the highly expressive model capabilities in\npolicy learning. A central problem for learning diffusion policies is to align\nthe policy output with human intents in various tasks. To achieve this,\nprevious methods conduct return-conditioned policy generation or Reinforcement\nLearning (RL)-based policy optimization, while they both rely on pre-defined\nreward functions. In this work, we propose a novel framework, Forward KL\nregularized Preference optimization for aligning Diffusion policies, to align\nthe diffusion policy with preferences directly. We first train a diffusion\npolicy from the offline dataset without considering the preference, and then\nalign the policy to the preference data via direct preference optimization.\nDuring the alignment phase, we formulate direct preference learning in a\ndiffusion policy, where the forward KL regularization is employed in preference\noptimization to avoid generating out-of-distribution actions. We conduct\nextensive experiments for MetaWorld manipulation and D4RL tasks. The results\nshow our method exhibits superior alignment with preferences and outperforms\nprevious state-of-the-art algorithms.\n","authors":["Zhao Shan","Chenyou Fan","Shuang Qiu","Jiyuan Shi","Chenjia Bai"],"pdf_url":"https://arxiv.org/pdf/2409.05622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05620v1","updated":"2024-09-09T13:55:07Z","published":"2024-09-09T13:55:07Z","title":"Joint Input and Output Coordination for Class-Incremental Learning","summary":" Incremental learning is nontrivial due to severe catastrophic forgetting.\nAlthough storing a small amount of data on old tasks during incremental\nlearning is a feasible solution, current strategies still do not 1) adequately\naddress the class bias problem, and 2) alleviate the mutual interference\nbetween new and old tasks, and 3) consider the problem of class bias within\ntasks. This motivates us to propose a joint input and output coordination\n(JIOC) mechanism to address these issues. This mechanism assigns different\nweights to different categories of data according to the gradient of the output\nscore, and uses knowledge distillation (KD) to reduce the mutual interference\nbetween the outputs of old and new tasks. The proposed mechanism is general and\nflexible, and can be incorporated into different incremental learning\napproaches that use memory storage. 
Extensive experiments show that our\nmechanism can significantly improve their performance.\n","authors":["Shuai Wang","Yibing Zhan","Yong Luo","Han Hu","Wei Yu","Yonggang Wen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2409.05620v1.pdf","comment":"11 pages, 4 figures. Accepted by IJCAI 2024"},{"id":"http://arxiv.org/abs/2409.04275v2","updated":"2024-09-09T13:51:57Z","published":"2024-09-06T13:37:08Z","title":"AttentionX: Exploiting Consensus Discrepancy In Attention from A\n Distributed Optimization Perspective","summary":" In this paper, we extend the standard Attention in the transformer by exploiting\nthe consensus discrepancy from a distributed optimization perspective, referred\nto as AttentionX. It is noted that the primal-dual method of multipliers (PDMM)\n\\cite{Zhang16PDMM} is designed to iteratively solve a broad class of\ndistributed optimization problems over a peer-to-peer (P2P) network, where\nneighbouring nodes gradually reach consensus as specified by predefined linear\nedge-constraints in the optimization process. In particular, at each iteration\nof PDMM, each node in a network first performs information-gathering from\nneighbours and then performs local information-fusion. From a high-level point\nof view, the $KQ$-softmax-based weighted summation of $V$-representations in\nAttention corresponds to information-gathering from neighbours, while the\nfeature-processing via the feed-forward network (FFN) in the transformer\ncorresponds to local information fusion. PDMM exploits the Lagrangian\nmultipliers to capture the historical consensus discrepancy in the form of\nresidual errors of the linear edge-constraints, which is crucial for\nthe algorithm to converge. Inspired by PDMM, we propose AttentionX to\nincorporate the consensus discrepancy in the output update-expression of the\nstandard Attention. The consensus discrepancy in AttentionX refers to the\ndifference between the weighted summation of $V$-representations and the scaled\n$V$-representations themselves. Experiments on ViT and nanoGPT show promising\nperformance.\n","authors":["Guoqiang Zhang","Richard Heusdens"],"pdf_url":"https://arxiv.org/pdf/2409.04275v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17244v2","updated":"2024-09-09T13:51:49Z","published":"2024-08-30T12:36:00Z","title":"Categorical data clustering: 25 years beyond K-modes","summary":" The clustering of categorical data is a common and important task in computer\nscience, offering profound implications across a spectrum of applications.\nUnlike purely numerical data, categorical data often lack inherent ordering, as\nin nominal data, or have varying levels of order, as in ordinal data, thus\nrequiring specialized methodologies for efficient organization and analysis.\nThis review provides a comprehensive synthesis of categorical data clustering\nin the past twenty-five years, starting from the introduction of K-modes. It\nelucidates the pivotal role of categorical data clustering in diverse fields\nsuch as health sciences, natural sciences, social sciences, education,\nengineering and economics. Practical comparisons are conducted for algorithms\nwith public implementations, highlighting distinguishing clustering\nmethodologies and revealing the performance of recent algorithms on several\nbenchmark categorical datasets. 
Finally, challenges and opportunities in the\nfield are discussed.\n","authors":["Tai Dinh","Wong Hauchi","Philippe Fournier-Viger","Daniil Lisik","Minh-Quyet Ha","Hieu-Chi Dam","Van-Nam Huynh"],"pdf_url":"https://arxiv.org/pdf/2408.17244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05602v1","updated":"2024-09-09T13:38:00Z","published":"2024-09-09T13:38:00Z","title":"Normalizing Energy Consumption for Hardware-Independent Evaluation","summary":" The increasing use of machine learning (ML) models in signal processing has\nraised concerns about their environmental impact, particularly during\nresource-intensive training phases. In this study, we present a novel\nmethodology for normalizing energy consumption across different hardware\nplatforms to facilitate fair and consistent comparisons. We evaluate different\nnormalization strategies by measuring the energy used to train different ML\narchitectures on different GPUs, focusing on audio tagging tasks. Our approach\nshows that the number of reference points, the type of regression and the\ninclusion of computational metrics significantly influence the normalization\nprocess. We find that the appropriate selection of two reference points\nprovides robust normalization, while incorporating the number of floating-point\noperations and parameters improves the accuracy of energy consumption\npredictions. By supporting more accurate energy consumption evaluation, our\nmethodology promotes the development of environmentally sustainable ML\npractices.\n","authors":["Constance Douwes","Romain Serizel"],"pdf_url":"https://arxiv.org/pdf/2409.05602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05598v1","updated":"2024-09-09T13:31:00Z","published":"2024-09-09T13:31:00Z","title":"When resampling/reweighting improves feature learning in imbalanced\n classification?: A toy-model study","summary":" A toy model of binary classification is studied with the aim of clarifying\nthe effect of class-wise resampling/reweighting on feature learning\nperformance in the presence of class imbalance. In the analysis, a\nhigh-dimensional limit of the feature is taken while keeping the ratio of the dataset size\nto the feature dimension finite, and the non-rigorous replica method\nfrom statistical mechanics is employed. The result shows that there exists a\ncase in which no resampling/reweighting gives the best feature\nlearning performance irrespective of the choice of losses or classifiers,\nsupporting recent findings in Cao et al. (2019); Kang et al. (2019). It is also\nrevealed that the key to this result is the symmetry of the loss and the problem\nsetting. Inspired by this, we propose a further simplified model exhibiting the\nsame property in the multiclass setting. These results clarify when class-wise\nresampling/reweighting becomes effective in imbalanced classification.\n","authors":["Tomoyuki Obuchi","Toshiyuki Tanaka"],"pdf_url":"https://arxiv.org/pdf/2409.05598v1.pdf","comment":"30 pages, 14 figures"},{"id":"http://arxiv.org/abs/2409.05595v1","updated":"2024-09-09T13:29:53Z","published":"2024-09-09T13:29:53Z","title":"SynMorph: Generating Synthetic Face Morphing Dataset with Mated Samples","summary":" Face morphing attack detection (MAD) algorithms have become essential to\novercome the vulnerability of face recognition systems. 
To address the lack of\nlarge-scale and publicly available datasets due to privacy concerns and\nrestrictions, in this work we propose a new method to generate a synthetic face\nmorphing dataset with 2450 identities and more than 100k morphs. The proposed\nsynthetic face morphing dataset is unique for its high-quality samples,\ndifferent types of morphing algorithms, and the generalization for both single\nand differential morphing attack detection algorithms. For experiments, we\napply face image quality assessment and vulnerability analysis to evaluate the\nproposed synthetic face morphing dataset from the perspective of biometric\nsample quality and morphing attack potential on face recognition systems. The\nresults are benchmarked with an existing SOTA synthetic dataset and a\nrepresentative non-synthetic dataset, and indicate improvement compared with the SOTA.\nAdditionally, we design different protocols and study the applicability of\nusing the proposed synthetic dataset for training morphing attack detection\nalgorithms.\n","authors":["Haoyu Zhang","Raghavendra Ramachandra","Kiran Raja","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2409.05595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05577v1","updated":"2024-09-09T13:02:50Z","published":"2024-09-09T13:02:50Z","title":"Approximation Bounds for Recurrent Neural Networks with Application to\n Regression","summary":" We study the approximation capacity of deep ReLU recurrent neural networks\n(RNNs) and explore the convergence properties of nonparametric least squares\nregression using RNNs. We derive upper bounds on the approximation error of\nRNNs for H\\\"older smooth functions, in the sense that the output at each time\nstep of an RNN can approximate a H\\\"older function that depends only on past\nand current information, termed a past-dependent function. This allows a\ncarefully constructed RNN to simultaneously approximate a sequence of\npast-dependent H\\\"older functions. We apply these approximation results to\nderive non-asymptotic upper bounds for the prediction error of the empirical\nrisk minimizer in regression problems. Our error bounds achieve the minimax optimal\nrate under both exponentially $\\beta$-mixing and i.i.d. data assumptions,\nimproving upon existing ones. Our results provide statistical guarantees on the\nperformance of RNNs.\n","authors":["Yuling Jiao","Yang Wang","Bokai Yan"],"pdf_url":"https://arxiv.org/pdf/2409.05577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05573v1","updated":"2024-09-09T12:56:02Z","published":"2024-09-09T12:56:02Z","title":"Learning to Model Graph Structural Information on MLPs via Graph\n Structure Self-Contrasting","summary":" Recent years have witnessed great success in handling graph-related tasks\nwith Graph Neural Networks (GNNs). However, most existing GNNs are based on\nmessage passing to perform feature aggregation and transformation, where the\nstructural information is explicitly involved in the forward propagation by\ncoupling with node features through graph convolution at each layer. As a\nresult, subtle feature noise or structure perturbation may cause severe error\npropagation, resulting in extremely poor robustness. In this paper, we rethink\nthe roles played by graph structural information in graph data training and\nidentify that message passing is not the only path to modeling structural\ninformation. 
Inspired by this, we propose a simple but effective Graph\nStructure Self-Contrasting (GSSC) framework that learns graph structural\ninformation without message passing. The proposed framework is based purely on\nMulti-Layer Perceptrons (MLPs), where the structural information is only\nimplicitly incorporated as prior knowledge to guide the computation of\nsupervision signals, substituting the explicit message propagation as in GNNs.\nSpecifically, it first applies structural sparsification to remove potentially\nuninformative or noisy edges in the neighborhood, and then performs structural\nself-contrasting in the sparsified neighborhood to learn robust node\nrepresentations. Finally, structural sparsification and self-contrasting are\nformulated as a bi-level optimization problem and solved in a unified\nframework. Extensive experiments have qualitatively and quantitatively\ndemonstrated that the GSSC framework can produce truly encouraging performance\nwith better generalization and robustness than other leading competitors.\n","authors":["Lirong Wu","Haitao Lin","Guojiang Zhao","Cheng Tan","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2409.05573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19852v4","updated":"2024-09-09T12:54:05Z","published":"2024-03-28T21:54:48Z","title":"A Review of Graph Neural Networks in Epidemic Modeling","summary":" Since the onset of the COVID-19 pandemic, there has been a growing interest\nin studying epidemiological models. Traditional mechanistic models\nmathematically describe the transmission mechanisms of infectious diseases.\nHowever, they often suffer from limitations of oversimplified or fixed\nassumptions, which could cause sub-optimal predictive power and inefficiency in\ncapturing complex relation information. Consequently, Graph Neural\nNetworks(GNNs) have emerged as a progressively popular tool in epidemic\nresearch. In this paper, we endeavor to furnish a comprehensive review of GNNs\nin epidemic tasks and highlight potential future directions. To accomplish this\nobjective, we introduce hierarchical taxonomies for both epidemic tasks and\nmethodologies, offering a trajectory of development within this domain. For\nepidemic tasks, we establish a taxonomy akin to those typically employed within\nthe epidemic domain. For methodology, we categorize existing work into Neural\nModels and Hybrid Models. Following this, we perform an exhaustive and\nsystematic examination of the methodologies, encompassing both the tasks and\ntheir technical details. Furthermore, we discuss the limitations of existing\nmethods from diverse perspectives and systematically propose future research\ndirections. This survey aims to bridge literature gaps and promote the\nprogression of this promising field, with a list of relevant papers at\nhttps://github.com/Emory-Melody/awesome-epidemic-modeling-papers. We hope that\nit will facilitate synergies between the communities of GNNs and epidemiology,\nand contribute to their collective progress.\n","authors":["Zewen Liu","Guancheng Wan","B. Aditya Prakash","Max S. Y. Lau","Wei Jin"],"pdf_url":"https://arxiv.org/pdf/2403.19852v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06016v2","updated":"2024-09-09T12:50:03Z","published":"2024-06-10T04:35:14Z","title":"EpiLearn: A Python Library for Machine Learning in Epidemic Modeling","summary":" EpiLearn is a Python toolkit developed for modeling, simulating, and\nanalyzing epidemic data. 
Although there exist several packages that also deal\nwith epidemic modeling, they are often restricted to mechanistic models or\ntraditional statistical tools. As machine learning continues to shape the\nworld, the gap between these packages and the latest models has become larger.\nTo bridge the gap and inspire innovative research in epidemic modeling,\nEpiLearn not only provides support for evaluating epidemic models based on\nmachine learning, but also incorporates comprehensive tools for analyzing\nepidemic data, such as simulation, visualization, transformations, etc. For the\nconvenience of both epidemiologists and data scientists, we provide a unified\nframework for training and evaluation of epidemic models on two tasks:\nForecasting and Source Detection. To facilitate the development of new models,\nEpiLearn follows a modular design, making it flexible and easy to use. In\naddition, an interactive web application is also developed to visualize the\nreal-world or simulated epidemic data. Our package is available at\nhttps://github.com/Emory-Melody/EpiLearn.\n","authors":["Zewen Liu","Yunxiao Li","Mingyang Wei","Guancheng Wan","Max S. Y. Lau","Wei Jin"],"pdf_url":"https://arxiv.org/pdf/2406.06016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15669v5","updated":"2024-09-09T12:43:17Z","published":"2023-09-27T14:09:15Z","title":"On the Computational Entanglement of Distant Features in Adversarial\n Machine Learning","summary":" In this research, we introduce 'computational entanglement', a phenomenon in\noverparameterized neural networks where the model exploits noise patterns in\nways conceptually linked to the effects of length contraction. More specifically,\nour findings demonstrate that overparameterized feedforward linear networks can\neasily achieve zero loss by fitting random noise, even with test samples that\nwere never encountered during training. This phenomenon accompanies length\ncontraction, where trained and test samples converge at the same point within a\nspacetime diagram. Unlike most models that rely on supervised learning, our\nmethod operates unsupervised, without the need for labels or gradient-based\noptimization. Additionally, we show a novel application of computational\nentanglement: transforming adversarial examples - highly non-robust inputs\nimperceptible to human observers - into outputs that are recognizable and robust.\nThis challenges conventional views on non-robust features in adversarial\nexample generation, providing new insights into the underlying mechanisms. Our\nresults emphasize the importance of computational entanglement for enhancing\nmodel robustness and understanding neural networks in adversarial contexts.\n","authors":["YenLung Lai","Xingbo Dong","Zhe Jin"],"pdf_url":"https://arxiv.org/pdf/2309.15669v5.pdf","comment":"abstract updated"},{"id":"http://arxiv.org/abs/2407.11654v2","updated":"2024-09-09T12:36:29Z","published":"2024-07-16T12:21:29Z","title":"R-SFLLM: Jamming Resilient Framework for Split Federated Learning with\n Large Language Models","summary":" Split federated learning (SFL) is a compute-efficient paradigm in distributed\nmachine learning (ML), where components of large ML models are outsourced to\nremote servers. A significant challenge in SFL, particularly when deployed over\nwireless channels, is the susceptibility of transmitted model parameters to\nadversarial jamming that could jeopardize the learning process. 
This is\nparticularly pronounced for word embedding parameters in large language models\n(LLMs), which are crucial for language understanding. In this paper, rigorous\ninsights are provided into the influence of jamming LLM word embeddings in SFL\nby deriving an expression for the ML training loss divergence and showing that\nit is upper-bounded by the mean squared error (MSE). Based on this analysis, a\nphysical layer framework is developed for resilient SFL with LLMs (R-SFLLM)\nover wireless networks. R-SFLLM leverages wireless sensing data to gather\ninformation on the jamming directions-of-arrival (DoAs) for the purpose of\ndevising a novel, sensing-assisted anti-jamming strategy while jointly\noptimizing beamforming, user scheduling, and resource allocation. Extensive\nexperiments using BERT and RoBERTa models demonstrate R-SFLLM's effectiveness,\nachieving close-to-baseline performance across various natural language\nprocessing (NLP) tasks and datasets. The proposed methodology further\nintroduces an adversarial training component, where controlled noise exposure\nsignificantly enhances the LLM's resilience to perturbed parameters during\ntraining. The results show that more noise-sensitive models, such as RoBERTa,\nbenefit from this feature, especially when resource allocation is unfair. It is\nalso shown that worst-case jamming in particular translates into worst-case\nmodel outcomes, thereby necessitating jamming-resilient SFL\nprotocols.\n","authors":["Aladin Djuhera","Vlad C. Andrei","Xinyang Li","Ullrich J. Mönich","Holger Boche","Walid Saad"],"pdf_url":"https://arxiv.org/pdf/2407.11654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05556v1","updated":"2024-09-09T12:25:10Z","published":"2024-09-09T12:25:10Z","title":"SciAgents: Automating scientific discovery through multi-agent\n intelligent graph reasoning","summary":" A key challenge in artificial intelligence is the creation of systems capable\nof autonomously advancing scientific understanding by exploring novel domains,\nidentifying complex patterns, and uncovering previously unseen connections in\nvast scientific data. In this work, we present SciAgents, an approach that\nleverages three core concepts: (1) the use of large-scale ontological knowledge\ngraphs to organize and interconnect diverse scientific concepts, (2) a suite of\nlarge language models (LLMs) and data retrieval tools, and (3) multi-agent\nsystems with in-situ learning capabilities. Applied to biologically inspired\nmaterials, SciAgents reveals hidden interdisciplinary relationships that were\npreviously considered unrelated, achieving a scale, precision, and exploratory\npower that surpasses traditional human-driven research methods. The framework\nautonomously generates and refines research hypotheses, elucidating underlying\nmechanisms, design principles, and unexpected material properties. By\nintegrating these capabilities in a modular fashion, the intelligent system\nyields material discoveries, critiques and improves existing hypotheses, retrieves\nup-to-date data about existing research, and highlights their strengths and\nlimitations. Our case studies demonstrate scalable capabilities to combine\ngenerative AI, ontological representations, and multi-agent modeling,\nharnessing a `swarm of intelligence' similar to biological systems. This\nprovides new avenues for materials discovery and accelerates the development of\nadvanced materials by unlocking Nature's design principles.\n","authors":["Alireza Ghafarollahi","Markus J. 
Buehler"],"pdf_url":"https://arxiv.org/pdf/2409.05556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08174v4","updated":"2024-09-09T12:00:58Z","published":"2023-12-13T14:34:12Z","title":"Double Machine Learning for Static Panel Models with Fixed Effects","summary":" Recent advances in causal inference have seen the development of methods\nwhich make use of the predictive power of machine learning algorithms. In this\npaper, we use these algorithms to approximate high-dimensional and non-linear\nnuisance functions of the confounders and double machine learning (DML) to make\ninferences about the effects of policy interventions from panel data. We\npropose new estimators by extending correlated random effects, within-group and\nfirst-difference estimation for linear models to an extension of Robinson\n(1988)'s partially linear regression model to static panel data models with\nindividual fixed effects and unspecified non-linear confounding effects. We\nprovide an illustrative example of DML for observational panel data showing the\nimpact of the introduction of the minimum wage on voting behaviour in the UK.\n","authors":["Paul Clarke","Annalivia Polselli"],"pdf_url":"https://arxiv.org/pdf/2312.08174v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05539v1","updated":"2024-09-09T11:59:42Z","published":"2024-09-09T11:59:42Z","title":"CoBo: Collaborative Learning via Bilevel Optimization","summary":" Collaborative learning is an important tool to train multiple clients more\neffectively by enabling communication among clients. Identifying helpful\nclients, however, presents challenging and often introduces significant\noverhead. In this paper, we model client-selection and model-training as two\ninterconnected optimization problems, proposing a novel bilevel optimization\nproblem for collaborative learning. We introduce CoBo, a scalable and elastic,\nSGD-type alternating optimization algorithm that efficiently addresses these\nproblem with theoretical convergence guarantees. Empirically, CoBo achieves\nsuperior performance, surpassing popular personalization algorithms by 9.3% in\naccuracy on a task with high heterogeneity, involving datasets distributed\namong 80 clients.\n","authors":["Diba Hashemi","Lie He","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2409.05539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09470v3","updated":"2024-09-09T11:30:41Z","published":"2024-02-12T08:16:10Z","title":"Rolling Diffusion Models","summary":" Diffusion models have recently been increasingly applied to temporal data\nsuch as video, fluid mechanics simulations, or climate data. These methods\ngenerally treat subsequent frames equally regarding the amount of noise in the\ndiffusion process. This paper explores Rolling Diffusion: a new approach that\nuses a sliding window denoising process. It ensures that the diffusion process\nprogressively corrupts through time by assigning more noise to frames that\nappear later in a sequence, reflecting greater uncertainty about the future as\nthe generation process unfolds. Empirically, we show that when the temporal\ndynamics are complex, Rolling Diffusion is superior to standard diffusion. 
In\nparticular, this result is demonstrated in a video prediction task using the\nKinetics-600 video dataset and in a chaotic fluid dynamics forecasting\nexperiment.\n","authors":["David Ruhe","Jonathan Heek","Tim Salimans","Emiel Hoogeboom"],"pdf_url":"https://arxiv.org/pdf/2402.09470v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.04856v2","updated":"2024-09-09T11:25:48Z","published":"2024-02-07T13:54:38Z","title":"Explaining Learned Reward Functions with Counterfactual Trajectories","summary":" Learning rewards from human behaviour or feedback is a promising approach to\naligning AI systems with human values but fails to consistently extract correct\nreward functions. Interpretability tools could enable users to understand and\nevaluate possible flaws in learned reward functions. We propose Counterfactual\nTrajectory Explanations (CTEs) to interpret reward functions in reinforcement\nlearning by contrasting an original with a counterfactual partial trajectory\nand the rewards they each receive. We derive six quality criteria for CTEs and\npropose a novel Monte-Carlo-based algorithm for generating CTEs that optimises\nthese quality criteria. Finally, we measure how informative the generated\nexplanations are to a proxy-human model by training it on CTEs. CTEs are\ndemonstrably informative for the proxy-human model, increasing the similarity\nbetween its predictions and the reward function on unseen trajectories.\nFurther, it learns to accurately judge differences in rewards between\ntrajectories and generalises to out-of-distribution examples. Although CTEs do\nnot lead to a perfect understanding of the reward, our method, and more\ngenerally the adaptation of XAI methods, are presented as a fruitful approach\nfor interpreting learned reward functions.\n","authors":["Jan Wehner","Frans Oliehoek","Luciano Cavalcante Siebert"],"pdf_url":"https://arxiv.org/pdf/2402.04856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05513v1","updated":"2024-09-09T11:13:32Z","published":"2024-09-09T11:13:32Z","title":"Interpolation, Extrapolation, Hyperpolation: Generalising into new\n dimensions","summary":" This paper introduces the concept of hyperpolation: a way of generalising\nfrom a limited set of data points that is a peer to the more familiar concepts\nof interpolation and extrapolation. Hyperpolation is the task of estimating the\nvalue of a function at new locations that lie outside the subspace (or\nmanifold) of the existing data. We shall see that hyperpolation is possible and\nexplore its links to creativity in the arts and sciences. We will also examine\nthe role of hyperpolation in machine learning and suggest that the lack of\nfundamental creativity in current AI systems is deeply connected to their\nlimited ability to hyperpolate.\n","authors":["Toby Ord"],"pdf_url":"https://arxiv.org/pdf/2409.05513v1.pdf","comment":"22 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.05508v1","updated":"2024-09-09T11:02:27Z","published":"2024-09-09T11:02:27Z","title":"A general reduced-order neural operator for spatio-temporal predictive\n learning on complex spatial domains","summary":" Predictive learning for spatio-temporal processes (PL-STP) on complex spatial\ndomains plays a critical role in various scientific and engineering fields,\nwith its essence being the construction of operators between\ninfinite-dimensional function spaces. This paper focuses on the unequal-domain\nmappings in PL-STP and categorising them into increase-domain and\ndecrease-domain mapping. 
Recent advances in deep learning have revealed the\ngreat potential of neural operators (NOs) to learn operators directly from\nobservational data. However, existing NOs require input space and output space\nto be the same domain, which pose challenges in ensuring predictive accuracy\nand stability for unequal-domain mappings. To this end, this study presents a\ngeneral reduced-order neural operator named Reduced-Order Neural Operator on\nRiemannian Manifolds (RO-NORM), which consists of two parts: the unequal-domain\nencoder/decoder and the same-domain approximator. Motivated by the variable\nseparation in classical modal decomposition, the unequal-domain encoder/decoder\nuses the pre-computed bases to reformulate the spatio-temporal function as a\nsum of products between spatial (or temporal) bases and corresponding\ntemporally (or spatially) distributed weight functions, thus the original\nunequal-domain mapping can be converted into a same-domain mapping.\nConsequently, the same-domain approximator NORM is applied to model the\ntransformed mapping. The performance of our proposed method has been evaluated\non six benchmark cases, including parametric PDEs, engineering and biomedical\napplications, and compared with four baseline algorithms: DeepONet,\nPOD-DeepONet, PCA-Net, and vanilla NORM. The experimental results demonstrate\nthe superiority of RO-NORM in prediction accuracy and training efficiency for\nPL-STP.\n","authors":["Qinglu Meng","Yingguang Li","Zhiliang Deng","Xu Liu","Gengxiang Chen","Qiutong Wu","Changqing Liu","Xiaozhong Hao"],"pdf_url":"https://arxiv.org/pdf/2409.05508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05500v1","updated":"2024-09-09T10:52:58Z","published":"2024-09-09T10:52:58Z","title":"Optimizing VarLiNGAM for Scalable and Efficient Time Series Causal\n Discovery","summary":" Causal discovery is designed to identify causal relationships in data, a task\nthat has become increasingly complex due to the computational demands of\ntraditional methods such as VarLiNGAM, which combines Vector Autoregressive\nModel with Linear Non-Gaussian Acyclic Model for time series data.\n This study is dedicated to optimising causal discovery specifically for time\nseries data, which is common in practical applications. Time series causal\ndiscovery is particularly challenging due to the need to account for temporal\ndependencies and potential time lag effects. By designing a specialised dataset\ngenerator and reducing the computational complexity of the VarLiNGAM model from\n\\( O(m^3 \\cdot n) \\) to \\( O(m^3 + m^2 \\cdot n) \\), this study significantly\nimproves the feasibility of processing large datasets. The proposed methods\nhave been validated on advanced computational platforms and tested across\nsimulated, real-world, and large-scale datasets, showcasing enhanced efficiency\nand performance. 
The optimised algorithm achieved 7 to 13 times speedup\ncompared with the original algorithm and around 4.5 times speedup compared with\nthe GPU-accelerated version on large-scale datasets with feature sizes between\n200 and 400.\n Our methods aim to push the boundaries of current causal discovery\ncapabilities, making them more robust, scalable, and applicable to real-world\nscenarios, thus facilitating breakthroughs in various fields such as healthcare\nand finance.\n","authors":["Ziyang Jiao","Ce Guo","Wayne Luk"],"pdf_url":"https://arxiv.org/pdf/2409.05500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12864v2","updated":"2024-09-09T10:51:33Z","published":"2024-07-12T14:31:54Z","title":"Clustering Time-Evolving Networks Using the Spatio-Temporal Graph\n Laplacian","summary":" Time-evolving graphs arise frequently when modeling complex dynamical systems\nsuch as social networks, traffic flow, and biological processes. Developing\ntechniques to identify and analyze communities in these time-varying graph\nstructures is an important challenge. In this work, we generalize existing\nspectral clustering algorithms from static to dynamic graphs using canonical\ncorrelation analysis (CCA) to capture the temporal evolution of clusters. Based\non this extended canonical correlation framework, we define the spatio-temporal\ngraph Laplacian and investigate its spectral properties. We connect these\nconcepts to dynamical systems theory via transfer operators, and illustrate the\nadvantages of our method on benchmark graphs by comparison with existing\nmethods. We show that the spatio-temporal graph Laplacian allows for a clear\ninterpretation of cluster structure evolution over time for directed and\nundirected graphs.\n","authors":["Maia Trower","Nataša Djurdjevac Conrad","Stefan Klus"],"pdf_url":"https://arxiv.org/pdf/2407.12864v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05495v1","updated":"2024-09-09T10:47:41Z","published":"2024-09-09T10:47:41Z","title":"Using machine learning for fault detection in lighthouse light sensors","summary":" Lighthouses play a crucial role in ensuring maritime safety by signaling\nhazardous areas such as dangerous coastlines, shoals, reefs, and rocks, along\nwith aiding harbor entries and aerial navigation. This is achieved through the\nuse of photoresistor sensors that activate or deactivate based on the time of\nday. However, a significant issue is the potential malfunction of these\nsensors, leading to the gradual misalignment of the light's operational timing.\nThis paper introduces an innovative machine learning-based approach for\nautomatically detecting such malfunctions. We evaluate four distinct\nalgorithms: decision trees, random forest, extreme gradient boosting, and\nmulti-layer perceptron. Our findings indicate that the multi-layer perceptron\nis the most effective, capable of detecting timing discrepancies as small as\n10-15 minutes. 
This accuracy makes it a highly efficient tool for automating\nthe detection of faults in lighthouse light sensors.\n","authors":["Michael Kampouridis","Nikolaos Vastardis","George Rayment"],"pdf_url":"https://arxiv.org/pdf/2409.05495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05484v1","updated":"2024-09-09T10:29:28Z","published":"2024-09-09T10:29:28Z","title":"CRADLE-VAE: Enhancing Single-Cell Gene Perturbation Modeling with\n Counterfactual Reasoning-based Artifact Disentanglement","summary":" Predicting cellular responses to various perturbations is a critical focus in\ndrug discovery and personalized therapeutics, with deep learning models playing\na significant role in this endeavor. Single-cell datasets contain technical\nartifacts that may hinder the predictability of such models, which poses\nquality control issues highly regarded in this area. To address this, we\npropose CRADLE-VAE, a causal generative framework tailored for single-cell gene\nperturbation modeling, enhanced with counterfactual reasoning-based artifact\ndisentanglement. Throughout training, CRADLE-VAE models the underlying latent\ndistribution of technical artifacts and perturbation effects present in\nsingle-cell datasets. It employs counterfactual reasoning to effectively\ndisentangle such artifacts by modulating the latent basal spaces and learns\nrobust features for generating cellular response data with improved quality.\nExperimental results demonstrate that this approach improves not only treatment\neffect estimation performance but also generative quality as well. The\nCRADLE-VAE codebase is publicly available at\nhttps://github.com/dmis-lab/CRADLE-VAE.\n","authors":["Seungheun Baek","Soyon Park","Yan Ting Chok","Junhyun Lee","Jueon Park","Mogan Gim","Jaewoo Kang"],"pdf_url":"https://arxiv.org/pdf/2409.05484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05482v1","updated":"2024-09-09T10:25:13Z","published":"2024-09-09T10:25:13Z","title":"Advancing Machine Learning for Stellar Activity and Exoplanet Period\n Rotation","summary":" This study applied machine learning models to estimate stellar rotation\nperiods from corrected light curve data obtained by the NASA Kepler mission.\nTraditional methods often struggle to estimate rotation periods accurately due\nto noise and variability in the light curve data. The workflow involved using\ninitial period estimates from the LS-Periodogram and Transit Least Squares\ntechniques, followed by splitting the data into training, validation, and\ntesting sets. We employed several machine learning algorithms, including\nDecision Tree, Random Forest, K-Nearest Neighbors, and Gradient Boosting, and\nalso utilized a Voting Ensemble approach to improve prediction accuracy and\nrobustness.\n The analysis included data from multiple Kepler IDs, providing detailed\nmetrics on orbital periods and planet radii. Performance evaluation showed that\nthe Voting Ensemble model yielded the most accurate results, with an RMSE\napproximately 50\\% lower than the Decision Tree model and 17\\% better than the\nK-Nearest Neighbors model. The Random Forest model performed comparably to the\nVoting Ensemble, indicating high accuracy. In contrast, the Gradient Boosting\nmodel exhibited a worse RMSE compared to the other approaches. Comparisons of\nthe predicted rotation periods to the photometric reference periods showed\nclose alignment, suggesting the machine learning models achieved high\nprediction accuracy. 
The results indicate that machine learning, particularly\nensemble methods, can effectively solve the problem of accurately estimating\nstellar rotation periods, with significant implications for advancing the study\nof exoplanets and stellar astrophysics.\n","authors":["Fatemeh Fazel Hesar","Bernard Foing","Ana M. Heras","Mojtaba Raouf","Victoria Foing","Shima Javanmardi","Fons J. Verbeek"],"pdf_url":"https://arxiv.org/pdf/2409.05482v1.pdf","comment":"15 pages, 8 figures. Submitted for publication in A&A"},{"id":"http://arxiv.org/abs/2409.05477v1","updated":"2024-09-09T10:11:25Z","published":"2024-09-09T10:11:25Z","title":"Retrofitting Temporal Graph Neural Networks with Transformer","summary":" Temporal graph neural networks (TGNNs) outperform regular GNNs by\nincorporating time information into graph-based operations. However, TGNNs\nadopt specialized models (e.g., TGN, TGAT, and APAN ) and require tailored\ntraining frameworks (e.g., TGL and ETC). In this paper, we propose TF-TGN,\nwhich uses Transformer decoder as the backbone model for TGNN to enjoy\nTransformer's codebase for efficient training. In particular, Transformer\nachieves tremendous success for language modeling, and thus the community\ndeveloped high-performance kernels (e.g., flash-attention and memory-efficient\nattention) and efficient distributed training schemes (e.g., PyTorch FSDP,\nDeepSpeed, and Megatron-LM). We observe that TGNN resembles language modeling,\ni.e., the message aggregation operation between chronologically occurring nodes\nand their temporal neighbors in TGNNs can be structured as sequence modeling.\nBeside this similarity, we also incorporate a series of algorithm designs\nincluding suffix infilling, temporal graph attention with self-loop, and causal\nmasking self-attention to make TF-TGN work. During training, existing systems\nare slow in transforming the graph topology and conducting graph sampling. As\nsuch, we propose methods to parallelize the CSR format conversion and graph\nsampling. We also adapt Transformer codebase to train TF-TGN efficiently with\nmultiple GPUs. We experiment with 9 graphs and compare with 2 state-of-the-art\nTGNN training frameworks. The results show that TF-TGN can accelerate training\nby over 2.20 while providing comparable or even superior accuracy to existing\nSOTA TGNNs. TF-TGN is available at https://github.com/qianghuangwhu/TF-TGN.\n","authors":["Qiang Huang","Xiao Yan","Xin Wang","Susie Xi Rao","Zhichao Han","Fangcheng Fu","Wentao Zhang","Jiawei Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.05477v1.pdf","comment":"conference Under review"},{"id":"http://arxiv.org/abs/2409.05475v1","updated":"2024-09-09T10:07:12Z","published":"2024-09-09T10:07:12Z","title":"Reinforcement Learning for Variational Quantum Circuits Design","summary":" Variational Quantum Algorithms have emerged as promising tools for solving\noptimization problems on quantum computers. These algorithms leverage a\nparametric quantum circuit called ansatz, where its parameters are adjusted by\na classical optimizer with the goal of optimizing a certain cost function.\nHowever, a significant challenge lies in designing effective circuits for\naddressing specific problems. In this study, we leverage the powerful and\nflexible Reinforcement Learning paradigm to train an agent capable of\nautonomously generating quantum circuits that can be used as ansatzes in\nvariational algorithms to solve optimization problems. 
The agent is trained on\ndiverse problem instances, including Maximum Cut, Maximum Clique and Minimum\nVertex Cover, built from different graph topologies and sizes. Our analysis of\nthe circuits generated by the agent and the corresponding solutions shows that\nthe proposed method is able to generate effective ansatzes. While our goal is\nnot to propose any new specific ansatz, we observe how the agent has discovered\na novel family of ansatzes effective for Maximum Cut problems, which we call\n$R_{yz}$-connected. We study the characteristics of one of these ansatzes by\ncomparing it against state-of-the-art quantum algorithms across instances of\nvarying graph topologies, sizes, and problem types. Our results indicate that\nthe $R_{yz}$-connected circuit achieves high approximation ratios for Maximum\nCut problems, further validating our proposed agent. In conclusion, our study\nhighlights the potential of Reinforcement Learning techniques in assisting\nresearchers to design effective quantum circuits which could have applications\nin a wide number of tasks.\n","authors":["Simone Foderà","Gloria Turati","Riccardo Nembrini","Maurizio Ferrari Dacrema","Paolo Cremonesi"],"pdf_url":"https://arxiv.org/pdf/2409.05475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03753v2","updated":"2024-09-09T10:04:00Z","published":"2024-09-05T17:59:15Z","title":"WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild","summary":" The increasing availability of real-world conversation data offers exciting\nopportunities for researchers to study user-chatbot interactions. However, the\nsheer volume of this data makes manually examining individual conversations\nimpractical. To overcome this challenge, we introduce WildVis, an interactive\ntool that enables fast, versatile, and large-scale conversation analysis.\nWildVis provides search and visualization capabilities in the text and\nembedding spaces based on a list of criteria. To manage million-scale datasets,\nwe implemented optimizations including search index construction, embedding\nprecomputation and compression, and caching to ensure responsive user\ninteractions within seconds. We demonstrate WildVis' utility through three case\nstudies: facilitating chatbot misuse research, visualizing and comparing topic\ndistributions across datasets, and characterizing user-specific conversation\npatterns. WildVis is open-source and designed to be extendable, supporting\nadditional datasets and customized search and visualization functionalities.\n","authors":["Yuntian Deng","Wenting Zhao","Jack Hessel","Xiang Ren","Claire Cardie","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2409.03753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05459v1","updated":"2024-09-09T09:39:47Z","published":"2024-09-09T09:39:47Z","title":"Beyond Flatland: A Geometric Take on Matching Methods for Treatment\n Effect Estimation","summary":" Matching is a popular approach in causal inference to estimate treatment\neffects by pairing treated and control units that are most similar in terms of\ntheir covariate information. However, classic matching methods completely\nignore the geometry of the data manifold, which is crucial to define a\nmeaningful distance for matching, and struggle when covariates are noisy and\nhigh-dimensional. In this work, we propose GeoMatching, a matching method to\nestimate treatment effects that takes into account the intrinsic data geometry\ninduced by existing causal mechanisms among the confounding variables. 
First,\nwe learn a low-dimensional, latent Riemannian manifold that accounts for\nuncertainty and geometry of the original input data. Second, we estimate\ntreatment effects via matching in the latent space based on the learned latent\nRiemannian metric. We provide theoretical insights and empirical results in\nsynthetic and real-world scenarios, demonstrating that GeoMatching yields more\neffective treatment effect estimators, even as we increase input\ndimensionality, in the presence of outliers, or in semi-supervised scenarios.\n","authors":["Melanie F. Pradier","Javier González"],"pdf_url":"https://arxiv.org/pdf/2409.05459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11657v2","updated":"2024-09-09T09:37:28Z","published":"2024-05-19T20:06:38Z","title":"On the Expressivity of Recurrent Neural Cascades with Identity","summary":" Recurrent Neural Cascades (RNC) are the class of recurrent neural networks\nwith no cyclic dependencies among recurrent neurons. Their subclass RNC+ with\npositive recurrent weights has been shown to be closely connected to the\nstar-free regular languages, which are the expressivity of many\nwell-established temporal logics. The existing expressivity results show that\nthe regular languages captured by RNC+ are the star-free ones, and they leave\nopen the possibility that RNC+ may capture languages beyond regular. We exclude\nthis possibility for languages that include an identity element, i.e., an input\nthat can occur an arbitrary number of times without affecting the output.\nNamely, in the presence of an identity element, we show that the languages\ncaptured by RNC+ are exactly the star-free regular languages. Identity elements\nare ubiquitous in temporal patterns, and hence our results apply to a large\nnumber of applications. The implications of our results go beyond expressivity.\nAt their core, we establish a close structural correspondence between RNC+ and\nsemiautomata cascades, showing that every neuron can be equivalently captured\nby a three-state semiautomaton. A notable consequence of this result is that\nRNC+ are no more succinct than cascades of three-state semiautomata.\n","authors":["Nadezda Alexandrovna Knorozova","Alessandro Ronca"],"pdf_url":"https://arxiv.org/pdf/2405.11657v2.pdf","comment":"Full version with appendix of a paper with the same title that will\n appear in the proceedings of KR 2024"},{"id":"http://arxiv.org/abs/2309.03665v2","updated":"2024-09-09T09:34:36Z","published":"2023-09-07T12:02:00Z","title":"How adversarial attacks can disrupt seemingly stable accurate\n classifiers","summary":" Adversarial attacks dramatically change the output of an otherwise accurate\nlearning system using a seemingly inconsequential modification to a piece of\ninput data. Paradoxically, empirical evidence indicates that even systems which\nare robust to large random perturbations of the input data remain susceptible\nto small, easily constructed, adversarial perturbations of their inputs. Here,\nwe show that this may be seen as a fundamental feature of classifiers working\nwith high dimensional input data. We introduce a simple generic and\ngeneralisable framework for which key behaviours observed in practical systems\narise with high probability -- notably the simultaneous susceptibility of the\n(otherwise accurate) model to easily constructed adversarial attacks, and\nrobustness to random perturbations of the input data. 
We confirm that the same\nphenomena are directly observed in practical neural networks trained on\nstandard image classification problems, where even large additive random noise\nfails to trigger the adversarial instability of the network. A surprising\ntakeaway is that even small margins separating a classifier's decision surface\nfrom training and testing data can hide adversarial susceptibility from being\ndetected using randomly sampled perturbations. Counterintuitively, using\nadditive noise during training or testing is therefore inefficient for\neradicating or detecting adversarial examples, and more demanding adversarial\ntraining is required.\n","authors":["Oliver J. Sutton","Qinghua Zhou","Ivan Y. Tyukin","Alexander N. Gorban","Alexander Bastounis","Desmond J. Higham"],"pdf_url":"https://arxiv.org/pdf/2309.03665v2.pdf","comment":"11 pages, 8 figures, additional supplementary materials"},{"id":"http://arxiv.org/abs/2311.04748v3","updated":"2024-09-09T09:32:15Z","published":"2023-11-08T15:17:13Z","title":"Intrinsic Bayesian Cramér-Rao Bound with an Application to Covariance\n Matrix Estimation","summary":" This paper presents a new performance bound for estimation problems where the\nparameter to estimate lies in a Riemannian manifold (a smooth manifold endowed\nwith a Riemannian metric) and follows a given prior distribution. In this\nsetup, the chosen Riemannian metric induces a geometry for the parameter\nmanifold, as well as an intrinsic notion of the estimation error measure.\nPerformance bound for such error measure were previously obtained in the\nnon-Bayesian case (when the unknown parameter is assumed to deterministic), and\nreferred to as \\textit{intrinsic} Cram\\'er-Rao bound. The presented result then\nappears either as: \\textit{a}) an extension of the intrinsic Cram\\'er-Rao bound\nto the Bayesian estimation framework; \\textit{b}) a generalization of the\nVan-Trees inequality (Bayesian Cram\\'er-Rao bound) that accounts for the\naforementioned geometric structures. In a second part, we leverage this\nformalism to study the problem of covariance matrix estimation when the data\nfollow a Gaussian distribution, and whose covariance matrix is drawn from an\ninverse Wishart distribution. Performance bounds for this problem are obtained\nfor both the mean squared error (Euclidean metric) and the natural Riemannian\ndistance for Hermitian positive definite matrices (affine invariant metric).\nNumerical simulation illustrate that assessing the error with the affine\ninvariant metric is revealing of interesting properties of the maximum a\nposteriori and minimum mean square error estimator, which are not observed when\nusing the Euclidean metric.\n","authors":["Florent Bouchard","Alexandre Renaux","Guillaume Ginolhac","Arnaud Breloy"],"pdf_url":"https://arxiv.org/pdf/2311.04748v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05433v1","updated":"2024-09-09T08:34:22Z","published":"2024-09-09T08:34:22Z","title":"State-Novelty Guided Action Persistence in Deep Reinforcement Learning","summary":" While a powerful and promising approach, deep reinforcement learning (DRL)\nstill suffers from sample inefficiency, which can be notably improved by\nresorting to more sophisticated techniques to address the\nexploration-exploitation dilemma. One such technique relies on action\npersistence (i.e., repeating an action over multiple steps). 
However, previous\nwork exploiting action persistence either applies a fixed strategy or learns\nadditional value functions (or policy) for selecting the repetition number. In\nthis paper, we propose a novel method to dynamically adjust the action\npersistence based on the current exploration status of the state space. In such\na way, our method does not require training of additional value functions or\npolicy. Moreover, the use of a smooth scheduling of the repeat probability\nallows a more effective balance between exploration and exploitation.\nFurthermore, our method can be seamlessly integrated into various basic\nexploration strategies to incorporate temporal persistence. Finally, extensive\nexperiments on different DMControl tasks demonstrate that our state-novelty\nguided action persistence method significantly improves the sample efficiency.\n","authors":["Jianshu Hu","Paul Weng","Yutong Ban"],"pdf_url":"https://arxiv.org/pdf/2409.05433v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2405.03725v2","updated":"2024-09-09T08:33:40Z","published":"2024-05-06T06:17:16Z","title":"Deep Oscillatory Neural Network","summary":" We propose a novel, brain-inspired deep neural network model known as the\nDeep Oscillatory Neural Network (DONN). Deep neural networks like the Recurrent\nNeural Networks indeed possess sequence processing capabilities but the\ninternal states of the network are not designed to exhibit brain-like\noscillatory activity. With this motivation, the DONN is designed to have\noscillatory internal dynamics. Neurons of the DONN are either nonlinear neural\noscillators or traditional neurons with sigmoidal or ReLU activation. The\nneural oscillator used in the model is the Hopf oscillator, with the dynamics\ndescribed in the complex domain. Input can be presented to the neural\noscillator in three possible modes. The sigmoid and ReLU neurons also use\ncomplex-valued extensions. All the weight stages are also complex-valued.\nTraining follows the general principle of weight change by minimizing the\noutput error and therefore has an overall resemblance to complex\nbackpropagation. A generalization of DONN to convolutional networks known as\nthe Oscillatory Convolutional Neural Network is also proposed. The two proposed\noscillatory networks are applied to a variety of benchmark problems in signal\nand image/video processing. The performance of the proposed models is either\ncomparable or superior to published results on the same data sets.\n","authors":["Nurani Rajagopal Rohan","Vigneswaran C","Sayan Ghosh","Kishore Rajendran","Gaurav A","V Srinivasa Chakravarthy"],"pdf_url":"https://arxiv.org/pdf/2405.03725v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02476v4","updated":"2024-09-09T08:11:45Z","published":"2024-04-03T05:32:10Z","title":"Deep Reinforcement Learning for Traveling Purchaser Problems","summary":" The traveling purchaser problem (TPP) is an important combinatorial\noptimization problem with broad applications. Due to the coupling between\nrouting and purchasing, existing works on TPPs commonly address route\nconstruction and purchase planning simultaneously, which, however, leads to\nexact methods with high computational cost and heuristics with sophisticated\ndesign but limited performance. In sharp contrast, we propose a novel approach\nbased on deep reinforcement learning (DRL), which addresses route construction\nand purchase planning separately, while evaluating and optimizing the solution\nfrom a global perspective. 
The key components of our approach include a\nbipartite graph representation for TPPs to capture the market-product\nrelations, and a policy network that extracts information from the bipartite\ngraph and uses it to sequentially construct the route. One significant benefit\nof our framework is that we can efficiently construct the route using the\npolicy network, and once the route is determined, the associated purchasing\nplan can be easily derived through linear programming, while, leveraging DRL,\nwe can train the policy network to optimize the global solution objective.\nFurthermore, by introducing a meta-learning strategy, the policy network can be\ntrained stably on large-sized TPP instances, and generalize well across\ninstances of varying sizes and distributions, even to much larger instances\nthat are never seen during training. Experiments on various synthetic TPP\ninstances and the TPPLIB benchmark demonstrate that our DRL-based approach can\nsignificantly outperform well-established TPP heuristics, reducing the\noptimality gap by 40%-90%, and also showing an advantage in runtime, especially\non large-sized instances.\n","authors":["Haofeng Yuan","Rongping Zhu","Wanlu Yang","Shiji Song","Keyou You","Wei Fan","C. L. Philip Chen"],"pdf_url":"https://arxiv.org/pdf/2404.02476v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00523v2","updated":"2024-09-09T08:09:14Z","published":"2024-08-01T12:54:46Z","title":"Jailbreaking Text-to-Image Models with LLM-Based Agents","summary":" Recent advancements have significantly improved automated task-solving\ncapabilities using autonomous agents powered by large language models (LLMs).\nHowever, most LLM-based agents focus on dialogue, programming, or specialized\ndomains, leaving their potential for addressing generative AI safety tasks\nlargely unexplored. In this paper, we propose Atlas, an advanced LLM-based\nmulti-agent framework targeting generative AI models, specifically focusing on\njailbreak attacks against text-to-image (T2I) models with built-in safety\nfilters. Atlas consists of two agents, namely the mutation agent and the\nselection agent, each comprising four key modules: a vision-language model\n(VLM) or LLM brain, planning, memory, and tool usage. The mutation agent uses\nits VLM brain to determine whether a prompt triggers the T2I model's safety\nfilter. It then collaborates iteratively with the LLM brain of the selection\nagent to generate new candidate jailbreak prompts with the highest potential to\nbypass the filter. In addition to multi-agent communication, we leverage\nin-context learning (ICL) memory mechanisms and the chain-of-thought (COT)\napproach to learn from past successes and failures, thereby enhancing Atlas's\nperformance. Our evaluation demonstrates that Atlas successfully jailbreaks\nseveral state-of-the-art T2I models equipped with multi-modal safety filters in\na black-box setting. Additionally, Atlas outperforms existing methods in both\nquery efficiency and the quality of generated images. This work convincingly\ndemonstrates the successful application of LLM-based agents in studying the\nsafety vulnerabilities of popular text-to-image generation models. 
We urge the\ncommunity to consider advanced techniques like ours in response to the rapidly\nevolving text-to-image generation field.\n","authors":["Yingkai Dong","Zheng Li","Xiangtao Meng","Ning Yu","Shanqing Guo"],"pdf_url":"https://arxiv.org/pdf/2408.00523v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15910v5","updated":"2024-09-09T08:07:43Z","published":"2023-12-26T07:04:39Z","title":"Reinforcement Unlearning","summary":" Machine unlearning refers to the process of mitigating the influence of\nspecific training data on machine learning models based on removal requests\nfrom data owners. However, one important area that has been largely overlooked\nin the research of unlearning is reinforcement learning. Reinforcement learning\nfocuses on training an agent to make optimal decisions within an environment to\nmaximize its cumulative rewards. During the training, the agent tends to\nmemorize the features of the environment, which raises a significant concern\nabout privacy. As per data protection regulations, the owner of the environment\nholds the right to revoke access to the agent's training data, thus\nnecessitating the development of a novel and pressing research field, known as\n\\emph{reinforcement unlearning}. Reinforcement unlearning focuses on revoking\nentire environments rather than individual data samples. This unique\ncharacteristic presents three distinct challenges: 1) how to propose unlearning\nschemes for environments; 2) how to avoid degrading the agent's performance in\nremaining environments; and 3) how to evaluate the effectiveness of unlearning.\nTo tackle these challenges, we propose two reinforcement unlearning methods.\nThe first method is based on decremental reinforcement learning, which aims to\nerase the agent's previously acquired knowledge gradually. The second method\nleverages environment poisoning attacks, which encourage the agent to learn\nnew, albeit incorrect, knowledge to remove the unlearning environment.\nParticularly, to tackle the third challenge, we introduce the concept of\n``environment inference attack'' to evaluate the unlearning outcomes.\n","authors":["Dayong Ye","Tianqing Zhu","Congcong Zhu","Derui Wang","Kun Gao","Zewei Shi","Sheng Shen","Wanlei Zhou","Minhui Xue"],"pdf_url":"https://arxiv.org/pdf/2312.15910v5.pdf","comment":"Accepted by NDSS 2025"},{"id":"http://arxiv.org/abs/2409.05402v1","updated":"2024-09-09T08:01:28Z","published":"2024-09-09T08:01:28Z","title":"HyperSMOTE: A Hypergraph-based Oversampling Approach for Imbalanced Node\n Classifications","summary":" Hypergraphs are increasingly utilized in both unimodal and multimodal data\nscenarios due to their superior ability to model and extract higher-order\nrelationships among nodes, compared to traditional graphs. However, current\nhypergraph models are encountering challenges related to imbalanced data, as\nthis imbalance can lead to biases in the model towards the more prevalent\nclasses. While the existing techniques, such as GraphSMOTE, have improved\nclassification accuracy for minority samples in graph data, they still fall\nshort when addressing the unique structure of hypergraphs. Inspired by SMOTE\nconcept, we propose HyperSMOTE as a solution to alleviate the class imbalance\nissue in hypergraph learning. This method involves a two-step process:\ninitially synthesizing minority class nodes, followed by the nodes integration\ninto the original hypergraph. We synthesize new nodes based on samples from\nminority classes and their neighbors. 
At the same time, in order to solve the\nproblem on integrating the new node into the hypergraph, we train a decoder\nbased on the original hypergraph incidence matrix to adaptively associate the\naugmented node to hyperedges. We conduct extensive evaluation on multiple\nsingle-modality datasets, such as Cora, Cora-CA and Citeseer, as well as\nmultimodal conversation dataset MELD to verify the effectiveness of HyperSMOTE,\nshowing an average performance gain of 3.38% and 2.97% on accuracy,\nrespectively.\n","authors":["Ziming Zhao","Tiehua Zhang","Zijian Yi","Zhishu Shen"],"pdf_url":"https://arxiv.org/pdf/2409.05402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05399v1","updated":"2024-09-09T07:55:59Z","published":"2024-09-09T07:55:59Z","title":"Sequential Posterior Sampling with Diffusion Models","summary":" Diffusion models have quickly risen in popularity for their ability to model\ncomplex distributions and perform effective posterior sampling. Unfortunately,\nthe iterative nature of these generative models makes them computationally\nexpensive and unsuitable for real-time sequential inverse problems such as\nultrasound imaging. Considering the strong temporal structure across sequences\nof frames, we propose a novel approach that models the transition dynamics to\nimprove the efficiency of sequential diffusion posterior sampling in\nconditional image synthesis. Through modeling sequence data using a video\nvision transformer (ViViT) transition model based on previous diffusion\noutputs, we can initialize the reverse diffusion trajectory at a lower noise\nscale, greatly reducing the number of iterations required for convergence. We\ndemonstrate the effectiveness of our approach on a real-world dataset of high\nframe rate cardiac ultrasound images and show that it achieves the same\nperformance as a full diffusion trajectory while accelerating inference\n25$\\times$, enabling real-time posterior sampling. Furthermore, we show that\nthe addition of a transition model improves the PSNR up to 8\\% in cases with\nsevere motion. Our method opens up new possibilities for real-time applications\nof diffusion models in imaging and other domains requiring real-time inference.\n","authors":["Tristan S. W. Stevens","Oisín Nolan","Jean-Luc Robert","Ruud J. G. van Sloun"],"pdf_url":"https://arxiv.org/pdf/2409.05399v1.pdf","comment":"5 pages, 4 figures, preprint"},{"id":"http://arxiv.org/abs/2409.05395v1","updated":"2024-09-09T07:49:09Z","published":"2024-09-09T07:49:09Z","title":"Shaking Up VLMs: Comparing Transformers and Structured State Space\n Models for Vision & Language Modeling","summary":" This study explores replacing Transformers in Visual Language Models (VLMs)\nwith Mamba, a recent structured state space model (SSM) that demonstrates\npromising performance in sequence modeling. We test models up to 3B parameters\nunder controlled conditions, showing that Mamba-based VLMs outperforms\nTransformers-based VLMs in captioning, question answering, and reading\ncomprehension. However, we find that Transformers achieve greater performance\nin visual grounding and the performance gap widens with scale. We explore two\nhypotheses to explain this phenomenon: 1) the effect of task-agnostic visual\nencoding on the updates of the hidden states, and 2) the difficulty in\nperforming visual grounding from the perspective of in-context multimodal\nretrieval. 
Our results indicate that a task-aware encoding yields minimal\nperformance gains on grounding, however, Transformers significantly outperform\nMamba at in-context multimodal retrieval. Overall, Mamba shows promising\nperformance on tasks where the correct output relies on a summary of the image\nbut struggles when retrieval of explicit information from the context is\nrequired.\n","authors":["Georgios Pantazopoulos","Malvina Nikandrou","Alessandro Suglia","Oliver Lemon","Arash Eshghi"],"pdf_url":"https://arxiv.org/pdf/2409.05395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03893v2","updated":"2024-09-09T07:47:58Z","published":"2024-09-05T19:59:42Z","title":"Understanding Fairness in Recommender Systems: A Healthcare Perspective","summary":" Fairness in AI-driven decision-making systems has become a critical concern,\nespecially when these systems directly affect human lives. This paper explores\nthe public's comprehension of fairness in healthcare recommendations. We\nconducted a survey where participants selected from four fairness metrics --\nDemographic Parity, Equal Accuracy, Equalized Odds, and Positive Predictive\nValue -- across different healthcare scenarios to assess their understanding of\nthese concepts. Our findings reveal that fairness is a complex and often\nmisunderstood concept, with a generally low level of public understanding\nregarding fairness metrics in recommender systems. This study highlights the\nneed for enhanced information and education on algorithmic fairness to support\ninformed decision-making in using these systems. Furthermore, the results\nsuggest that a one-size-fits-all approach to fairness may be insufficient,\npointing to the importance of context-sensitive designs in developing equitable\nAI systems.\n","authors":["Veronica Kecki","Alan Said"],"pdf_url":"https://arxiv.org/pdf/2409.03893v2.pdf","comment":"Accepted to the 18th ACM Conference on Recommender Systems"},{"id":"http://arxiv.org/abs/2407.02263v4","updated":"2024-09-09T07:38:07Z","published":"2024-07-02T13:40:29Z","title":"FreeCG: Free the Design Space of Clebsch-Gordan Transform for Machine\n Learning Force Fields","summary":" Machine Learning Force Fields (MLFFs) are of great importance for chemistry,\nphysics, materials science, and many other related fields. The Clebsch-Gordan\nTransform (CG transform) effectively encodes many-body interactions and is thus\nan important building block for many models of MLFFs. However, the\npermutation-equivariance requirement of MLFFs limits the design space of CG\ntransform, that is, intensive CG transform has to be conducted for each\nneighboring edge and the operations should be performed in the same manner for\nall edges. This constraint results in reduced expressiveness of the model while\nsimultaneously increasing computational demands. To overcome this challenge, we\nfirst implement the CG transform layer on the permutation-invariant abstract\nedges generated from real edge information. We show that this approach allows\ncomplete freedom in the design of the layer without compromising the crucial\nsymmetry. Developing on this free design space, we further propose group CG\ntransform with sparse path, abstract edges shuffling, and attention enhancer to\nform a powerful and efficient CG transform layer. 
Our method, known as FreeCG,\nachieves state-of-the-art (SOTA) results in force prediction for MD17, rMD17,\nMD22, and is well extended to property prediction in QM9 datasets with several\nimprovements greater than 15% and the maximum beyond 20%. The extensive\nreal-world applications showcase high practicality. FreeCG introduces a novel\nparadigm for carrying out efficient and expressive CG transform in future\ngeometric neural network designs. To demonstrate this, the recent SOTA,\nQuinNet, is also enhanced under our paradigm. Code will be publicly available.\n","authors":["Shihao Shao","Haoran Geng","Zun Wang","Qinghua Cui"],"pdf_url":"https://arxiv.org/pdf/2407.02263v4.pdf","comment":"25 pages, 8 tables, 11 figures"},{"id":"http://arxiv.org/abs/2409.05389v1","updated":"2024-09-09T07:34:08Z","published":"2024-09-09T07:34:08Z","title":"A Novel Representation of Periodic Pattern and Its Application to\n Untrained Anomaly Detection","summary":" There are a variety of industrial products that possess periodic textures or\nsurfaces, such as carbon fiber textiles and display panels. Traditional\nimage-based quality inspection methods for these products require identifying\nthe periodic patterns from normal images (without anomaly and noise) and\nsubsequently detecting anomaly pixels with inconsistent appearances. However,\nit remains challenging to accurately extract the periodic pattern from a single\nimage in the presence of unknown anomalies and measurement noise. To deal with\nthis challenge, this paper proposes a novel self-representation of the periodic\nimage defined on a set of continuous parameters. In this way, periodic pattern\nlearning can be embedded into a joint optimization framework, which is named\nperiodic-sparse decomposition, with simultaneously modeling the sparse\nanomalies and Gaussian noise. Finally, for the real-world industrial images\nthat may not strictly satisfy the periodic assumption, we propose a novel\npixel-level anomaly scoring strategy to enhance the performance of anomaly\ndetection. Both simulated and real-world case studies demonstrate the\neffectiveness of the proposed methodology for periodic pattern learning and\nanomaly detection.\n","authors":["Peng Ye","Chengyu Tao","Juan Du"],"pdf_url":"https://arxiv.org/pdf/2409.05389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14894v3","updated":"2024-09-09T07:07:11Z","published":"2023-10-23T13:04:15Z","title":"Local Universal Explainer (LUX) -- a rule-based explainer with factual,\n counterfactual and visual explanations","summary":" Explainable artificial intelligence (XAI) is one of the most intensively\ndeveloped area of AI in recent years. It is also one of the most fragmented\nwith multiple methods that focus on different aspects of explanations. This\nmakes difficult to obtain the full spectrum of explanation at once in a compact\nand consistent way. To address this issue, we present Local Universal Explainer\n(LUX), which is a rule-based explainer that can generate factual,\ncounterfactual and visual explanations. It is based on a modified version of\ndecision tree algorithms that allows for oblique splits and integration with\nfeature importance XAI methods such as SHAP. It limits the use data generation\nin opposite to other algorithms, but is focused on selecting local concepts in\na form of high-density clusters of real data that have the highest impact on\nforming the decision boundary of the explained model and generating artificial\nsamples with novel SHAP-guided sampling algorithm. 
We tested our method on real\nand synthetic datasets and compared it with state-of-the-art rule-based\nexplainers such as LORE, EXPLAN and Anchor. Our method outperforms the existing\napproaches in terms of simplicity, fidelity, representativeness, and\nconsistency.\n","authors":["Szymon Bobek","Grzegorz J. Nalepa"],"pdf_url":"https://arxiv.org/pdf/2310.14894v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14840v2","updated":"2024-09-09T06:57:22Z","published":"2024-08-27T07:51:26Z","title":"CL4KGE: A Curriculum Learning Method for Knowledge Graph Embedding","summary":" Knowledge graph embedding (KGE) constitutes a foundational task, directed\ntowards learning representations for entities and relations within knowledge\ngraphs (KGs), with the objective of crafting representations comprehensive\nenough to approximate the logical and symbolic interconnections among entities.\nIn this paper, we define a metric Z-counts to measure the difficulty of\ntraining each triple ($<$head entity, relation, tail entity$>$) in KGs with\ntheoretical analysis. Based on this metric, we propose \\textbf{CL4KGE}, an\nefficient \\textbf{C}urriculum \\textbf{L}earning based training strategy for\n\\textbf{KGE}. This method includes a difficulty measurer and a training\nscheduler that aids in the training of KGE models. Our approach possesses the\nflexibility to act as a plugin within a wide range of KGE models, with the\nadded advantage of adaptability to the majority of KGs in existence. The\nproposed method has been evaluated on popular KGE models, and the results\ndemonstrate that it enhances the state-of-the-art methods. The use of Z-counts\nas a metric has enabled the identification of challenging triples in KGs, which\nhelps in devising effective training strategies.\n","authors":["Yang Liu","Chuan Zhou","Peng Zhang","Yanan Cao","Yongchao Liu","Zhao Li","Hongyang Chen"],"pdf_url":"https://arxiv.org/pdf/2408.14840v2.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.05358v1","updated":"2024-09-09T06:39:56Z","published":"2024-09-09T06:39:56Z","title":"BAMDP Shaping: a Unified Theoretical Framework for Intrinsic Motivation\n and Reward Shaping","summary":" Intrinsic motivation (IM) and reward shaping are common methods for guiding\nthe exploration of reinforcement learning (RL) agents by adding pseudo-rewards.\nDesigning these rewards is challenging, however, and they can\ncounter-intuitively harm performance. To address this, we characterize them as\nreward shaping in Bayes-Adaptive Markov Decision Processes (BAMDPs), which\nformalizes the value of exploration by formulating the RL process as updating a\nprior over possible MDPs through experience. RL algorithms can be viewed as\nBAMDP policies; instead of attempting to find optimal algorithms by solving\nBAMDPs directly, we use it as a theoretical framework for understanding how\npseudo-rewards guide suboptimal algorithms. By decomposing BAMDP state value\ninto the value of the information collected plus the prior value of the\nphysical state, we show how pseudo-rewards can help by compensating for RL\nalgorithms' misestimation of these two terms, yielding a new typology of IM and\nreward shaping approaches. 
We carefully extend the potential-based shaping\ntheorem to BAMDPs to prove that when pseudo-rewards are BAMDP Potential-based\nshaping Functions (BAMPFs), they preserve optimal, or approximately optimal,\nbehavior of RL algorithms; otherwise, they can corrupt even optimal learners.\nWe finally give guidance on how to design or convert existing pseudo-rewards to\nBAMPFs by expressing assumptions about the environment as potential functions\non BAMDP states.\n","authors":["Aly Lidayan","Michael Dennis","Stuart Russell"],"pdf_url":"https://arxiv.org/pdf/2409.05358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05357v1","updated":"2024-09-09T06:35:24Z","published":"2024-09-09T06:35:24Z","title":"Attention Based Machine Learning Methods for Data Reduction with\n Guaranteed Error Bounds","summary":" Scientific applications in fields such as high energy physics, computational\nfluid dynamics, and climate science generate vast amounts of data at high\nvelocities. This exponential growth in data production is surpassing the\nadvancements in computing power, network capabilities, and storage capacities.\nTo address this challenge, data compression or reduction techniques are\ncrucial. These scientific datasets have underlying data structures that consist\nof structured and block structured multidimensional meshes where each grid\npoint corresponds to a tensor. It is important that data reduction techniques\nleverage strong spatial and temporal correlations that are ubiquitous in these\napplications. Additionally, applications such as CFD, process tensors\ncomprising hundred plus species and their attributes at each grid point.\nReduction techniques should be able to leverage interrelationships between the\nelements in each tensor. In this paper, we propose an attention-based\nhierarchical compression method utilizing a block-wise compression setup. We\nintroduce an attention-based hyper-block autoencoder to capture inter-block\ncorrelations, followed by a block-wise encoder to capture block-specific\ninformation. A PCA-based post-processing step is employed to guarantee error\nbounds for each data block. Our method effectively captures both spatiotemporal\nand inter-variable correlations within and between data blocks. Compared to the\nstate-of-the-art SZ3, our method achieves up to 8 times higher compression\nratio on the multi-variable S3D dataset. When evaluated on single-variable\nsetups using the E3SM and XGC datasets, our method still achieves up to 3 times\nand 2 times higher compression ratio, respectively.\n","authors":["Xiao Li","Jaemoon Lee","Anand Rangarajan","Sanjay Ranka"],"pdf_url":"https://arxiv.org/pdf/2409.05357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05356v1","updated":"2024-09-09T06:28:47Z","published":"2024-09-09T06:28:47Z","title":"IndicVoices-R: Unlocking a Massive Multilingual Multi-speaker Speech\n Corpus for Scaling Indian TTS","summary":" Recent advancements in text-to-speech (TTS) synthesis show that large-scale\nmodels trained with extensive web data produce highly natural-sounding output.\nHowever, such data is scarce for Indian languages due to the lack of\nhigh-quality, manually subtitled data on platforms like LibriVox or YouTube. To\naddress this gap, we enhance existing large-scale ASR datasets containing\nnatural conversations collected in low-quality environments to generate\nhigh-quality TTS training data. 
Our pipeline leverages the cross-lingual\ngeneralization of denoising and speech enhancement models trained on English\nand applied to Indian languages. This results in IndicVoices-R (IV-R), the\nlargest multilingual Indian TTS dataset derived from an ASR dataset, with 1,704\nhours of high-quality speech from 10,496 speakers across 22 Indian languages.\nIV-R matches the quality of gold-standard TTS datasets like LJSpeech, LibriTTS,\nand IndicTTS. We also introduce the IV-R Benchmark, the first to assess\nzero-shot, few-shot, and many-shot speaker generalization capabilities of TTS\nmodels on Indian voices, ensuring diversity in age, gender, and style. We\ndemonstrate that fine-tuning an English pre-trained model on a combined dataset\nof high-quality IndicTTS and our IV-R dataset results in better zero-shot\nspeaker generalization compared to fine-tuning on the IndicTTS dataset alone.\nFurther, our evaluation reveals limited zero-shot generalization for Indian\nvoices in TTS models trained on prior datasets, which we improve by fine-tuning\nthe model on our data containing diverse set of speakers across language\nfamilies. We open-source all data and code, releasing the first TTS model for\nall 22 official Indian languages.\n","authors":["Ashwin Sankar","Srija Anand","Praveen Srinivasa Varadhan","Sherry Thomas","Mehak Singal","Shridhar Kumar","Deovrat Mehendale","Aditi Krishana","Giri Raju","Mitesh Khapra"],"pdf_url":"https://arxiv.org/pdf/2409.05356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05354v1","updated":"2024-09-09T06:27:54Z","published":"2024-09-09T06:27:54Z","title":"Recursive Nested Filtering for Efficient Amortized Bayesian Experimental\n Design","summary":" This paper introduces the Inside-Out Nested Particle Filter (IO-NPF), a\nnovel, fully recursive, algorithm for amortized sequential Bayesian\nexperimental design in the non-exchangeable setting. We frame policy\noptimization as maximum likelihood estimation in a non-Markovian state-space\nmodel, achieving (at most) $\\mathcal{O}(T^2)$ computational complexity in the\nnumber of experiments. We provide theoretical convergence guarantees and\nintroduce a backward sampling algorithm to reduce trajectory degeneracy. IO-NPF\noffers a practical, extensible, and provably consistent approach to sequential\nBayesian experimental design, demonstrating improved efficiency over existing\nmethods.\n","authors":["Sahel Iqbal","Hany Abdulsamad","Sara Pérez-Vieites","Simo Särkkä","Adrien Corenflos"],"pdf_url":"https://arxiv.org/pdf/2409.05354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14482v2","updated":"2024-09-09T06:19:07Z","published":"2024-07-19T17:35:47Z","title":"ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG\n Capabilities","summary":" In this work, we introduce ChatQA 2, an Llama 3.0-based model with a 128K\ncontext window, designed to bridge the gap between open-source LLMs and leading\nproprietary models (e.g., GPT-4-Turbo) in long-context understanding and\nretrieval-augmented generation (RAG) capabilities. These two capabilities are\nessential for LLMs to process large volumes of information that cannot fit into\na single prompt and are complementary to each other, depending on the\ndownstream tasks and computational budgets. 
We present a detailed continued\ntraining recipe to extend the context window of Llama3-70B-base from 8K to 128K\ntokens, along with a three-stage instruction tuning process to enhance the\nmodel's instruction-following, RAG performance, and long-context understanding\ncapabilities. Our results demonstrate that the Llama3-ChatQA-2-70B model\noutperforms most existing state-of-the-art models, including\nGPT-4-Turbo-2024-04-09, Qwen2-72B-Instruct, and Llama3.1-70B-Instruct, on\nultra-long tasks beyond 100K tokens, as well as on the RAG benchmark using only\na 4K context window, showing the strong long context capability across varying\nsequence lengths. We further provide extensive comparisons between direct\nlong-context and RAG solutions using the same state-of-the-art long-context\nLLMs. Interestingly, we find that the performance of strong long-context LLMs\nusing RAG improves when retrieving a larger number of chunks. With a large set\nof top-k chunks, RAG consistently outperforms direct long-context solution\nusing the same state-of-the-art long-context models (e.g., Llama3-ChatQA-2-70B\nand Qwen2-72B-Instruct) on both 32K benchmarks and real-world 128K tasks. To\nadvance research in this field, we open-sourced the model weights, training\ndata, and the evaluation setup for the community:\nhttps://chatqa2-project.github.io/\n","authors":["Peng Xu","Wei Ping","Xianchao Wu","Chejian Xu","Zihan Liu","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2407.14482v2.pdf","comment":"v2: major update with significantly improved results"},{"id":"http://arxiv.org/abs/2409.05349v1","updated":"2024-09-09T06:10:31Z","published":"2024-09-09T06:10:31Z","title":"On the Convergence Analysis of Over-Parameterized Variational\n Autoencoders: A Neural Tangent Kernel Perspective","summary":" Variational Auto-Encoders (VAEs) have emerged as powerful probabilistic\nmodels for generative tasks. However, their convergence properties have not\nbeen rigorously proven. The challenge of proving convergence is inherently\ndifficult due to the highly non-convex nature of the training objective and the\nimplementation of a Stochastic Neural Network (SNN) within VAE architectures.\nThis paper addresses these challenges by characterizing the optimization\ntrajectory of SNNs utilized in VAEs through the lens of Neural Tangent Kernel\n(NTK) techniques. These techniques govern the optimization and generalization\nbehaviors of ultra-wide neural networks. We provide a mathematical proof of VAE\nconvergence under mild assumptions, thus advancing the theoretical\nunderstanding of VAE optimization dynamics. Furthermore, we establish a novel\nconnection between the optimization problem faced by over-parameterized SNNs\nand the Kernel Ridge Regression (KRR) problem. Our findings not only contribute\nto the theoretical foundation of VAEs but also open new avenues for\ninvestigating the optimization of generative models using advanced kernel\nmethods. 
Our theoretical claims are verified by experimental simulations.\n","authors":["Li Wang","Wei Huang"],"pdf_url":"https://arxiv.org/pdf/2409.05349v1.pdf","comment":"Accepted by Machine Learning journal"},{"id":"http://arxiv.org/abs/2409.05347v1","updated":"2024-09-09T06:04:42Z","published":"2024-09-09T06:04:42Z","title":"TriplePlay: Enhancing Federated Learning with CLIP for Non-IID Data and\n Resource Efficiency","summary":" The rapid advancement and increasing complexity of pretrained models,\nexemplified by CLIP, offer significant opportunities as well as challenges for\nFederated Learning (FL), a critical component of privacy-preserving artificial\nintelligence. This research delves into the intricacies of integrating large\nfoundation models like CLIP within FL frameworks to enhance privacy,\nefficiency, and adaptability across heterogeneous data landscapes. It\nspecifically addresses the challenges posed by non-IID data distributions, the\ncomputational and communication overheads of leveraging such complex models,\nand the skewed representation of classes within datasets. We propose\nTriplePlay, a framework that integrates CLIP as an adapter to enhance FL's\nadaptability and performance across diverse data distributions. This approach\naddresses the long-tail distribution challenge to ensure fairness while\nreducing resource demands through quantization and low-rank adaptation\ntechniques.Our simulation results demonstrate that TriplePlay effectively\ndecreases GPU usage costs and speeds up the learning process, achieving\nconvergence with reduced communication overhead.\n","authors":["Ahmed Imteaj","Md Zarif Hossain","Saika Zaman","Abdur R. Shahid"],"pdf_url":"https://arxiv.org/pdf/2409.05347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05346v1","updated":"2024-09-09T06:04:41Z","published":"2024-09-09T06:04:41Z","title":"GDFlow: Anomaly Detection with NCDE-based Normalizing Flow for Advanced\n Driver Assistance System","summary":" For electric vehicles, the Adaptive Cruise Control (ACC) in Advanced Driver\nAssistance Systems (ADAS) is designed to assist braking based on driving\nconditions, road inclines, predefined deceleration strengths, and user braking\npatterns. However, the driving data collected during the development of ADAS\nare generally limited and lack diversity. This deficiency leads to late or\naggressive braking for different users. Crucially, it is necessary to\neffectively identify anomalies, such as unexpected or inconsistent braking\npatterns in ADAS, especially given the challenge of working with unlabelled,\nlimited, and noisy datasets from real-world electric vehicles. In order to\ntackle the aforementioned challenges in ADAS, we propose Graph Neural\nControlled Differential Equation Normalizing Flow (GDFlow), a model that\nleverages Normalizing Flow (NF) with Neural Controlled Differential Equations\n(NCDE) to learn the distribution of normal driving patterns continuously.\nCompared to the traditional clustering or anomaly detection algorithms, our\napproach effectively captures the spatio-temporal information from different\nsensor data and more accurately models continuous changes in driving patterns.\nAdditionally, we introduce a quantile-based maximum likelihood objective to\nimprove the likelihood estimate of the normal data near the boundary of the\ndistribution, enhancing the model's ability to distinguish between normal and\nanomalous patterns. 
We validate GDFlow using real-world electric vehicle\ndriving data that we collected from Hyundai IONIQ5 and GV80EV, achieving\nstate-of-the-art performance compared to six baselines across four dataset\nconfigurations of different vehicle types and drivers. Furthermore, our model\noutperforms the latest anomaly detection methods across four time series\nbenchmark datasets. Our approach demonstrates superior efficiency in inference\ntime compared to existing methods.\n","authors":["Kangjun Lee","Minha Kim","Youngho Jun","Simon S. Woo"],"pdf_url":"https://arxiv.org/pdf/2409.05346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05345v1","updated":"2024-09-09T06:03:23Z","published":"2024-09-09T06:03:23Z","title":"Robust Non-adaptive Group Testing under Errors in Group Membership\n Specifications","summary":" Given $p$ samples, each of which may or may not be defective, group testing\n(GT) aims to determine their defect status by performing tests on $n < p$\n`groups', where a group is formed by mixing a subset of the $p$ samples.\nAssuming that the number of defective samples is very small compared to $p$, GT\nalgorithms have provided excellent recovery of the status of all $p$ samples\nwith even a small number of groups. Most existing methods, however, assume that\nthe group memberships are accurately specified. This assumption may not always\nbe true in all applications, due to various resource constraints. Such errors\ncould occur, eg, when a technician, preparing the groups in a laboratory,\nunknowingly mixes together an incorrect subset of samples as compared to what\nwas specified. We develop a new GT method, the Debiased Robust Lasso Test\nMethod (DRLT), that handles such group membership specification errors. The\nproposed DRLT method is based on an approach to debias, or reduce the inherent\nbias in, estimates produced by Lasso, a popular and effective sparse regression\ntechnique. We also provide theoretical upper bounds on the reconstruction error\nproduced by our estimator. Our approach is then combined with two carefully\ndesigned hypothesis tests respectively for (i) the identification of defective\nsamples in the presence of errors in group membership specifications, and (ii)\nthe identification of groups with erroneous membership specifications. The DRLT\napproach extends the literature on bias mitigation of statistical estimators\nsuch as the LASSO, to handle the important case when some of the measurements\ncontain outliers, due to factors such as group membership specification errors.\nWe present numerical results which show that our approach outperforms several\nbaselines and robust regression techniques for identification of defective\nsamples as well as erroneously specified groups.\n","authors":["Shuvayan Banerjee","Radhendushka Srivastava","James Saunderson","Ajit Rajwade"],"pdf_url":"https://arxiv.org/pdf/2409.05345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00629v2","updated":"2024-09-09T06:01:24Z","published":"2024-09-01T06:26:42Z","title":"Assessing the Impact of Upselling in Online Fantasy Sports","summary":" This study explores the impact of upselling on user engagement. We model\nusers' deposit behaviour on the fantasy sports platform Dream11. Subsequently,\nwe develop an experimental framework to evaluate the effect of upselling using\nan intensity parameter. Our live experiments on user deposit behaviour reveal\ndecreased user recall with heightened upselling intensity. 
Our findings\nindicate that increased upselling intensity improves user deposit metrics and\nconcurrently diminishes user satisfaction and conversion rates. We conduct\nrobust counterfactual analysis and train causal meta-learners to personalise\nusers' upselling intensity levels to reach an optimal trade-off point.\n","authors":["Aayush Chaudhary"],"pdf_url":"https://arxiv.org/pdf/2409.00629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03777v2","updated":"2024-09-09T05:58:29Z","published":"2024-08-22T03:59:57Z","title":"A Greedy Hierarchical Approach to Whole-Network Filter-Pruning in CNNs","summary":" Deep convolutional neural networks (CNNs) have achieved impressive\nperformance in many computer vision tasks. However, their large model sizes\nrequire heavy computational resources, making pruning redundant filters from\nexisting pre-trained CNNs an essential task in developing efficient models for\nresource-constrained devices. Whole-network filter pruning algorithms prune\nvarying fractions of filters from each layer, hence providing greater\nflexibility. Current whole-network pruning methods are either computationally\nexpensive due to the need to calculate the loss for each pruned filter using a\ntraining dataset, or use various heuristic / learned criteria for determining\nthe pruning fractions for each layer. This paper proposes a two-level\nhierarchical approach for whole-network filter pruning which is efficient and\nuses the classification loss as the final criterion. The lower-level algorithm\n(called filter-pruning) uses a sparse-approximation formulation based on linear\napproximation of filter weights. We explore two algorithms: orthogonal matching\npursuit-based greedy selection and a greedy backward pruning approach. The\nbackward pruning algorithm uses a novel closed-form error criterion for\nefficiently selecting the optimal filter at each stage, thus making the whole\nalgorithm much faster. The higher-level algorithm (called layer-selection)\ngreedily selects the best-pruned layer (pruning using the filter-selection\nalgorithm) using a global pruning criterion. We propose algorithms for two\ndifferent global-pruning criteria: (1) layer-wise relative error (HBGS), and\n(2) final classification error (HBGTS). Our suite of algorithms outperforms\nstate-of-the-art pruning methods on ResNet18, ResNet32, ResNet56, VGG16, and\nResNext101. Our method reduces the RAM requirement for ResNext101 from 7.6 GB\nto 1.5 GB and achieves a 94% reduction in FLOPS without losing accuracy on\nCIFAR-10.\n","authors":["Kiran Purohit","Anurag Reddy Parvathgari","Sourangshu Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2409.03777v2.pdf","comment":"Accepted in TMLR 2024"},{"id":"http://arxiv.org/abs/2409.05339v1","updated":"2024-09-09T05:31:51Z","published":"2024-09-09T05:31:51Z","title":"Graffin: Stand for Tails in Imbalanced Node Classification","summary":" Graph representation learning (GRL) models have succeeded in many scenarios.\nReal-world graphs have imbalanced distribution, such as node labels and\ndegrees, which leaves a critical challenge to GRL. Imbalanced inputs can lead\nto imbalanced outputs. However, most existing works ignore it and assume that\nthe distribution of input graphs is balanced, which cannot align with real\nsituations, resulting in worse model performance on tail data. The domination\nof head data makes tail data underrepresented when training graph neural\nnetworks (GNNs). 
Thus, we propose Graffin, a pluggable tail data augmentation\nmodule, to address the above issues. Inspired by recurrent neural networks\n(RNNs), Graffin flows head features into tail data through graph serialization\ntechniques to alleviate the imbalance of tail representation. The local and\nglobal structures are fused to form the node representation under the combined\neffect of neighborhood and sequence information, which enriches the semantics\nof tail data. We validate the performance of Graffin on four real-world\ndatasets in node classification tasks. Results show that Graffin can improve\nthe adaptation to tail data without significantly degrading the overall model\nperformance.\n","authors":["Xiaorui Qi","Yanlong Wen","Xiaojie Yuan"],"pdf_url":"https://arxiv.org/pdf/2409.05339v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2409.05335v1","updated":"2024-09-09T05:26:33Z","published":"2024-09-09T05:26:33Z","title":"A Multi-Modal Deep Learning Based Approach for House Price Prediction","summary":" Accurate prediction of house price, a vital aspect of the residential real\nestate sector, is of substantial interest for a wide range of stakeholders.\nHowever, predicting house prices is a complex task due to the significant\nvariability influenced by factors such as house features, location,\nneighborhood, and many others. Despite numerous attempts utilizing a wide array\nof algorithms, including recent deep learning techniques, to predict house\nprices accurately, existing approaches have fallen short of considering a wide\nrange of factors such as textual and visual features. This paper addresses this\ngap by comprehensively incorporating attributes, such as features, textual\ndescriptions, geo-spatial neighborhood, and house images, typically showcased\nin real estate listings in a house price prediction system. Specifically, we\npropose a multi-modal deep learning approach that leverages different types of\ndata to learn more accurate representation of the house. In particular, we\nlearn a joint embedding of raw house attributes, geo-spatial neighborhood, and\nmost importantly from textual description and images representing the house;\nand finally use a downstream regression model to predict the house price from\nthis jointly learned embedding vector. Our experimental results with a\nreal-world dataset show that the text embedding of the house advertisement\ndescription and image embedding of the house pictures in addition to raw\nattributes and geo-spatial embedding, can significantly improve the house price\nprediction accuracy. The relevant source code and dataset are publicly\naccessible at the following URL: https://github.com/4P0N/mhpp\n","authors":["Md Hasebul Hasan","Md Abid Jahan","Mohammed Eunus Ali","Yuan-Fang Li","Timos Sellis"],"pdf_url":"https://arxiv.org/pdf/2409.05335v1.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2409.05327v1","updated":"2024-09-09T04:42:57Z","published":"2024-09-09T04:42:57Z","title":"ICPR 2024 Competition on Safe Segmentation of Drive Scenes in\n Unstructured Traffic and Adverse Weather Conditions","summary":" The ICPR 2024 Competition on Safe Segmentation of Drive Scenes in\nUnstructured Traffic and Adverse Weather Conditions served as a rigorous\nplatform to evaluate and benchmark state-of-the-art semantic segmentation\nmodels under challenging conditions for autonomous driving. 
Over several\nmonths, participants were provided with the IDD-AW dataset, consisting of 5000\nhigh-quality RGB-NIR image pairs, each annotated at the pixel level and\ncaptured under adverse weather conditions such as rain, fog, low light, and\nsnow. A key aspect of the competition was the use and improvement of the Safe\nmean Intersection over Union (Safe mIoU) metric, designed to penalize unsafe\nincorrect predictions that could be overlooked by traditional mIoU. This\ninnovative metric emphasized the importance of safety in developing autonomous\ndriving systems. The competition showed significant advancements in the field,\nwith participants demonstrating models that excelled in semantic segmentation\nand prioritized safety and robustness in unstructured and adverse conditions.\nThe results of the competition set new benchmarks in the domain, highlighting\nthe critical role of safety in deploying autonomous vehicles in real-world\nscenarios. The contributions from this competition are expected to drive\nfurther innovation in autonomous driving technology, addressing the critical\nchallenges of operating in diverse and unpredictable environments.\n","authors":["Furqan Ahmed Shaik","Sandeep Nagar","Aiswarya Maturi","Harshit Kumar Sankhla","Dibyendu Ghosh","Anshuman Majumdar","Srikanth Vidapanakal","Kunal Chaudhary","Sunny Manchanda","Girish Varma"],"pdf_url":"https://arxiv.org/pdf/2409.05327v1.pdf","comment":"15 pages, 7 figures, ICPR Competition Paper"},{"id":"http://arxiv.org/abs/2403.19159v2","updated":"2024-09-09T04:39:31Z","published":"2024-03-28T06:03:47Z","title":"Disentangling Length from Quality in Direct Preference Optimization","summary":" Reinforcement Learning from Human Feedback (RLHF) has been a crucial\ncomponent in the recent success of Large Language Models. However, RLHF is known\nto exploit biases in human preferences, such as verbosity. A well-formatted and\neloquent answer is often more highly rated by users, even when it is less\nhelpful and objective. A number of approaches have been developed to control\nthose biases in the classical RLHF literature, but the problem remains\nrelatively under-explored for Direct Alignment Algorithms such as Direct\nPreference Optimization (DPO). Unlike classical RLHF, DPO does not train a\nseparate reward model or use reinforcement learning directly, so previous\napproaches developed to control verbosity cannot be directly applied to this\nsetting. Our work makes several contributions. For the first time, we study the\nlength problem in the DPO setting, showing significant exploitation in DPO and\nlinking it to out-of-distribution bootstrapping. We then develop a principled\nbut simple regularization strategy that prevents length exploitation, while\nstill maintaining improvements in model quality. We demonstrate these effects\nacross datasets on summarization and dialogue, where we achieve up to 20\\%\nimprovement in win rates when controlling for length, despite the GPT4 judge's\nwell-known verbosity bias.\n","authors":["Ryan Park","Rafael Rafailov","Stefano Ermon","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2403.19159v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02802v2","updated":"2024-09-09T04:36:10Z","published":"2024-09-04T15:22:08Z","title":"Boosting Certificate Robustness for Time Series Classification with\n Efficient Self-Ensemble","summary":" Recently, the issue of adversarial robustness in the time series domain has\ngarnered significant attention. 
However, the available defense mechanisms\nremain limited, with adversarial training being the predominant approach,\nthough it does not provide theoretical guarantees. Randomized Smoothing has\nemerged as a standout method due to its ability to certify a provable lower\nbound on robustness radius under $\\ell_p$-ball attacks. Recognizing its\nsuccess, research in the time series domain has started focusing on these\naspects. However, existing research predominantly focuses on time series\nforecasting, or under the non-$\\ell_p$ robustness in statistic feature\naugmentation for time series classification~(TSC). Our review found that\nRandomized Smoothing performs modestly in TSC, struggling to provide effective\nassurances on datasets with poor robustness. Therefore, we propose a\nself-ensemble method to enhance the lower bound of the probability confidence\nof predicted labels by reducing the variance of classification margins, thereby\ncertifying a larger radius. This approach also addresses the computational\noverhead issue of Deep Ensemble~(DE) while remaining competitive and, in some\ncases, outperforming it in terms of robustness. Both theoretical analysis and\nexperimental results validate the effectiveness of our method, demonstrating\nsuperior performance in robustness testing compared to baseline approaches.\n","authors":["Chang Dong","Zhengyang Li","Liangwei Zheng","Weitong Chen","Wei Emma Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.02802v2.pdf","comment":"6 figures, 4 tables, 10 pages"},{"id":"http://arxiv.org/abs/2409.05325v1","updated":"2024-09-09T04:36:06Z","published":"2024-09-09T04:36:06Z","title":"Sample-Efficient Bayesian Optimization with Transfer Learning for\n Heterogeneous Search Spaces","summary":" Bayesian optimization (BO) is a powerful approach to sample-efficient\noptimization of black-box functions. However, in settings with very few\nfunction evaluations, a successful application of BO may require transferring\ninformation from historical experiments. These related experiments may not have\nexactly the same tunable parameters (search spaces), motivating the need for BO\nwith transfer learning for heterogeneous search spaces. In this paper, we\npropose two methods for this setting. The first approach leverages a Gaussian\nprocess (GP) model with a conditional kernel to transfer information between\ndifferent search spaces. Our second approach treats the missing parameters as\nhyperparameters of the GP model that can be inferred jointly with the other GP\nhyperparameters or set to fixed values. We show that these two methods perform\nwell on several benchmark problems.\n","authors":["Aryan Deshwal","Sait Cakmak","Yuhou Xia","David Eriksson"],"pdf_url":"https://arxiv.org/pdf/2409.05325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13964v4","updated":"2024-09-09T04:00:44Z","published":"2024-04-22T08:10:38Z","title":"An Economic Solution to Copyright Challenges of Generative AI","summary":" Generative artificial intelligence (AI) systems are trained on large data\ncorpora to generate new pieces of text, images, videos, and other media. There\nis growing concern that such systems may infringe on the copyright interests of\ntraining data contributors. To address the copyright challenges of generative\nAI, we propose a framework that compensates copyright owners proportionally to\ntheir contributions to the creation of AI-generated content. 
The metric for\ncontributions is quantitatively determined by leveraging the probabilistic\nnature of modern generative AI models and using techniques from cooperative\ngame theory in economics. This framework enables a platform where AI developers\nbenefit from access to high-quality training data, thus improving model\nperformance. Meanwhile, copyright owners receive fair compensation, driving the\ncontinued provision of relevant data for generative model training. Experiments\ndemonstrate that our framework successfully identifies the most relevant data\nsources used in artwork generation, ensuring a fair and interpretable\ndistribution of revenues among copyright owners.\n","authors":["Jiachen T. Wang","Zhun Deng","Hiroaki Chiba-Okabe","Boaz Barak","Weijie J. Su"],"pdf_url":"https://arxiv.org/pdf/2404.13964v4.pdf","comment":"Add additional experiments on language domain"},{"id":"http://arxiv.org/abs/2409.05314v1","updated":"2024-09-09T03:58:51Z","published":"2024-09-09T03:58:51Z","title":"Tele-LLMs: A Series of Specialized Large Language Models for\n Telecommunications","summary":" The emergence of large language models (LLMs) has significantly impacted\nvarious fields, from natural language processing to sectors like medicine and\nfinance. However, despite their rapid proliferation, the applications of LLMs\nin telecommunications remain limited, often relying on general-purpose models\nthat lack domain-specific specialization. This lack of specialization results\nin underperformance, particularly when dealing with telecommunications-specific\ntechnical terminology and their associated mathematical representations. This\npaper addresses this gap by first creating and disseminating Tele-Data, a\ncomprehensive dataset of telecommunications material curated from relevant\nsources, and Tele-Eval, a large-scale question-and-answer dataset tailored to\nthe domain. Through extensive experiments, we explore the most effective\ntraining techniques for adapting LLMs to the telecommunications domain, ranging\nfrom examining the division of expertise across various telecommunications\naspects to employing parameter-efficient techniques. We also investigate how\nmodels of different sizes behave during adaptation and analyze the impact of\ntheir training data on this behavior. Leveraging these findings, we develop and\nopen-source Tele-LLMs, the first series of language models ranging from 1B to\n8B parameters, specifically tailored for telecommunications. Our evaluations\ndemonstrate that these models outperform their general-purpose counterparts on\nTele-Eval while retaining their previously acquired capabilities, thus avoiding\nthe catastrophic forgetting phenomenon.\n","authors":["Ali Maatouk","Kenny Chirino Ampudia","Rex Ying","Leandros Tassiulas"],"pdf_url":"https://arxiv.org/pdf/2409.05314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17329v3","updated":"2024-09-09T03:36:47Z","published":"2023-12-28T19:09:56Z","title":"PINN surrogate of Li-ion battery models for parameter inference. Part I:\n Implementation and multi-fidelity hierarchies for the single-particle model","summary":" To plan and optimize energy storage demands that account for Li-ion battery\naging dynamics, techniques need to be developed to diagnose battery internal\nstates accurately and rapidly. 
This study seeks to reduce the computational\nresources needed to determine a battery's internal states by replacing\nphysics-based Li-ion battery models -- such as the single-particle model (SPM)\nand the pseudo-2D (P2D) model -- with a physics-informed neural network (PINN)\nsurrogate. The surrogate model makes high-throughput techniques, such as\nBayesian calibration, tractable to determine battery internal parameters from\nvoltage responses. This manuscript is the first of a two-part series that\nintroduces PINN surrogates of Li-ion battery models for parameter inference\n(i.e., state-of-health diagnostics). In this first part, a method is presented\nfor constructing a PINN surrogate of the SPM. A multi-fidelity hierarchical\ntraining, where several neural nets are trained with multiple physics-loss\nfidelities is shown to significantly improve the surrogate accuracy when only\ntraining on the governing equation residuals. The implementation is made\navailable in a companion repository (https://github.com/NREL/pinnstripes). The\ntechniques used to develop a PINN surrogate of the SPM are extended in Part II\nfor the PINN surrogate for the P2D battery model, and explore the Bayesian\ncalibration capabilities of both surrogates.\n","authors":["Malik Hassanaly","Peter J. Weddle","Ryan N. King","Subhayan De","Alireza Doostan","Corey R. Randall","Eric J. Dufek","Andrew M. Colclasure","Kandler Smith"],"pdf_url":"https://arxiv.org/pdf/2312.17329v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05305v1","updated":"2024-09-09T03:26:07Z","published":"2024-09-09T03:26:07Z","title":"Closed-Form Interpretation of Neural Network Latent Spaces with Symbolic\n Gradients","summary":" It has been demonstrated in many scientific fields that artificial neural\nnetworks like autoencoders or Siamese networks encode meaningful concepts in\ntheir latent spaces. However, there does not exist a comprehensive framework\nfor retrieving this information in a human-readable form without prior\nknowledge. In order to extract these concepts, we introduce a framework for\nfinding closed-form interpretations of neurons in latent spaces of artificial\nneural networks. The interpretation framework is based on embedding trained\nneural networks into an equivalence class of functions that encode the same\nconcept. We interpret these neural networks by finding an intersection between\nthe equivalence class and human-readable equations defined by a symbolic search\nspace. The approach is demonstrated by retrieving invariants of matrices and\nconserved quantities of dynamical systems from latent spaces of Siamese neural\nnetworks.\n","authors":["Zakaria Patel","Sebastian J. Wetzel"],"pdf_url":"https://arxiv.org/pdf/2409.05305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05303v1","updated":"2024-09-09T03:17:28Z","published":"2024-09-09T03:17:28Z","title":"Resource-Efficient Generative AI Model Deployment in Mobile Edge\n Networks","summary":" The surging development of Artificial Intelligence-Generated Content (AIGC)\nmarks a transformative era of the content creation and production. Edge servers\npromise attractive benefits, e.g., reduced service delay and backhaul traffic\nload, for hosting AIGC services compared to cloud-based solutions. However, the\nscarcity of available resources on the edge pose significant challenges in\ndeploying generative AI models. 
In this paper, by characterizing the resource\nand delay demands of typical generative AI models, we find that the consumption\nof storage and GPU memory, as well as the model switching delay represented by\nI/O delay during the preloading phase, are significant and vary across models.\nThese multidimensional coupling factors render it difficult to make efficient\nedge model deployment decisions. Hence, we present a collaborative edge-cloud\nframework aiming to properly manage generative AI model deployment on the edge.\nSpecifically, we formulate edge model deployment problem considering\nheterogeneous features of models as an optimization problem, and propose a\nmodel-level decision selection algorithm to solve it. It enables pooled\nresource sharing and optimizes the trade-off between resource consumption and\ndelay in edge generative AI model deployment. Simulation results validate the\nefficacy of the proposed algorithm compared with baselines, demonstrating its\npotential to reduce overall costs by providing feature-aware model deployment\ndecisions.\n","authors":["Yuxin Liang","Peng Yang","Yuanyuan He","Feng Lyu"],"pdf_url":"https://arxiv.org/pdf/2409.05303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14066v3","updated":"2024-09-09T03:06:21Z","published":"2024-07-19T06:50:24Z","title":"360VFI: A Dataset and Benchmark for Omnidirectional Video Frame\n Interpolation","summary":" Head-mounted 360{\\deg} displays and portable 360{\\deg} cameras have\nsignificantly progressed, providing viewers a realistic and immersive\nexperience. However, many omnidirectional videos have low frame rates that can\nlead to visual fatigue, and the prevailing plane frame interpolation\nmethodologies are unsuitable for omnidirectional video interpolation because\nthey are designed solely for traditional videos. This paper introduces the\nbenchmark dataset, 360VFI, for Omnidirectional Video Frame Interpolation. We\npresent a practical implementation that introduces a distortion prior from\nomnidirectional video into the network to modulate distortions. Specifically,\nwe propose a pyramid distortion-sensitive feature extractor that uses the\nunique characteristics of equirectangular projection (ERP) format as prior\ninformation. Moreover, we devise a decoder that uses an affine transformation\nto further facilitate the synthesis of intermediate frames. 360VFI is the first\ndataset and benchmark that explores the challenge of Omnidirectional Video\nFrame Interpolation. Through our benchmark analysis, we present four different\ndistortion condition scenes in the proposed 360VFI dataset to evaluate the\nchallenges triggered by distortion during interpolation. Besides, experimental\nresults demonstrate that Omnidirectional Video Interpolation can be effectively\nimproved by modeling for omnidirectional distortion.\n","authors":["Wenxuan Lu","Mengshun Hu","Yansheng Qiu","Liang Liao","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2407.14066v3.pdf","comment":"This is a preprint version"},{"id":"http://arxiv.org/abs/2409.05294v1","updated":"2024-09-09T03:02:16Z","published":"2024-09-09T03:02:16Z","title":"TERD: A Unified Framework for Safeguarding Diffusion Models Against\n Backdoors","summary":" Diffusion models have achieved notable success in image generation, but they\nremain highly vulnerable to backdoor attacks, which compromise their integrity\nby producing specific undesirable outputs when presented with a pre-defined\ntrigger. 
In this paper, we investigate how to protect diffusion models from\nthis dangerous threat. Specifically, we propose TERD, a backdoor defense\nframework that builds unified modeling for current attacks, which enables us to\nderive an accessible reversed loss. A trigger reversion strategy is further\nemployed: an initial approximation of the trigger through noise sampled from a\nprior distribution, followed by refinement through differential multi-step\nsamplers. Additionally, with the reversed trigger, we propose backdoor\ndetection from the noise space, introducing the first backdoor input detection\napproach for diffusion models and a novel model detection algorithm that\ncalculates the KL divergence between reversed and benign distributions.\nExtensive evaluations demonstrate that TERD secures a 100% True Positive Rate\n(TPR) and True Negative Rate (TNR) across datasets of varying resolutions. TERD\nalso demonstrates nice adaptability to other Stochastic Differential Equation\n(SDE)-based models. Our code is available at https://github.com/PKU-ML/TERD.\n","authors":["Yichuan Mo","Hui Huang","Mingjie Li","Ang Li","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2409.05294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05292v1","updated":"2024-09-09T03:00:53Z","published":"2024-09-09T03:00:53Z","title":"Mpox Narrative on Instagram: A Labeled Multilingual Dataset of Instagram\n Posts on Mpox for Sentiment, Hate Speech, and Anxiety Analysis","summary":" The world is currently experiencing an outbreak of mpox, which has been\ndeclared a Public Health Emergency of International Concern by WHO. No prior\nwork related to social media mining has focused on the development of a dataset\nof Instagram posts about the mpox outbreak. The work presented in this paper\naims to address this research gap and makes two scientific contributions to\nthis field. First, it presents a multilingual dataset of 60,127 Instagram posts\nabout mpox, published between July 23, 2022, and September 5, 2024. The\ndataset, available at https://dx.doi.org/10.21227/7fvc-y093, contains Instagram\nposts about mpox in 52 languages. For each of these posts, the Post ID, Post\nDescription, Date of publication, language, and translated version of the post\n(translation to English was performed using the Google Translate API) are\npresented as separate attributes in the dataset. After developing this dataset,\nsentiment analysis, hate speech detection, and anxiety or stress detection were\nperformed. This process included classifying each post into (i) one of the\nsentiment classes, i.e., fear, surprise, joy, sadness, anger, disgust, or\nneutral, (ii) hate or not hate, and (iii) anxiety/stress detected or no\nanxiety/stress detected. These results are presented as separate attributes in\nthe dataset. Second, this paper presents the results of performing sentiment\nanalysis, hate speech analysis, and anxiety or stress analysis. The variation\nof the sentiment classes - fear, surprise, joy, sadness, anger, disgust, and\nneutral were observed to be 27.95%, 2.57%, 8.69%, 5.94%, 2.69%, 1.53%, and\n50.64%, respectively. 
In terms of hate speech detection, 95.75% of the posts\ndid not contain hate and the remaining 4.25% of the posts contained hate.\nFinally, 72.05% of the posts did not indicate any anxiety/stress, and the\nremaining 27.95% of the posts represented some form of anxiety/stress.\n","authors":["Nirmalya Thakur"],"pdf_url":"https://arxiv.org/pdf/2409.05292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05291v1","updated":"2024-09-09T02:59:17Z","published":"2024-09-09T02:59:17Z","title":"Towards Fast Rates for Federated and Multi-Task Reinforcement Learning","summary":" We consider a setting involving $N$ agents, where each agent interacts with\nan environment modeled as a Markov Decision Process (MDP). The agents' MDPs\ndiffer in their reward functions, capturing heterogeneous objectives/tasks. The\ncollective goal of the agents is to communicate intermittently via a central\nserver to find a policy that maximizes the average of long-term cumulative\nrewards across environments. The limited existing work on this topic either\nonly provide asymptotic rates, or generate biased policies, or fail to\nestablish any benefits of collaboration. In response, we propose Fast-FedPG - a\nnovel federated policy gradient algorithm with a carefully designed\nbias-correction mechanism. Under a gradient-domination condition, we prove that\nour algorithm guarantees (i) fast linear convergence with exact gradients, and\n(ii) sub-linear rates that enjoy a linear speedup w.r.t. the number of agents\nwith noisy, truncated policy gradients. Notably, in each case, the convergence\nis to a globally optimal policy with no heterogeneity-induced bias. In the\nabsence of gradient-domination, we establish convergence to a first-order\nstationary point at a rate that continues to benefit from collaboration.\n","authors":["Feng Zhu","Robert W. Heath Jr.","Aritra Mitra"],"pdf_url":"https://arxiv.org/pdf/2409.05291v1.pdf","comment":"Accepted to the Decision and Control Conference (CDC), 2024"},{"id":"http://arxiv.org/abs/2408.16975v2","updated":"2024-09-09T02:44:49Z","published":"2024-08-30T02:36:36Z","title":"Technical Report of HelixFold3 for Biomolecular Structure Prediction","summary":" The AlphaFold series has transformed protein structure prediction with\nremarkable accuracy, often matching experimental methods. AlphaFold2,\nAlphaFold-Multimer, and the latest AlphaFold3 represent significant strides in\npredicting single protein chains, protein complexes, and biomolecular\nstructures. While AlphaFold2 and AlphaFold-Multimer are open-sourced,\nfacilitating rapid and reliable predictions, AlphaFold3 remains partially\naccessible through a limited online server and has not been open-sourced,\nrestricting further development. To address these challenges, the PaddleHelix\nteam is developing HelixFold3, aiming to replicate AlphaFold3's capabilities.\nUsing insights from previous models and extensive datasets, HelixFold3 achieves\nan accuracy comparable to AlphaFold3 in predicting the structures of\nconventional ligands, nucleic acids, and proteins. The initial release of\nHelixFold3 is available as open source on GitHub for academic research,\npromising to advance biomolecular research and accelerate discoveries. 
We also\nprovide online service at PaddleHelix website at\nhttps://paddlehelix.baidu.com/app/all/helixfold3/forecast.\n","authors":["Lihang Liu","Shanzhuo Zhang","Yang Xue","Xianbin Ye","Kunrui Zhu","Yuxin Li","Yang Liu","Wenlai Zhao","Hongkun Yu","Zhihua Wu","Xiaonan Zhang","Xiaomin Fang"],"pdf_url":"https://arxiv.org/pdf/2408.16975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05284v1","updated":"2024-09-09T02:32:45Z","published":"2024-09-09T02:32:45Z","title":"Efficiently Learning Markov Random Fields from Dynamics","summary":" An important task in high-dimensional statistics is learning the parameters\nor dependency structure of an undirected graphical model, or Markov random\nfield (MRF). Much of the prior work on this problem assumes access to i.i.d.\nsamples from the MRF distribution and state-of-the-art algorithms succeed using\n$n^{\\Theta(k)}$ runtime, where $n$ is the dimension and $k$ is the order of the\ninteractions. However, well-known reductions from the sparse parity with noise\nproblem imply that given i.i.d. samples from a sparse, order-$k$ MRF, any\nlearning algorithm likely requires $n^{\\Omega(k)}$ time, impeding the potential\nfor significant computational improvements. In this work, we demonstrate that\nthese fundamental barriers for learning MRFs can surprisingly be completely\ncircumvented when learning from natural, dynamical samples. We show that in\nbounded-degree MRFs, the dependency structure and parameters can be recovered\nusing a trajectory of Glauber dynamics of length $O(n \\log n)$ with runtime\n$O(n^2 \\log n)$. The implicit constants depend only on the degree and\nnon-degeneracy parameters of the model, but not the dimension $n$. In\nparticular, learning MRFs from dynamics is $\\textit{provably computationally\neasier}$ than learning from i.i.d. samples under standard hardness assumptions.\n","authors":["Jason Gaitonde","Ankur Moitra","Elchanan Mossel"],"pdf_url":"https://arxiv.org/pdf/2409.05284v1.pdf","comment":"40 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.03311v2","updated":"2024-09-09T02:01:07Z","published":"2024-07-03T17:54:11Z","title":"Efficient Imitation Without Demonstrations via Value-Penalized Auxiliary\n Control from Examples","summary":" Learning from examples of success is an ap pealing approach to reinforcement\nlearning but it presents a challenging exploration problem, especially for\ncomplex or long-horizon tasks. This work introduces value-penalized auxiliary\ncontrol from examples (VPACE), an algorithm that significantly improves\nexploration in example-based control by adding examples of simple auxiliary\ntasks. For instance, a manipulation task may have auxiliary examples of an\nobject being reached for, grasped, or lifted. We show that the na\\\"{i}ve\napplication of scheduled auxiliary control to example-based learning can lead\nto value overestimation and poor performance. We resolve the problem with an\nabove-success-level value penalty. Across both simulated and real robotic\nenvironments, we show that our approach substantially improves learning\nefficiency for challenging tasks, while maintaining bounded value estimates. We\ncompare with existing approaches to example-based learning, inverse\nreinforcement learning, and an exploration bonus. Preliminary results also\nsuggest that VPACE may learn more efficiently than the more common approaches\nof using full trajectories or true sparse rewards. 
Videos, code, and datasets:\nhttps://papers.starslab.ca/vpace.\n","authors":["Trevor Ablett","Bryan Chan","Jayce Haoran Wang","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2407.03311v2.pdf","comment":"Submitted to IEEE International Conference on Robotics and Automation\n (ICRA'25), Atlanta, USA, May 19-23, 2025"},{"id":"http://arxiv.org/abs/2404.12613v2","updated":"2024-09-09T01:53:39Z","published":"2024-04-19T03:53:50Z","title":"A Fourier Approach to the Parameter Estimation Problem for\n One-dimensional Gaussian Mixture Models","summary":" The purpose of this paper is twofold. First, we propose a novel algorithm for\nestimating parameters in one-dimensional Gaussian mixture models (GMMs). The\nalgorithm takes advantage of the Hankel structure inherent in the Fourier data\nobtained from independent and identically distributed (i.i.d) samples of the\nmixture. For GMMs with a unified variance, a singular value ratio functional\nusing the Fourier data is introduced and used to resolve the variance and\ncomponent number simultaneously. The consistency of the estimator is derived.\nCompared to classic algorithms such as the method of moments and the maximum\nlikelihood method, the proposed algorithm does not require prior knowledge of\nthe number of Gaussian components or good initial guesses. Numerical\nexperiments demonstrate its superior performance in estimation accuracy and\ncomputational cost. Second, we reveal that there exists a fundamental limit to\nthe problem of estimating the number of Gaussian components or model order in\nthe mixture model if the number of i.i.d samples is finite. For the case of a\nsingle variance, we show that the model order can be successfully estimated\nonly if the minimum separation distance between the component means exceeds a\ncertain threshold value and can fail if below. We derive a lower bound for this\nthreshold value, referred to as the computational resolution limit, in terms of\nthe number of i.i.d samples, the variance, and the number of Gaussian\ncomponents. Numerical experiments confirm this phase transition phenomenon in\nestimating the model order. Moreover, we demonstrate that our algorithm\nachieves better scores in likelihood, AIC, and BIC when compared to the EM\nalgorithm.\n","authors":["Xinyu Liu","Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.12613v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03511v2","updated":"2024-09-09T01:44:27Z","published":"2023-05-02T15:33:09Z","title":"Shared Latent Space by Both Languages in Non-Autoregressive Neural\n Machine Translation","summary":" Non-autoregressive neural machine translation (NAT) offers substantial\ntranslation speed up compared to autoregressive neural machine translation (AT)\nat the cost of translation quality. Latent variable modeling has emerged as a\npromising approach to bridge this quality gap, particularly for addressing the\nchronic multimodality problem in NAT. In the previous works that used latent\nvariable modeling, they added an auxiliary model to estimate the posterior\ndistribution of the latent variable conditioned on the source and target\nsentences. However, it causes several disadvantages, such as redundant\ninformation extraction in the latent variable, increasing the number of\nparameters, and a tendency to ignore some information from the inputs. 
In this\npaper, we propose a novel latent variable modeling that integrates a dual\nreconstruction perspective and an advanced hierarchical latent modeling with a\nshared intermediate latent space across languages. This latent variable\nmodeling hypothetically alleviates or prevents the above disadvantages. In our\nexperiment results, we present comprehensive demonstrations that our proposed\napproach infers superior latent variables which lead better translation\nquality. Finally, in the benchmark translation tasks, such as WMT, we\ndemonstrate that our proposed method significantly improves translation quality\ncompared to previous NAT baselines including the state-of-the-art NAT model.\n","authors":["DongNyeong Heo","Heeyoul Choi"],"pdf_url":"https://arxiv.org/pdf/2305.03511v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05265v1","updated":"2024-09-09T01:33:13Z","published":"2024-09-09T01:33:13Z","title":"Learning Submodular Sequencing from Samples","summary":" This paper addresses the problem of sequential submodular maximization:\nselecting and ranking items in a sequence to optimize some composite submodular\nfunction. In contrast to most of the previous works, which assume access to the\nutility function, we assume that we are given only a set of samples. Each\nsample includes a random sequence of items and its associated utility. We\npresent an algorithm that, given polynomially many samples drawn from a\ntwo-stage uniform distribution, achieves an approximation ratio dependent on\nthe curvature of individual submodular functions. Our results apply in a wide\nvariety of real-world scenarios, such as ranking products in online retail\nplatforms, where complete knowledge of the utility function is often impossible\nto obtain. Our algorithm gives an empirically useful solution in such contexts,\nthus proving that limited data can be of great use in sequencing tasks. From a\ntechnical perspective, our results extend prior work on ``optimization from\nsamples'' by generalizing from optimizing a set function to a\nsequence-dependent function.\n","authors":["Jing Yuan","Shaojie Tang"],"pdf_url":"https://arxiv.org/pdf/2409.05265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05258v1","updated":"2024-09-09T00:47:30Z","published":"2024-09-09T00:47:30Z","title":"Towards Automated Machine Learning Research","summary":" This paper explores a top-down approach to automating incremental advances in\nmachine learning research through component-level innovation, facilitated by\nLarge Language Models (LLMs). Our framework systematically generates novel\ncomponents, validates their feasibility, and evaluates their performance\nagainst existing baselines. A key distinction of this approach lies in how\nthese novel components are generated. Unlike traditional AutoML and NAS\nmethods, which often rely on a bottom-up combinatorial search over predefined,\nhardcoded base components, our method leverages the cross-domain knowledge\nembedded in LLMs to propose new components that may not be confined to any\nhard-coded predefined set. By incorporating a reward model to prioritize\npromising hypotheses, we aim to improve the efficiency of the hypothesis\ngeneration and evaluation process. 
We hope this approach offers a new avenue\nfor exploration and contributes to the ongoing dialogue in the field.\n","authors":["Shervin Ardeshir"],"pdf_url":"https://arxiv.org/pdf/2409.05258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01204v3","updated":"2024-09-09T00:42:16Z","published":"2024-02-02T08:17:41Z","title":"A Survey on Self-Supervised Learning for Non-Sequential Tabular Data","summary":" Self-supervised learning (SSL) has been incorporated into many\nstate-of-the-art models in various domains, where SSL defines pretext tasks\nbased on unlabeled datasets to learn contextualized and robust representations.\nRecently, SSL has become a new trend in exploring the representation learning\ncapability in the realm of tabular data, which is more challenging due to not\nhaving explicit relations for learning descriptive representations. This survey\naims to systematically review and summarize the recent progress and challenges\nof SSL for non-sequential tabular data (SSL4NS-TD). We first present a formal\ndefinition of NS-TD and clarify its correlation to related studies. Then, these\napproaches are categorized into three groups - predictive learning, contrastive\nlearning, and hybrid learning, with their motivations and strengths of\nrepresentative methods in each direction. Moreover, application issues of\nSSL4NS-TD are presented, including automatic data engineering, cross-table\ntransferability, and domain knowledge integration. In addition, we elaborate on\nexisting benchmarks and datasets for NS-TD applications to analyze the\nperformance of existing tabular models. Finally, we discuss the challenges of\nSSL4NS-TD and provide potential directions for future research. We expect our\nwork to be useful in terms of encouraging more research on lowering the barrier\nto entry SSL for the tabular domain, and of improving the foundations for\nimplicit tabular data.\n","authors":["Wei-Yao Wang","Wei-Wei Du","Derek Xu","Wei Wang","Wen-Chih Peng"],"pdf_url":"https://arxiv.org/pdf/2402.01204v3.pdf","comment":"ACML-24 Journal Track. The paper list can be found at\n https://github.com/wwweiwei/awesome-self-supervised-learning-for-tabular-data"},{"id":"http://arxiv.org/abs/2403.00033v5","updated":"2024-09-09T00:21:21Z","published":"2024-02-29T04:01:38Z","title":"Spatial Craving Patterns in Marijuana Users: Insights from fMRI Brain\n Connectivity Analysis with High-Order Graph Attention Neural Networks","summary":" The excessive consumption of marijuana can induce substantial psychological\nand social consequences. In this investigation, we propose an elucidative\nframework termed high-order graph attention neural networks (HOGANN) for the\nclassification of Marijuana addiction, coupled with an analysis of localized\nbrain network communities exhibiting abnormal activities among chronic\nmarijuana users. HOGANN integrates dynamic intrinsic functional brain networks,\nestimated from functional magnetic resonance imaging (fMRI), using graph\nattention-based long short-term memory (GAT-LSTM) to capture temporal network\ndynamics. We employ a high-order attention module for information fusion and\nmessage passing among neighboring nodes, enhancing the network community\nanalysis. 
Our model is validated across two distinct data cohorts, yielding\nsubstantially higher classification accuracy than benchmark algorithms.\nFurthermore, we discern the most pertinent subnetworks and cognitive regions\naffected by persistent marijuana consumption, indicating adverse effects on\nfunctional brain networks, particularly within the dorsal attention and\nfrontoparietal networks. Intriguingly, our model demonstrates superior\nperformance in cohorts exhibiting prolonged dependence, implying that prolonged\nmarijuana usage induces more pronounced alterations in brain networks. The\nmodel proficiently identifies craving brain maps, thereby delineating critical\nbrain regions for analysis\n","authors":["Jun-En Ding","Shihao Yang","Anna Zilverstand","Kaustubh R. Kulkarni","Xiaosi Gu","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2403.00033v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19578v2","updated":"2024-09-09T00:19:04Z","published":"2024-03-28T17:04:00Z","title":"Keypoint Action Tokens Enable In-Context Imitation Learning in Robotics","summary":" We show that off-the-shelf text-based Transformers, with no additional\ntraining, can perform few-shot in-context visual imitation learning, mapping\nvisual observations to action sequences that emulate the demonstrator's\nbehaviour. We achieve this by transforming visual observations (inputs) and\ntrajectories of actions (outputs) into sequences of tokens that a\ntext-pretrained Transformer (GPT-4 Turbo) can ingest and generate, via a\nframework we call Keypoint Action Tokens (KAT). Despite being trained only on\nlanguage, we show that these Transformers excel at translating tokenised visual\nkeypoint observations into action trajectories, performing on par or better\nthan state-of-the-art imitation learning (diffusion policies) in the low-data\nregime on a suite of real-world, everyday tasks. Rather than operating in the\nlanguage domain as is typical, KAT leverages text-based Transformers to operate\nin the vision and action domains to learn general patterns in demonstration\ndata for highly efficient imitation learning, indicating promising new avenues\nfor repurposing natural language models for embodied tasks. Videos are\navailable at https://www.robot-learning.uk/keypoint-action-tokens.\n","authors":["Norman Di Palo","Edward Johns"],"pdf_url":"https://arxiv.org/pdf/2403.19578v2.pdf","comment":"Published at Robotics: Science and Systems (RSS) 2024"},{"id":"http://arxiv.org/abs/2409.05255v1","updated":"2024-09-09T00:18:48Z","published":"2024-09-09T00:18:48Z","title":"Label-free evaluation of lung and heart transplant biopsies using\n virtual staining","summary":" Organ transplantation serves as the primary therapeutic strategy for\nend-stage organ failures. However, allograft rejection is a common complication\nof organ transplantation. Histological assessment is essential for the timely\ndetection and diagnosis of transplant rejection and remains the gold standard.\nNevertheless, the traditional histochemical staining process is time-consuming,\ncostly, and labor-intensive. Here, we present a panel of virtual staining\nneural networks for lung and heart transplant biopsies, which digitally convert\nautofluorescence microscopic images of label-free tissue sections into their\nbrightfield histologically stained counterparts, bypassing the traditional\nhistochemical staining process. 
Specifically, we virtually generated\nHematoxylin and Eosin (H&E), Masson's Trichrome (MT), and Elastic Verhoeff-Van\nGieson (EVG) stains for label-free transplant lung tissue, along with H&E and\nMT stains for label-free transplant heart tissue. Subsequent blind evaluations\nconducted by three board-certified pathologists have confirmed that the virtual\nstaining networks consistently produce high-quality histology images with high\ncolor uniformity, closely resembling their well-stained histochemical\ncounterparts across various tissue features. The use of virtually stained\nimages for the evaluation of transplant biopsies achieved comparable diagnostic\noutcomes to those obtained via traditional histochemical staining, with a\nconcordance rate of 82.4% for lung samples and 91.7% for heart samples.\nMoreover, virtual staining models create multiple stains from the same\nautofluorescence input, eliminating structural mismatches observed between\nadjacent sections stained in the traditional workflow, while also saving\ntissue, expert time, and staining costs.\n","authors":["Yuzhu Li","Nir Pillar","Tairan Liu","Guangdong Ma","Yuxuan Qi","Kevin de Haan","Yijie Zhang","Xilin Yang","Adrian J. Correa","Guangqian Xiao","Kuang-Yu Jen","Kenneth A. Iczkowski","Yulun Wu","William Dean Wallace","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2409.05255v1.pdf","comment":"21 Pages, 5 Figures"},{"id":"http://arxiv.org/abs/2409.02410v2","updated":"2024-09-09T00:04:21Z","published":"2024-09-04T03:25:48Z","title":"Adaptive Class Emergence Training: Enhancing Neural Network Stability\n and Generalization through Progressive Target Evolution","summary":" Recent advancements in artificial intelligence, particularly deep neural\nnetworks, have pushed the boundaries of what is achievable in complex tasks.\nTraditional methods for training neural networks in classification problems\noften rely on static target outputs, such as one-hot encoded vectors, which can\nlead to unstable optimization and difficulties in handling non-linearities\nwithin data. In this paper, we propose a novel training methodology that\nprogressively evolves the target outputs from a null vector to one-hot encoded\nvectors throughout the training process. This gradual transition allows the\nnetwork to adapt more smoothly to the increasing complexity of the\nclassification task, maintaining an equilibrium state that reduces the risk of\noverfitting and enhances generalization. Our approach, inspired by concepts\nfrom structural equilibrium in finite element analysis, has been validated\nthrough extensive experiments on both synthetic and real-world datasets. The\nresults demonstrate that our method achieves faster convergence, improved\naccuracy, and better generalization, especially in scenarios with high data\ncomplexity and noise. This progressive training framework offers a robust\nalternative to classical methods, opening new perspectives for more efficient\nand stable neural network training.\n","authors":["Jaouad Dabounou"],"pdf_url":"https://arxiv.org/pdf/2409.02410v2.pdf","comment":"15 pages, 9 figures, 2 tables"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.05772v1","updated":"2024-09-09T16:33:40Z","published":"2024-09-09T16:33:40Z","title":"A CLIP-based siamese approach for meme classification","summary":" Memes are an increasingly prevalent element of online discourse in social\nnetworks, especially among young audiences. They carry ideas and messages that\nrange from humorous to hateful, and are widely consumed. 
Their potentially high\nimpact requires adequate means of control to moderate their use in large scale.\nIn this work, we propose SimCLIP a deep learning-based architecture for\ncross-modal understanding of memes, leveraging a pre-trained CLIP encoder to\nproduce context-aware embeddings and a Siamese fusion technique to capture the\ninteractions between text and image. We perform an extensive experimentation on\nseven meme classification tasks across six datasets. We establish a new state\nof the art in Memotion7k with a 7.25% relative F1-score improvement, and\nachieve super-human performance on Harm-P with 13.73% F1-Score improvement. Our\napproach demonstrates the potential for compact meme classification models,\nenabling accurate and efficient meme monitoring. We share our code at\nhttps://github.com/jahuerta92/meme-classification-simclip\n","authors":["Javier Huertas-Tato","Christos Koutlis","Symeon Papadopoulos","David Camacho","Ioannis Kompatsiaris"],"pdf_url":"https://arxiv.org/pdf/2409.05772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05750v1","updated":"2024-09-09T16:05:40Z","published":"2024-09-09T16:05:40Z","title":"A Toolkit for Joint Speaker Diarization and Identification with\n Application to Speaker-Attributed ASR","summary":" We present a modular toolkit to perform joint speaker diarization and speaker\nidentification. The toolkit can leverage on multiple models and algorithms\nwhich are defined in a configuration file. Such flexibility allows our system\nto work properly in various conditions (e.g., multiple registered speakers'\nsets, acoustic conditions and languages) and across application domains (e.g.\nmedia monitoring, institutional, speech analytics). In this demonstration we\nshow a practical use-case in which speaker-related information is used jointly\nwith automatic speech recognition engines to generate speaker-attributed\ntranscriptions. To achieve that, we employ a user-friendly web-based interface\nto process audio and video inputs with the chosen configuration.\n","authors":["Giovanni Morrone","Enrico Zovato","Fabio Brugnara","Enrico Sartori","Leonardo Badino"],"pdf_url":"https://arxiv.org/pdf/2409.05750v1.pdf","comment":"Show and Tell paper. Presented at Interspeech 2024"},{"id":"http://arxiv.org/abs/2409.04398v2","updated":"2024-09-09T15:08:06Z","published":"2024-09-06T16:43:04Z","title":"HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale\n Space Using Wearable IMUs and LiDAR","summary":" We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture\nmethod, aimed at accurately and efficiently creating a dynamic digital world,\ncontaining large-scale indoor-outdoor scenes, diverse human motions, rich\nhuman-human interactions, and human-environment interactions. By utilizing\nbody-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human\nmotions in unconstrained space without the need for external devices and\npre-built maps. This affords great flexibility and accessibility for\nhuman-centered interaction and 4D scene capturing in various environments.\nTaking into account that IMUs can capture human spatially unrestricted poses\nbut are prone to drifting for long-period using, and while LiDAR is stable for\nglobal localization but rough for local positions and orientations, HiSC4D\nemploys a joint optimization method, harmonizing all sensors and utilizing\nenvironment cues, yielding promising results for long-term capture in large\nscenes. 
To promote research of egocentric human interaction in large scenes and\nfacilitate downstream tasks, we also present a dataset, containing 8 sequences\nin 4 large scenes (200 to 5,000 $m^2$), providing 36k frames of accurate 4D\nhuman motions with SMPL annotations and dynamic scenes, 31k frames of cropped\nhuman point clouds, and scene mesh of the environment. A variety of scenarios,\nsuch as the basketball gym and commercial street, alongside challenging human\nmotions, such as daily greeting, one-on-one basketball playing, and tour\nguiding, demonstrate the effectiveness and the generalization ability of\nHiSC4D. The dataset and code will be publicated on\nwww.lidarhumanmotion.net/hisc4d available for research purposes.\n","authors":["Yudi Dai","Zhiyong Wang","Xiping Lin","Chenglu Wen","Lan Xu","Siqi Shen","Yuexin Ma","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2409.04398v2.pdf","comment":"17 pages, 10 figures, Jornal"},{"id":"http://arxiv.org/abs/2309.11500v4","updated":"2024-09-09T14:52:15Z","published":"2023-09-20T17:59:32Z","title":"Auto-ACD: A Large-scale Dataset for Audio-Language Representation\n Learning","summary":" Recently, the AI community has made significant strides in developing\npowerful foundation models, driven by large-scale multimodal datasets. However,\nfor audio representation learning, existing datasets suffer from limitations in\nthe following aspects: insufficient volume, simplistic content, and arduous\ncollection procedures. To establish an audio dataset with high-quality\ncaptions, we propose an innovative, automatic approach leveraging multimodal\ninputs, such as video frames, audio streams. Specifically, we construct a\nlarge-scale, high-quality, audio-language dataset, named as Auto-ACD,\ncomprising over 1.5M audio-text pairs. We exploit a series of pre-trained\nmodels or APIs, to determine audio-visual synchronisation, generate image\ncaptions, object detection, or audio tags for specific videos. Subsequently, we\nemploy LLM to paraphrase a congruent caption for each audio, guided by the\nextracted multi-modality clues. To demonstrate the effectiveness of the\nproposed dataset, we train widely used models on our dataset and show\nperformance improvement on various downstream tasks, for example,\naudio-language retrieval, audio captioning, zero-shot classification. In\naddition, we establish a novel benchmark with environmental information and\nprovide a benchmark for audio-text tasks.\n","authors":["Luoyi Sun","Xuenan Xu","Mengyue Wu","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2309.11500v4.pdf","comment":"Accepted by ACM MM 2024"},{"id":"http://arxiv.org/abs/2409.05659v1","updated":"2024-09-09T14:29:22Z","published":"2024-09-09T14:29:22Z","title":"Audio-Visual Speaker Diarization: Current Databases, Approaches and\n Challenges","summary":" Nowadays, the large amount of audio-visual content available has fostered the\nneed to develop new robust automatic speaker diarization systems to analyse and\ncharacterise it. This kind of system helps to reduce the cost of doing this\nprocess manually and allows the use of the speaker information for different\napplications, as a huge quantity of information is present, for example, images\nof faces, or audio recordings. Therefore, this paper aims to address a critical\narea in the field of speaker diarization systems, the integration of\naudio-visual content of different domains. 
This paper seeks to push beyond\ncurrent state-of-the-art practices by developing a robust audio-visual speaker\ndiarization framework adaptable to various data domains, including TV\nscenarios, meetings, and daily activities. Unlike most of the existing\naudio-visual speaker diarization systems, this framework will also include the\nproposal of an approach to lead the precise assignment of specific identities\nin TV scenarios where celebrities appear. In addition, in this work, we have\nconducted an extensive compilation of the current state-of-the-art approaches\nand the existing databases for developing audio-visual speaker diarization.\n","authors":["Victoria Mingote","Alfonso Ortega","Antonio Miguel","Eduardo Lleida"],"pdf_url":"https://arxiv.org/pdf/2409.05659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05606v1","updated":"2024-09-09T13:39:47Z","published":"2024-09-09T13:39:47Z","title":"CustomContrast: A Multilevel Contrastive Perspective For Subject-Driven\n Text-to-Image Customization","summary":" Subject-driven text-to-image (T2I) customization has drawn significant\ninterest in academia and industry. This task enables pre-trained models to\ngenerate novel images based on unique subjects. Existing studies adopt a\nself-reconstructive perspective, focusing on capturing all details of a single\nimage, which will misconstrue the specific image's irrelevant attributes (e.g.,\nview, pose, and background) as the subject intrinsic attributes. This\nmisconstruction leads to both overfitting or underfitting of irrelevant and\nintrinsic attributes of the subject, i.e., these attributes are\nover-represented or under-represented simultaneously, causing a trade-off\nbetween similarity and controllability. In this study, we argue an ideal\nsubject representation can be achieved by a cross-differential perspective,\ni.e., decoupling subject intrinsic attributes from irrelevant attributes via\ncontrastive learning, which allows the model to focus more on intrinsic\nattributes through intra-consistency (features of the same subject are\nspatially closer) and inter-distinctiveness (features of different subjects\nhave distinguished differences). Specifically, we propose CustomContrast, a\nnovel framework, which includes a Multilevel Contrastive Learning (MCL)\nparadigm and a Multimodal Feature Injection (MFI) Encoder. The MCL paradigm is\nused to extract intrinsic features of subjects from high-level semantics to\nlow-level appearance through crossmodal semantic contrastive learning and\nmultiscale appearance contrastive learning. To facilitate contrastive learning,\nwe introduce the MFI encoder to capture cross-modal representations. Extensive\nexperiments show the effectiveness of CustomContrast in subject similarity and\ntext controllability.\n","authors":["Nan Chen","Mengqi Huang","Zhuowei Chen","Yang Zheng","Lei Zhang","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2409.05606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04388v2","updated":"2024-09-09T13:15:41Z","published":"2024-09-06T16:27:52Z","title":"Question-Answering Dense Video Events","summary":" Multimodal Large Language Models (MLLMs) have shown excellent performance in\nquestion-answering of single-event videos. In this paper, we present\nquestion-answering dense video events, a novel task that requires answering and\ngrounding the dense-event questions in long videos, thus challenging MLLMs to\nfaithfully comprehend and reason about multiple events occurring over extended\ntime periods. 
To facilitate the study, we construct DeVE-QA - a dataset\nfeaturing 78K questions about 26K events on 10.6K long videos. We then\nbenchmark and show that existing MLLMs excelling at single-event QA struggle to\nperform well in DeVE-QA. For improvement, we propose DeVi, a novel\ntraining-free MLLM approach that highlights a hierarchical captioning module, a\ntemporal event memory module, and a self-consistency checking module to\nrespectively detect, contextualize and memorize, and ground dense-events in\nlong videos for question answering. Extensive experiments show that DeVi is\nsuperior at answering dense-event questions and grounding relevant video\nmoments. Compared with existing MLLMs, it achieves a remarkable increase of 4.1\npercent and 3.7 percent for G(round)QA accuracy on DeVE-QA and NExT-GQA\nrespectively.\n","authors":["Hangyu Qin","Junbin Xiao","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2409.04388v2.pdf","comment":"Submitted to AAAI 2025"},{"id":"http://arxiv.org/abs/2408.03632v3","updated":"2024-09-09T12:26:04Z","published":"2024-08-07T08:43:58Z","title":"Concept Conductor: Orchestrating Multiple Personalized Concepts in\n Text-to-Image Synthesis","summary":" The customization of text-to-image models has seen significant advancements,\nyet generating multiple personalized concepts remains a challenging task.\nCurrent methods struggle with attribute leakage and layout confusion when\nhandling multiple concepts, leading to reduced concept fidelity and semantic\nconsistency. In this work, we introduce a novel training-free framework,\nConcept Conductor, designed to ensure visual fidelity and correct layout in\nmulti-concept customization. Concept Conductor isolates the sampling processes\nof multiple custom models to prevent attribute leakage between different\nconcepts and corrects erroneous layouts through self-attention-based spatial\nguidance. Additionally, we present a concept injection technique that employs\nshape-aware masks to specify the generation area for each concept. This\ntechnique injects the structure and appearance of personalized concepts through\nfeature fusion in the attention layers, ensuring harmony in the final image.\nExtensive qualitative and quantitative experiments demonstrate that Concept\nConductor can consistently generate composite images with accurate layouts\nwhile preserving the visual details of each concept. Compared to existing\nbaselines, Concept Conductor shows significant performance improvements. Our\nmethod supports the combination of any number of concepts and maintains high\nfidelity even when dealing with visually similar concepts. The code and models\nare available at https://github.com/Nihukat/Concept-Conductor.\n","authors":["Zebin Yao","Fangxiang Feng","Ruifan Li","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03632v3.pdf","comment":"Github Page: https://github.com/Nihukat/Concept-Conductor"},{"id":"http://arxiv.org/abs/2409.05540v1","updated":"2024-09-09T12:00:17Z","published":"2024-09-09T12:00:17Z","title":"Exploring Rich Subjective Quality Information for Image Quality\n Assessment in the Wild","summary":" Traditional in the wild image quality assessment (IQA) models are generally\ntrained with the quality labels of mean opinion score (MOS), while missing the\nrich subjective quality information contained in the quality ratings, for\nexample, the standard deviation of opinion scores (SOS) or even distribution of\nopinion scores (DOS). 
In this paper, we propose a novel IQA method named\nRichIQA to explore the rich subjective rating information beyond MOS to predict\nimage quality in the wild. RichIQA is characterized by two key novel designs:\n(1) a three-stage image quality prediction network which exploits the powerful\nfeature representation capability of the Convolutional vision Transformer (CvT)\nand mimics the short-term and long-term memory mechanisms of human brain; (2) a\nmulti-label training strategy in which rich subjective quality information like\nMOS, SOS and DOS are concurrently used to train the quality prediction network.\nPowered by these two novel designs, RichIQA is able to predict the image\nquality in terms of a distribution, from which the mean image quality can be\nsubsequently obtained. Extensive experimental results verify that the\nthree-stage network is tailored to predict rich quality information, while the\nmulti-label training strategy can fully exploit the potentials within\nsubjective quality rating and enhance the prediction performance and\ngeneralizability of the network. RichIQA outperforms state-of-the-art\ncompetitors on multiple large-scale in the wild IQA databases with rich\nsubjective rating labels. The code of RichIQA will be made publicly available\non GitHub.\n","authors":["Xiongkuo Min","Yixuan Gao","Yuqin Cao","Guangtao Zhai","Wenjun Zhang","Huifang Sun","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2409.05540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05496v1","updated":"2024-09-09T10:48:33Z","published":"2024-09-09T10:48:33Z","title":"Educational Virtual Field Trips based on Social VR and 360° Spaces","summary":" Virtual field trips (VFTs) have proven to be valuable learning tools. Such\napplications are mostly based on 360{\\deg} technology and are to be\ncharacterized as single-user applications in technological terms. In contrast,\nSocial VR applications are characterized by multi-user capability and\nuser-specific avatars. From a learning perspective, the concepts of\ncollaborative learning and embodiment have long been proposed as conducive to\nlearning. Both concepts might be supported using Social VR. However, little is\ncurrently known about the use of Social VR for VFTs. Accordingly, the research\nquestions are to what extent VFTs can be implemented in Social VR environments\nand how these Social VR-based VFTs are perceived by learners. This article\npresents an evaluation study on the development and evaluation of a VFT\nenvironment using the Social VR platform Mozilla Hubs. It describes the design\ndecisions to create the environment and evaluation results from a mixed-method\nstudy (N=16) using a questionnaire and focus group discussions. 
The study\nhighlighted the opportunities offered by Social VR-based VFTs but also revealed\nseveral challenges that need to be addressed to embrace the potential of Social\nVR-based VFTs to be utilized regularly in education.\n","authors":["Surya Kalvakolu","Heinrich Söbke","Heinrich Söbke","Jannicke Baalsrud Hauge","Eckhard Kraft"],"pdf_url":"https://arxiv.org/pdf/2409.05496v1.pdf","comment":"9 pages, 7 figures, 1 table, submitted to Games and Learning Alliance\n Conference"},{"id":"http://arxiv.org/abs/2409.05405v1","updated":"2024-09-09T08:06:50Z","published":"2024-09-09T08:06:50Z","title":"A Survey of Multimodal Composite Editing and Retrieval","summary":" In the real world, where information is abundant and diverse across different\nmodalities, understanding and utilizing various data types to improve retrieval\nsystems is a key focus of research. Multimodal composite retrieval integrates\ndiverse modalities such as text, image and audio, etc. to provide more\naccurate, personalized, and contextually relevant results. To facilitate a\ndeeper understanding of this promising direction, this survey explores\nmultimodal composite editing and retrieval in depth, covering image-text\ncomposite editing, image-text composite retrieval, and other multimodal\ncomposite retrieval. In this survey, we systematically organize the application\nscenarios, methods, benchmarks, experiments, and future directions. Multimodal\nlearning is a hot topic in large model era, and have also witnessed some\nsurveys in multimodal learning and vision-language models with transformers\npublished in the PAMI journal. To the best of our knowledge, this survey is the\nfirst comprehensive review of the literature on multimodal composite retrieval,\nwhich is a timely complement of multimodal fusion to existing reviews. To help\nreaders' quickly track this field, we build the project page for this survey,\nwhich can be found at\nhttps://github.com/fuxianghuang1/Multimodal-Composite-Editing-and-Retrieval.\n","authors":["Suyan Li","Fuxiang Huang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05405v1.pdf","comment":"22 pages, 3 figures, and 11 tables"},{"id":"http://arxiv.org/abs/2409.05384v1","updated":"2024-09-09T07:32:18Z","published":"2024-09-09T07:32:18Z","title":"Look One and More: Distilling Hybrid Order Relational Knowledge for\n Cross-Resolution Image Recognition","summary":" In spite of great success in many image recognition tasks achieved by recent\ndeep models, directly applying them to recognize low-resolution images may\nsuffer from low accuracy due to the missing of informative details during\nresolution degradation. However, these images are still recognizable for\nsubjects who are familiar with the corresponding high-resolution ones. Inspired\nby that, we propose a teacher-student learning approach to facilitate\nlow-resolution image recognition via hybrid order relational knowledge\ndistillation. The approach refers to three streams: the teacher stream is\npretrained to recognize high-resolution images in high accuracy, the student\nstream is learned to identify low-resolution images by mimicking the teacher's\nbehaviors, and the extra assistant stream is introduced as bridge to help\nknowledge transfer across the teacher to the student. To extract sufficient\nknowledge for reducing the loss in accuracy, the learning of student is\nsupervised with multiple losses, which preserves the similarities in various\norder relational structures. 
In this way, the capability of recovering missing\ndetails of familiar low-resolution images can be effectively enhanced, leading\nto a better knowledge transfer. Extensive experiments on metric learning,\nlow-resolution image classification and low-resolution face recognition tasks\nshow the effectiveness of our approach, while taking reduced models.\n","authors":["Shiming Ge","Kangkai Zhang","Haolin Liu","Yingying Hua","Shengwei Zhao","Xin Jin","Hao Wen"],"pdf_url":"https://arxiv.org/pdf/2409.05384v1.pdf","comment":"Accepted by AAAI 2020"},{"id":"http://arxiv.org/abs/2409.05330v1","updated":"2024-09-09T05:20:02Z","published":"2024-09-09T05:20:02Z","title":"KAN-Based Fusion of Dual-Domain for Audio-Driven Facial Landmarks\n Generation","summary":" Audio-driven talking face generation is a widely researched topic due to its\nhigh applicability. Reconstructing a talking face using audio significantly\ncontributes to fields such as education, healthcare, online conversations,\nvirtual assistants, and virtual reality. Early studies often focused solely on\nchanging the mouth movements, which resulted in outcomes with limited practical\napplications. Recently, researchers have proposed a new approach of\nconstructing the entire face, including face pose, neck, and shoulders. To\nachieve this, they need to generate through landmarks. However, creating stable\nlandmarks that align well with the audio is a challenge. In this paper, we\npropose the KFusion of Dual-Domain model, a robust model that generates\nlandmarks from audio. We separate the audio into two distinct domains to learn\nemotional information and facial context, then use a fusion mechanism based on\nthe KAN model. Our model demonstrates high efficiency compared to recent\nmodels. This will lay the groundwork for the development of the audio-driven\ntalking face generation problem in the future.\n","authors":["Hoang-Son Vo-Thanh","Quang-Vinh Nguyen","Soo-Hyung Kim"],"pdf_url":"https://arxiv.org/pdf/2409.05330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05297v1","updated":"2024-09-09T03:10:40Z","published":"2024-09-09T03:10:40Z","title":"Adaptive Offloading and Enhancement for Low-Light Video Analytics on\n Mobile Devices","summary":" In this paper, we explore adaptive offloading and enhancement strategies for\nvideo analytics tasks on computing-constrained mobile devices in low-light\nconditions. We observe that the accuracy of low-light video analytics varies\nfrom different enhancement algorithms. The root cause could be the disparities\nin the effectiveness of enhancement algorithms for feature extraction in\nanalytic models. Specifically, the difference in class activation maps (CAMs)\nbetween enhanced and low-light frames demonstrates a positive correlation with\nvideo analytics accuracy. Motivated by such observations, a novel enhancement\nquality assessment method is proposed on CAMs to evaluate the effectiveness of\ndifferent enhancement algorithms for low-light videos. Then, we design a\nmulti-edge system, which adaptively offloads and enhances low-light video\nanalytics tasks from mobile devices. To achieve the trade-off between the\nenhancement quality and the latency for all system-served mobile devices, we\npropose a genetic-based scheduling algorithm, which can find a near-optimal\nsolution in a reasonable time to meet the latency requirement. 
Thereby, the\noffloading strategies and the enhancement algorithms are properly selected\nunder the condition of limited end-edge bandwidth and edge computation\nresources. Simulation experiments demonstrate the superiority of the proposed\nsystem, improving accuracy up to 20.83\\% compared to existing benchmarks.\n","authors":["Yuanyi He","Peng Yang","Tian Qin","Jiawei Hou","Ning Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14066v3","updated":"2024-09-09T03:06:21Z","published":"2024-07-19T06:50:24Z","title":"360VFI: A Dataset and Benchmark for Omnidirectional Video Frame\n Interpolation","summary":" Head-mounted 360{\\deg} displays and portable 360{\\deg} cameras have\nsignificantly progressed, providing viewers a realistic and immersive\nexperience. However, many omnidirectional videos have low frame rates that can\nlead to visual fatigue, and the prevailing plane frame interpolation\nmethodologies are unsuitable for omnidirectional video interpolation because\nthey are designed solely for traditional videos. This paper introduces the\nbenchmark dataset, 360VFI, for Omnidirectional Video Frame Interpolation. We\npresent a practical implementation that introduces a distortion prior from\nomnidirectional video into the network to modulate distortions. Specifically,\nwe propose a pyramid distortion-sensitive feature extractor that uses the\nunique characteristics of equirectangular projection (ERP) format as prior\ninformation. Moreover, we devise a decoder that uses an affine transformation\nto further facilitate the synthesis of intermediate frames. 360VFI is the first\ndataset and benchmark that explores the challenge of Omnidirectional Video\nFrame Interpolation. Through our benchmark analysis, we present four different\ndistortion condition scenes in the proposed 360VFI dataset to evaluate the\nchallenges triggered by distortion during interpolation. Besides, experimental\nresults demonstrate that Omnidirectional Video Interpolation can be effectively\nimproved by modeling for omnidirectional distortion.\n","authors":["Wenxuan Lu","Mengshun Hu","Yansheng Qiu","Liang Liao","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2407.14066v3.pdf","comment":"This is a preprint version"}]},"2024-09-08T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.05247v1","updated":"2024-09-08T23:51:04Z","published":"2024-09-08T23:51:04Z","title":"Socially Responsible Data for Large Multilingual Language Models","summary":" Large Language Models (LLMs) have rapidly increased in size and apparent\ncapabilities in the last three years, but their training data is largely\nEnglish text. There is growing interest in multilingual LLMs, and various\nefforts are striving for models to accommodate languages of communities outside\nof the Global North, which include many languages that have been historically\nunderrepresented in digital realms. These languages have been coined as \"low\nresource languages\" or \"long-tail languages\", and LLMs performance on these\nlanguages is generally poor. While expanding the use of LLMs to more languages\nmay bring many potential benefits, such as assisting cross-community\ncommunication and language preservation, great care must be taken to ensure\nthat data collection on these languages is not extractive and that it does not\nreproduce exploitative practices of the past. 
Collecting data from languages\nspoken by previously colonized people, indigenous people, and non-Western\nlanguages raises many complex sociopolitical and ethical questions, e.g.,\naround consent, cultural safety, and data sovereignty. Furthermore, linguistic\ncomplexity and cultural nuances are often lost in LLMs. This position paper\nbuilds on recent scholarship, and our own work, and outlines several relevant\nsocial, cultural, and ethical considerations and potential ways to mitigate\nthem through qualitative research, community partnerships, and participatory\ndesign approaches. We provide twelve recommendations for consideration when\ncollecting language data on underrepresented language communities outside of\nthe Global North.\n","authors":["Andrew Smart","Ben Hutchinson","Lameck Mbangula Amugongo","Suzanne Dikker","Alex Zito","Amber Ebinama","Zara Wudiri","Ding Wang","Erin van Liemt","João Sedoc","Seyi Olojo","Stanley Uwakwe","Edem Wornyo","Sonja Schmer-Galunder","Jamila Smith-Loud"],"pdf_url":"https://arxiv.org/pdf/2409.05247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13440v2","updated":"2024-09-08T21:47:29Z","published":"2024-08-24T02:40:28Z","title":"Knowledge-Aware Conversation Derailment Forecasting Using Graph\n Convolutional Networks","summary":" Online conversations are particularly susceptible to derailment, which can\nmanifest itself in the form of toxic communication patterns including\ndisrespectful comments and abuse. Forecasting conversation derailment predicts\nsigns of derailment in advance enabling proactive moderation of conversations.\nState-of-the-art approaches to conversation derailment forecasting sequentially\nencode conversations and use graph neural networks to model dialogue user\ndynamics. However, existing graph models are not able to capture complex\nconversational characteristics such as context propagation and emotional\nshifts. The use of common sense knowledge enables a model to capture such\ncharacteristics, thus improving performance. Following this approach, here we\nderive commonsense statements from a knowledge base of dialogue contextual\ninformation to enrich a graph neural network classification architecture. We\nfuse the multi-source information on utterance into capsules, which are used by\na transformer-based forecaster to predict conversation derailment. Our model\ncaptures conversation dynamics and context propagation, outperforming the\nstate-of-the-art models on the CGA and CMV benchmark datasets\n","authors":["Enas Altarawneh","Ameeta Agrawal","Michael Jenkin","Manos Papagelis"],"pdf_url":"https://arxiv.org/pdf/2408.13440v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2306.12982;\n text overlap with arXiv:2106.01071 by other authors"},{"id":"http://arxiv.org/abs/2409.05224v1","updated":"2024-09-08T21:40:44Z","published":"2024-09-08T21:40:44Z","title":"Exploring Intrinsic Language-specific Subspaces in Fine-tuning\n Multilingual Neural Machine Translation","summary":" Multilingual neural machine translation models support fine-tuning hundreds\nof languages simultaneously. However, fine-tuning on full parameters solely is\ninefficient potentially leading to negative interactions among languages. In\nthis work, we demonstrate that the fine-tuning for a language occurs in its\nintrinsic language-specific subspace with a tiny fraction of entire parameters.\nThus, we propose language-specific LoRA to isolate intrinsic language-specific\nsubspaces. 
Furthermore, we propose architecture learning techniques and\nintroduce a gradual pruning schedule during fine-tuning to exhaustively explore\nthe optimal setting and the minimal intrinsic subspaces for each language,\nresulting in a lightweight yet effective fine-tuning procedure. The\nexperimental results on a 12-language subset and a 30-language subset of\nFLORES-101 show that our methods not only outperform full-parameter fine-tuning\nup to 2.25 spBLEU scores but also reduce trainable parameters to $0.4\\%$ for\nhigh and medium-resource languages and $1.6\\%$ for low-resource ones.\n","authors":["Zhe Cao","Zhi Qu","Hidetaka Kamigaito","Taro Watanabe"],"pdf_url":"https://arxiv.org/pdf/2409.05224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.06682v2","updated":"2024-09-08T20:33:03Z","published":"2024-05-05T18:56:46Z","title":"Self-Reflection in LLM Agents: Effects on Problem-Solving Performance","summary":" In this study, we investigated the effects of self-reflection in large\nlanguage models (LLMs) on problem-solving performance. We instructed nine\npopular LLMs to answer a series of multiple-choice questions to provide a\nperformance baseline. For each incorrectly answered question, we instructed\neight types of self-reflecting LLM agents to reflect on their mistakes and\nprovide themselves with guidance to improve problem-solving. Then, using this\nguidance, each self-reflecting agent attempted to re-answer the same questions.\nOur results indicate that LLM agents are able to significantly improve their\nproblem-solving performance through self-reflection ($p < 0.001$). In addition,\nwe compared the various types of self-reflection to determine their individual\ncontribution to performance. All code and data are available on GitHub at\nhttps://github.com/matthewrenze/self-reflection\n","authors":["Matthew Renze","Erhan Guven"],"pdf_url":"https://arxiv.org/pdf/2405.06682v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00217v2","updated":"2024-09-08T20:08:16Z","published":"2024-08-30T19:14:17Z","title":"ProGRes: Prompted Generative Rescoring on ASR n-Best","summary":" Large Language Models (LLMs) have shown their ability to improve the\nperformance of speech recognizers by effectively rescoring the n-best\nhypotheses generated during the beam search process. However, the best way to\nexploit recent generative instruction-tuned LLMs for hypothesis rescoring is\nstill unclear. This paper proposes a novel method that uses instruction-tuned\nLLMs to dynamically expand the n-best speech recognition hypotheses with new\nhypotheses generated through appropriately-prompted LLMs. Specifically, we\nintroduce a new zero-shot method for ASR n-best rescoring, which combines\nconfidence scores, LLM sequence scoring, and prompt-based hypothesis\ngeneration. We compare Llama-3-Instruct, GPT-3.5 Turbo, and GPT-4 Turbo as\nprompt-based generators with Llama-3 as sequence scorer LLM. 
We evaluated our\napproach using different speech recognizers and observed significant relative\nimprovement in the word error rate (WER) ranging from 5% to 25%.\n","authors":["Ada Defne Tur","Adel Moumen","Mirco Ravanelli"],"pdf_url":"https://arxiv.org/pdf/2409.00217v2.pdf","comment":"IEEE Spoken Language Technology Workshop"},{"id":"http://arxiv.org/abs/2408.01527v2","updated":"2024-09-08T19:59:06Z","published":"2024-08-02T18:40:10Z","title":"Using LLMs to Establish Implicit User Sentiment of Software Desirability","summary":" This study explores the use of LLMs for providing quantitative zero-shot\nsentiment analysis of implicit software desirability, addressing a critical\nchallenge in product evaluation where traditional review scores, though\nconvenient, fail to capture the richness of qualitative user feedback.\nInnovations include establishing a method that 1) works with qualitative user\nexperience data without the need for explicit review scores, 2) focuses on\nimplicit user satisfaction, and 3) provides scaled numerical sentiment\nanalysis, offering a more nuanced understanding of user sentiment, instead of\nsimply classifying sentiment as positive, neutral, or negative.\n Data is collected using the Microsoft Product Desirability Toolkit (PDT), a\nwell-known qualitative user experience analysis tool. For initial exploration,\nthe PDT metric was given to users of two software systems. PDT data was fed\nthrough several LLMs (Claude Sonnet 3 and 3.5, GPT4, and GPT4o) and through a\nleading transfer learning technique, Twitter-Roberta-Base-Sentiment, and Vader,\na leading sentiment analysis tool. Each system was asked to evaluate the data\nin two ways, by looking at the sentiment expressed in the PDT word/explanation\npairs; and by looking at the sentiment expressed by the users in their grouped\nselection of five words and explanations, as a whole. Each LLM provided a\nsentiment score, its confidence (low, medium, high) in the score, and an\nexplanation of the score.\n All LLMs tested were able to statistically detect user sentiment from the\nusers' grouped data, whereas TRBS and Vader were not. The confidence and\nexplanation of confidence provided by the LLMs assisted in understanding user\nsentiment. This study adds deeper understanding of evaluating user experiences,\ntoward the goal of creating a universal tool that quantifies implicit\nsentiment.\n","authors":["Sherri Weitl-Harms","John D. Hastings","Jonah Lum"],"pdf_url":"https://arxiv.org/pdf/2408.01527v2.pdf","comment":"6 pages, 2 figures, 2 tables, updated to incorporate feedback"},{"id":"http://arxiv.org/abs/2409.05199v1","updated":"2024-09-08T19:24:14Z","published":"2024-09-08T19:24:14Z","title":"Interactive Machine Teaching by Labeling Rules and Instances","summary":" Weakly supervised learning aims to reduce the cost of labeling data by using\nexpert-designed labeling rules. However, existing methods require experts to\ndesign effective rules in a single shot, which is difficult in the absence of\nproper guidance and tooling. Therefore, it is still an open question whether\nexperts should spend their limited time writing rules or instead providing\ninstance labels via active learning. In this paper, we investigate how to\nexploit an expert's limited time to create effective supervision. First, to\ndevelop practical guidelines for rule creation, we conduct an exploratory\nanalysis of diverse collections of existing expert-designed rules and find that\nrule precision is more important than coverage across datasets. 
Second, we\ncompare rule creation to individual instance labeling via active learning and\ndemonstrate the importance of both across 6 datasets. Third, we propose an\ninteractive learning framework, INTERVAL, that achieves efficiency by\nautomatically extracting candidate rules based on rich patterns (e.g., by\nprompting a language model), and effectiveness by soliciting expert feedback on\nboth candidate rules and individual instances. Across 6 datasets, INTERVAL\noutperforms state-of-the-art weakly supervised approaches by 7% in F1.\nFurthermore, it requires as few as 10 queries for expert feedback to reach F1\nvalues that existing active learning methods cannot match even with 100\nqueries.\n","authors":["Giannis Karamanolakis","Daniel Hsu","Luis Gravano"],"pdf_url":"https://arxiv.org/pdf/2409.05199v1.pdf","comment":"Accepted to TACL 2024"},{"id":"http://arxiv.org/abs/2409.05197v1","updated":"2024-09-08T19:22:58Z","published":"2024-09-08T19:22:58Z","title":"Seemingly Plausible Distractors in Multi-Hop Reasoning: Are Large\n Language Models Attentive Readers?","summary":" State-of-the-art Large Language Models (LLMs) are accredited with an\nincreasing number of different capabilities, ranging from reading\ncomprehension, over advanced mathematical and reasoning skills to possessing\nscientific knowledge. In this paper we focus on their multi-hop reasoning\ncapability: the ability to identify and integrate information from multiple\ntextual sources.\n Given the concerns with the presence of simplifying cues in existing\nmulti-hop reasoning benchmarks, which allow models to circumvent the reasoning\nrequirement, we set out to investigate, whether LLMs are prone to exploiting\nsuch simplifying cues. We find evidence that they indeed circumvent the\nrequirement to perform multi-hop reasoning, but they do so in more subtle ways\nthan what was reported about their fine-tuned pre-trained language model (PLM)\npredecessors. Motivated by this finding, we propose a challenging multi-hop\nreasoning benchmark, by generating seemingly plausible multi-hop reasoning\nchains, which ultimately lead to incorrect answers. We evaluate multiple open\nand proprietary state-of-the-art LLMs, and find that their performance to\nperform multi-hop reasoning is affected, as indicated by up to 45% relative\ndecrease in F1 score when presented with such seemingly plausible alternatives.\nWe conduct a deeper analysis and find evidence that while LLMs tend to ignore\nmisleading lexical cues, misleading reasoning paths indeed present a\nsignificant challenge.\n","authors":["Neeladri Bhuiya","Viktor Schlegel","Stefan Winkler"],"pdf_url":"https://arxiv.org/pdf/2409.05197v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2402.17124v2","updated":"2024-09-08T19:17:32Z","published":"2024-02-27T01:37:23Z","title":"Fact-and-Reflection (FaR) Improves Confidence Calibration of Large\n Language Models","summary":" For a LLM to be trustworthy, its confidence level should be well-calibrated\nwith its actual performance. While it is now common sense that LLM performances\nare greatly impacted by prompts, the confidence calibration in prompting LLMs\nhas yet to be thoroughly explored. In this paper, we explore how different\nprompting strategies influence LLM confidence calibration and how it could be\nimproved. 
We conduct extensive experiments on six prompting methods in the\nquestion-answering context and we observe that, while these methods help\nimprove the expected LLM calibration, they also trigger LLMs to be\nover-confident when responding to some instances. Inspired by human cognition,\nwe propose Fact-and-Reflection (FaR) prompting, which improves the LLM\ncalibration in two steps. First, FaR elicits the known \"facts\" that are\nrelevant to the input prompt from the LLM. And then it asks the model to\n\"reflect\" over them to generate the final answer. Experiments show that FaR\nprompting achieves significantly better calibration; it lowers the Expected\nCalibration Error by 23.5% on our multi-purpose QA tasks. Notably, FaR\nprompting even elicits the capability of verbally expressing concerns in less\nconfident scenarios, which helps trigger retrieval augmentation for solving\nthese harder instances.\n","authors":["Xinran Zhao","Hongming Zhang","Xiaoman Pan","Wenlin Yao","Dong Yu","Tongshuang Wu","Jianshu Chen"],"pdf_url":"https://arxiv.org/pdf/2402.17124v2.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2406.15695v2","updated":"2024-09-08T19:06:37Z","published":"2024-06-22T00:14:48Z","title":"SS-GEN: A Social Story Generation Framework with Large Language Models","summary":" Children with Autism Spectrum Disorder (ASD) often misunderstand social\nsituations and struggle to participate in daily routines. Social Stories are\ntraditionally crafted by psychology experts under strict constraints to address\nthese challenges but are costly and limited in diversity. As Large Language\nModels (LLMs) advance, there's an opportunity to develop more automated,\naffordable, and accessible methods to generate Social Stories in real-time with\nbroad coverage. However, adapting LLMs to meet the unique and strict\nconstraints of Social Stories is a challenging issue. To this end, we propose\n\\textbf{SS-GEN}, a \\textbf{S}ocial \\textbf{S}tory \\textbf{GEN}eration framework\nwith LLMs. Firstly, we develop a constraint-driven sophisticated strategy named\n\\textbf{\\textsc{StarSow}} to hierarchically prompt LLMs to generate Social\nStories at scale, followed by rigorous human filtering to build a high-quality\ndataset. Additionally, we introduce \\textbf{quality assessment criteria} to\nevaluate the effectiveness of these generated stories. Considering that\npowerful closed-source large models require very complex instructions and\nexpensive API fees, we finally fine-tune smaller language models with our\ncurated high-quality dataset, achieving comparable results at lower costs and\nwith simpler instruction and deployment. This work marks a significant step in\nleveraging AI to personalize Social Stories cost-effectively for autistic\nchildren at scale, which we hope can encourage future research. The prompt,\ncode and data will release in the \\texttt{Technical Appendix} and \\texttt{Code\n\\& Data Appendix} at \\url{https://github.com/MIMIFY/SS-GEN}.\n","authors":["Yi Feng","Mingyang Song","Jiaqi Wang","Zhuang Chen","Guanqun Bi","Minlie Huang","Liping Jing","Jian Yu"],"pdf_url":"https://arxiv.org/pdf/2406.15695v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16570v2","updated":"2024-09-08T18:08:00Z","published":"2024-08-29T14:37:05Z","title":"Predictability maximization and the origins of word order harmony","summary":" We address the linguistic problem of the sequential arrangement of a head and\nits dependents from an information theoretic perspective. 
In particular, we\nconsider the optimal placement of a head that maximizes the predictability of\nthe sequence. We assume that dependents are statistically independent given a\nhead, in line with the open-choice principle and the core assumptions of\ndependency grammar. We demonstrate the optimality of harmonic order, i.e.,\nplacing the head last maximizes the predictability of the head whereas placing\nthe head first maximizes the predictability of dependents. We also show that\npostponing the head is the optimal strategy to maximize its predictability\nwhile bringing it forward is the optimal strategy to maximize the\npredictability of dependents. We unravel the advantages of the strategy of\nmaximizing the predictability of the head over maximizing the predictability of\ndependents. Our findings shed light on the placements of the head adopted by\nreal languages or emerging in different kinds of experiments.\n","authors":["Ramon Ferrer-i-Cancho"],"pdf_url":"https://arxiv.org/pdf/2408.16570v2.pdf","comment":"Local reorganization of the text; many typos corrected"},{"id":"http://arxiv.org/abs/2404.13043v2","updated":"2024-09-08T17:46:36Z","published":"2024-04-19T17:57:29Z","title":"Data Alignment for Zero-Shot Concept Generation in Dermatology AI","summary":" AI in dermatology is evolving at a rapid pace but the major limitation to\ntraining trustworthy classifiers is the scarcity of data with ground-truth\nconcept level labels, which are meta-labels semantically meaningful to humans.\nFoundation models like CLIP providing zero-shot capabilities can help alleviate\nthis challenge by leveraging vast amounts of image-caption pairs available on\nthe internet. CLIP can be fine-tuned using domain specific image-caption pairs\nto improve classification performance. However, CLIP's pre-training data is not\nwell-aligned with the medical jargon that clinicians use to perform diagnoses.\nThe development of large language models (LLMs) in recent years has led to the\npossibility of leveraging the expressive nature of these models to generate\nrich text. Our goal is to use these models to generate caption text that aligns\nwell with both the clinical lexicon and with the natural human language used in\nCLIP's pre-training data. Starting with captions used for images in PubMed\narticles, we extend them by passing the raw captions through an LLM fine-tuned\non the field's several textbooks. We find that using captions generated by an\nexpressive fine-tuned LLM like GPT-3.5 improves downstream zero-shot concept\nclassification performance.\n","authors":["Soham Gadgil","Mahtab Bigverdi"],"pdf_url":"https://arxiv.org/pdf/2404.13043v2.pdf","comment":"Accepted as a workshop paper to ICLR 2024"},{"id":"http://arxiv.org/abs/2405.14105v3","updated":"2024-09-08T17:15:17Z","published":"2024-05-23T02:14:17Z","title":"Distributed Speculative Inference of Large Language Models is Provably\n Faster","summary":" Accelerating the inference of large language models (LLMs) is an important\nchallenge in artificial intelligence. This paper introduces Distributed\nSpeculative Inference (DSI), a novel distributed inference algorithm that is\nprovably faster than speculative inference (SI)\n[leviathan2023fast,chen2023accelerating,miao2023specinfer] and traditional\nautoregressive inference (non-SI). Like other SI algorithms, DSI works on\nfrozen LLMs, requiring no training or architectural modifications, and it\npreserves the target distribution. 
Prior studies on SI have demonstrated\nempirical speedups (compared to non-SI) but require fast and accurate drafters,\nwhich are often unavailable in practice. We identify a gap where SI can be\nslower than non-SI given slower or less accurate drafters. We close this gap by\nproving that DSI is faster than both SI and non-SI--given any drafters. DSI\nintroduces a novel type of task parallelism called Speculation Parallelism\n(SP), which orchestrates target and drafter instances to overlap in time,\ncreating a new foundational tradeoff between computational resources and\nlatency. DSI is not only faster than SI but also supports LLMs that cannot be\naccelerated with SI. Our simulations show speedups of off-the-shelf LLMs in\nrealistic single-node settings where DSI is 1.29-1.92x faster than SI.\n","authors":["Nadav Timor","Jonathan Mamou","Daniel Korat","Moshe Berchansky","Oren Pereg","Moshe Wasserblat","Tomer Galanti","Michal Gordon","David Harel"],"pdf_url":"https://arxiv.org/pdf/2405.14105v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.21384v2","updated":"2024-09-08T16:42:28Z","published":"2024-07-31T07:15:33Z","title":"GEGA: Graph Convolutional Networks and Evidence Retrieval Guided\n Attention for Enhanced Document-level Relation Extraction","summary":" Document-level relation extraction (DocRE) aims to extract relations between\nentities from unstructured document text. Compared to sentence-level relation\nextraction, it requires more complex semantic understanding from a broader text\ncontext. Currently, some studies are utilizing logical rules within evidence\nsentences to enhance the performance of DocRE. However, in the data without\nprovided evidence sentences, researchers often obtain a list of evidence\nsentences for the entire document through evidence retrieval (ER). Therefore,\nDocRE suffers from two challenges: firstly, the relevance between evidence and\nentity pairs is weak; secondly, there is insufficient extraction of complex\ncross-relations between long-distance multi-entities. To overcome these\nchallenges, we propose GEGA, a novel model for DocRE. The model leverages graph\nneural networks to construct multiple weight matrices, guiding attention\nallocation to evidence sentences. It also employs multi-scale representation\naggregation to enhance ER. Subsequently, we integrate the most efficient\nevidence information to implement both fully supervised and weakly supervised\ntraining processes for the model. We evaluate the GEGA model on three widely\nused benchmark datasets: DocRED, Re-DocRED, and Revisit-DocRED. The\nexperimental results indicate that our model has achieved comprehensive\nimprovements compared to the existing SOTA model.\n","authors":["Yanxu Mao","Xiaohui Chen","Peipei Liu","Tiehan Cui","Zuhui Yue","Zheng Li"],"pdf_url":"https://arxiv.org/pdf/2407.21384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05152v1","updated":"2024-09-08T16:35:19Z","published":"2024-09-08T16:35:19Z","title":"OneGen: Efficient One-Pass Unified Generation and Retrieval for LLMs","summary":" Despite the recent advancements in Large Language Models (LLMs), which have\nsignificantly enhanced the generative capabilities for various NLP tasks, LLMs\nstill face limitations in directly handling retrieval tasks. However, many\npractical applications demand the seamless integration of both retrieval and\ngeneration. 
This paper introduces a novel and efficient One-pass Generation and\nretrieval framework (OneGen), designed to improve LLMs' performance on tasks\nthat require both generation and retrieval. The proposed framework bridges the\ntraditionally separate training approaches for generation and retrieval by\nincorporating retrieval tokens generated autoregressively. This enables a\nsingle LLM to handle both tasks simultaneously in a unified forward pass. We\nconduct experiments on two distinct types of composite tasks, RAG and Entity\nLinking, to validate the pluggability, effectiveness, and efficiency of OneGen\nin training and inference. Furthermore, our results show that integrating\ngeneration and retrieval within the same context preserves the generative\ncapabilities of LLMs while improving retrieval performance. To the best of our\nknowledge, OneGen is the first to enable LLMs to conduct vector retrieval\nduring generation.\n","authors":["Jintian Zhang","Cheng Peng","Mengshu Sun","Xiang Chen","Lei Liang","Zhiqiang Zhang","Jun Zhou","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05152v1.pdf","comment":"Work in progress; code is available at\n https://github.com/zjunlp/OneGen"},{"id":"http://arxiv.org/abs/2409.05148v1","updated":"2024-09-08T16:25:38Z","published":"2024-09-08T16:25:38Z","title":"Better Spanish Emotion Recognition In-the-wild: Bringing Attention to\n Deep Spectrum Voice Analysis","summary":" Within the context of creating new Socially Assistive Robots, emotion\nrecognition has become a key development factor, as it allows the robot to\nadapt to the user's emotional state in the wild. In this work, we focused on\nthe analysis of two voice recording Spanish datasets: ELRA-S0329 and\nEmoMatchSpanishDB. Specifically, we centered our work on paralanguage,\ne.g., the vocal characteristics that go along with the message and clarify\nthe meaning. We proposed the use of the DeepSpectrum method, which consists of\nextracting a visual representation of the audio tracks and feeding them to a\npretrained CNN model. For the classification task, DeepSpectrum is often paired\nwith a Support Vector Classifier --DS-SVC--, or a Fully-Connected deep-learning\nclassifier --DS-FC--. We compared the results of the DS-SVC and DS-FC\narchitectures with the state-of-the-art (SOTA) for ELRA-S0329 and\nEmoMatchSpanishDB. Moreover, we proposed our own classifier based upon\nAttention Mechanisms, namely DS-AM. We trained all models against both\ndatasets, and we found that our DS-AM model outperforms the SOTA models for the\ndatasets and the SOTA DeepSpectrum architectures. Finally, we trained our DS-AM\nmodel on one dataset and tested it on the other, to simulate real-world\nconditions and assess how biased the model is to the dataset.\n","authors":["Elena Ortega-Beltrán","Josep Cabacas-Maso","Ismael Benito-Altamirano","Carles Ventura"],"pdf_url":"https://arxiv.org/pdf/2409.05148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05965v3","updated":"2024-09-08T16:19:53Z","published":"2024-07-08T14:04:58Z","title":"T2VSafetyBench: Evaluating the Safety of Text-to-Video Generative Models","summary":" The recent development of Sora leads to a new era in text-to-video (T2V)\ngeneration. Along with this comes the rising concern about its security risks.\nThe generated videos may contain illegal or unethical content, and there is a\nlack of comprehensive quantitative understanding of their safety, posing a\nchallenge to their reliability and practical deployment. 
Previous evaluations\nprimarily focus on the quality of video generation. While some evaluations of\ntext-to-image models have considered safety, they cover fewer aspects and do\nnot address the unique temporal risk inherent in video generation. To bridge\nthis research gap, we introduce T2VSafetyBench, a new benchmark designed for\nconducting safety-critical assessments of text-to-video models. We define 12\ncritical aspects of video generation safety and construct a malicious prompt\ndataset including real-world prompts, LLM-generated prompts and jailbreak\nattack-based prompts. Based on our evaluation results, we draw several\nimportant findings, including: 1) no single model excels in all aspects, with\ndifferent models showing various strengths; 2) the correlation between GPT-4\nassessments and manual reviews is generally high; 3) there is a trade-off\nbetween the usability and safety of text-to-video generative models. This\nindicates that as the field of video generation rapidly advances, safety risks\nare set to surge, highlighting the urgency of prioritizing video safety. We\nhope that T2VSafetyBench can provide insights for better understanding the\nsafety of video generation in the era of generative AI.\n","authors":["Yibo Miao","Yifan Zhu","Yinpeng Dong","Lijia Yu","Jun Zhu","Xiao-Shan Gao"],"pdf_url":"https://arxiv.org/pdf/2407.05965v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05137v1","updated":"2024-09-08T15:42:48Z","published":"2024-09-08T15:42:48Z","title":"READoc: A Unified Benchmark for Realistic Document Structured Extraction","summary":" Document Structured Extraction (DSE) aims to extract structured content from\nraw documents. Despite the emergence of numerous DSE systems, their unified\nevaluation remains inadequate, significantly hindering the field's advancement.\nThis problem is largely attributed to existing benchmark paradigms, which\nexhibit fragmented and localized characteristics. To address these limitations\nand offer a thorough evaluation of DSE systems, we introduce a novel benchmark\nnamed READoc, which defines DSE as a realistic task of converting unstructured\nPDFs into semantically rich Markdown. The READoc dataset is derived from 2,233\ndiverse and real-world documents from arXiv and GitHub. In addition, we develop\na DSE Evaluation S$^3$uite comprising Standardization, Segmentation and Scoring\nmodules, to conduct a unified evaluation of state-of-the-art DSE approaches. By\nevaluating a range of pipeline tools, expert visual models, and general VLMs,\nwe identify the gap between current work and the unified, realistic DSE\nobjective for the first time. We aspire that READoc will catalyze future\nresearch in DSE, fostering more comprehensive and practical solutions.\n","authors":["Zichao Li","Aizier Abulaiti","Yaojie Lu","Xuanang Chen","Jia Zheng","Hongyu Lin","Xianpei Han","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2409.05137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05136v1","updated":"2024-09-08T15:42:18Z","published":"2024-09-08T15:42:18Z","title":"MHS-STMA: Multimodal Hate Speech Detection via Scalable\n Transformer-Based Multilevel Attention Framework","summary":" Social media has a significant impact on people's lives. Hate speech on\nsocial media has emerged as one of society's most serious issues recently. 
Text\nand pictures are two forms of multimodal data distributed within articles.\nUnimodal analysis has been the primary emphasis of earlier approaches.\nAdditionally, when doing multimodal analysis, researchers neglect to preserve\nthe distinctive qualities associated with each modality. The present article\nsuggests a scalable architecture for multimodal hate content detection called\ntransformer-based multilevel attention (STMA) to address these shortcomings.\nThis architecture consists of three main parts: a combined attention-based deep\nlearning mechanism, a vision attention mechanism encoder, and a caption\nattention-mechanism encoder. To identify hate content, each component uses\nvarious attention processes and uniquely handles multimodal data. Several\nstudies employing multiple assessment criteria on three hate speech datasets:\nHateful memes, MultiOff, and MMHS150K, validate the suggested architecture's\nefficacy. The outcomes demonstrate that on all three datasets, the suggested\nstrategy performs better than the baseline approaches.\n","authors":["Anusha Chhabra","Dinesh Kumar Vishwakarma"],"pdf_url":"https://arxiv.org/pdf/2409.05136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05134v1","updated":"2024-09-08T15:32:17Z","published":"2024-09-08T15:32:17Z","title":"Hate Content Detection via Novel Pre-Processing Sequencing and Ensemble\n Methods","summary":" Social media, particularly Twitter, has seen a significant increase in\nincidents like trolling and hate speech. Thus, identifying hate speech is the\nneed of the hour. This paper introduces a computational framework to curb the\nhate content on the web. Specifically, this study presents an exhaustive study\nof pre-processing approaches by studying the impact of changing the sequence of\ntext pre-processing operations for the identification of hate content. The\nbest-performing pre-processing sequence, when implemented with popular\nclassification approaches like Support Vector Machine, Random Forest, Decision\nTree, Logistic Regression and K-Neighbor provides a considerable boost in\nperformance. Additionally, the best pre-processing sequence is used in\nconjunction with different ensemble methods, such as bagging, boosting and\nstacking to improve the performance further. Three publicly available benchmark\ndatasets (WZ-LS, DT, and FOUNTA), were used to evaluate the proposed approach\nfor hate speech identification. The proposed approach achieves a maximum\naccuracy of 95.14% highlighting the effectiveness of the unique pre-processing\napproach along with an ensemble classifier.\n","authors":["Anusha Chhabra","Dinesh Kumar Vishwakarma"],"pdf_url":"https://arxiv.org/pdf/2409.05134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15737v2","updated":"2024-09-08T15:03:14Z","published":"2024-04-24T08:52:40Z","title":"No Train but Gain: Language Arithmetic for training-free Language\n Adapters enhancement","summary":" Modular deep learning is the state-of-the-art solution for lifting the curse\nof multilinguality, preventing the impact of negative interference and enabling\ncross-lingual performance in Multilingual Pre-trained Language Models. However,\na trade-off of this approach is the reduction in positive transfer learning\nfrom closely related languages. In response, we introduce a novel method called\nlanguage arithmetic, which enables training-free post-processing to address\nthis limitation. 
Extending the task arithmetic framework, we apply learning via\naddition to the language adapters, transitioning the framework from a\nmulti-task to a multilingual setup. The effectiveness of the proposed solution\nis demonstrated on three downstream tasks in a MAD-X-based set of cross-lingual\nschemes, acting as a post-processing procedure. Language arithmetic\nconsistently improves the baselines with significant gains, especially in the\nmost challenging case of zero-shot application. Our code and models are\navailable at https://github.com/mklimasz/language-arithmetic .\n","authors":["Mateusz Klimaszewski","Piotr Andruszkiewicz","Alexandra Birch"],"pdf_url":"https://arxiv.org/pdf/2404.15737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03627v5","updated":"2024-09-08T15:01:56Z","published":"2024-07-04T04:30:04Z","title":"DSLR: Document Refinement with Sentence-Level Re-ranking and\n Reconstruction to Enhance Retrieval-Augmented Generation","summary":" Recent advancements in Large Language Models (LLMs) have significantly\nimproved their performance across various Natural Language Processing (NLP)\ntasks. However, LLMs still struggle with generating non-factual responses due\nto limitations in their parametric memory. Retrieval-Augmented Generation (RAG)\nsystems address this issue by incorporating external knowledge with a retrieval\nmodule. Despite their successes, however, current RAG systems face challenges\nwith retrieval failures and the limited ability of LLMs to filter out\nirrelevant information. Therefore, in this work, we propose DSLR (Document\nRefinement with Sentence-Level Re-ranking and Reconstruction), an unsupervised\nframework that decomposes retrieved documents into sentences, filters out\nirrelevant sentences, and reconstructs them again into coherent passages. We\nexperimentally validate DSLR on multiple open-domain QA datasets and the\nresults demonstrate that DSLR significantly enhances the RAG performance over\nconventional fixed-size passage. Furthermore, our DSLR enhances performance in\nspecific, yet realistic scenarios without the need for additional training,\nproviding an effective and efficient solution for refining retrieved documents\nin RAG systems.\n","authors":["Taeho Hwang","Soyeong Jeong","Sukmin Cho","SeungYoon Han","Jong C. Park"],"pdf_url":"https://arxiv.org/pdf/2407.03627v5.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2409.00124v2","updated":"2024-09-08T14:49:46Z","published":"2024-08-28T17:19:20Z","title":"Leveraging Large Language Models for Wireless Symbol Detection via\n In-Context Learning","summary":" Deep neural networks (DNNs) have made significant strides in tackling\nchallenging tasks in wireless systems, especially when an accurate wireless\nmodel is not available. However, when available data is limited, traditional\nDNNs often yield subpar results due to underfitting. At the same time, large\nlanguage models (LLMs) exemplified by GPT-3, have remarkably showcased their\ncapabilities across a broad range of natural language processing tasks. But\nwhether and how LLMs can benefit challenging non-language tasks in wireless\nsystems is unexplored. In this work, we propose to leverage the in-context\nlearning ability (a.k.a. prompting) of LLMs to solve wireless tasks in the low\ndata regime without any training or fine-tuning, unlike DNNs which require\ntraining. We further demonstrate that the performance of LLMs varies\nsignificantly when employed with different prompt templates. 
To solve this\nissue, we employ the latest LLM calibration methods. Our results reveal that\nusing LLMs via ICL methods generally outperforms traditional DNNs on the symbol\ndemodulation task and yields highly confident predictions when coupled with\ncalibration techniques.\n","authors":["Momin Abbas","Koushik Kar","Tianyi Chen"],"pdf_url":"https://arxiv.org/pdf/2409.00124v2.pdf","comment":"Accepted at IEEE GLOBECOM 2024"},{"id":"http://arxiv.org/abs/2409.05112v1","updated":"2024-09-08T14:45:47Z","published":"2024-09-08T14:45:47Z","title":"WaterSeeker: Efficient Detection of Watermarked Segments in Large\n Documents","summary":" Watermarking algorithms for large language models (LLMs) have attained high\naccuracy in detecting LLM-generated text. However, existing methods primarily\nfocus on distinguishing fully watermarked text from non-watermarked text,\noverlooking real-world scenarios where LLMs generate only small sections within\nlarge documents. In this scenario, balancing time complexity and detection\nperformance poses significant challenges. This paper presents WaterSeeker, a\nnovel approach to efficiently detect and locate watermarked segments amid\nextensive natural text. It first applies an efficient anomaly extraction method\nto preliminarily locate suspicious watermarked regions. Following this, it\nconducts a local traversal and performs full-text detection for more precise\nverification. Theoretical analysis and experimental results demonstrate that\nWaterSeeker achieves a superior balance between detection accuracy and\ncomputational efficiency. Moreover, WaterSeeker's localization ability supports\nthe development of interpretable AI detection systems. This work pioneers a new\ndirection in watermarked segment detection, facilitating more reliable\nAI-generated content identification.\n","authors":["Leyi Pan","Aiwei Liu","Yijian Lu","Zitian Gao","Yichen Di","Lijie Wen","Irwin King","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2409.05112v1.pdf","comment":"18 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2409.05105v1","updated":"2024-09-08T14:29:10Z","published":"2024-09-08T14:29:10Z","title":"EdaCSC: Two Easy Data Augmentation Methods for Chinese Spelling\n Correction","summary":" Chinese Spelling Correction (CSC) aims to detect and correct spelling errors\nin Chinese sentences caused by phonetic or visual similarities. While current\nCSC models integrate pinyin or glyph features and have shown significant\nprogress,they still face challenges when dealing with sentences containing\nmultiple typos and are susceptible to overcorrection in real-world scenarios.\nIn contrast to existing model-centric approaches, we propose two data\naugmentation methods to address these limitations. Firstly, we augment the\ndataset by either splitting long sentences into shorter ones or reducing typos\nin sentences with multiple typos. Subsequently, we employ different training\nprocesses to select the optimal model. 
Experimental evaluations on the SIGHAN\nbenchmarks demonstrate the superiority of our approach over most existing\nmodels, achieving state-of-the-art performance on the SIGHAN15 test set.\n","authors":["Lei Sheng","Shuai-Shuai Xu"],"pdf_url":"https://arxiv.org/pdf/2409.05105v1.pdf","comment":"18 pages, 2 figures"},{"id":"http://arxiv.org/abs/2404.09384v2","updated":"2024-09-08T13:59:39Z","published":"2024-04-14T23:45:23Z","title":"Tasks People Prompt: A Taxonomy of LLM Downstream Tasks in Software\n Verification and Falsification Approaches","summary":" Prompting has become one of the main approaches to leverage emergent\ncapabilities of Large Language Models [Brown et al. NeurIPS 2020, Wei et al.\nTMLR 2022, Wei et al. NeurIPS 2022]. Recently, researchers and practitioners\nhave been \"playing\" with prompts (e.g., In-Context Learning) to see how to make\nthe most of pre-trained Language Models. By homogeneously dissecting more than\na hundred articles, we investigate how software testing and verification\nresearch communities have leveraged LLMs capabilities. First, we validate that\ndownstream tasks are adequate to convey a nontrivial modular blueprint of\nprompt-based proposals in scope. Moreover, we name and classify the concrete\ndownstream tasks we recover in both validation research papers and solution\nproposals. In order to perform classification, mapping, and analysis, we also\ndevelop a novel downstream-task taxonomy. The main taxonomy requirement is to\nhighlight commonalities while exhibiting variation points of task types that\nenable pinpointing emerging patterns in a varied spectrum of Software\nEngineering problems that encompasses testing, fuzzing, fault localization,\nvulnerability detection, static analysis, and program verification approaches.\nAvenues for future research are also discussed based on conceptual clusters\ninduced by the taxonomy.\n","authors":["Víctor A. Braberman","Flavia Bonomo-Braberman","Yiannis Charalambous","Juan G. Colonna","Lucas C. Cordeiro","Rosiane de Freitas"],"pdf_url":"https://arxiv.org/pdf/2404.09384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.18942v2","updated":"2024-09-08T13:12:42Z","published":"2024-04-25T18:48:11Z","title":"GuideWalk: A Novel Graph-Based Word Embedding for Enhanced Text\n Classification","summary":" One of the prime problems of computer science and machine learning is to\nextract information efficiently from large-scale, heterogeneous data. Text\ndata, with its syntax, semantics, and even hidden information content,\npossesses an exceptional place among the data types in concern. The processing\nof the text data requires embedding, a method of translating the content of the\ntext to numeric vectors. A correct embedding algorithm is the starting point\nfor obtaining the full information content of the text data. In this work, a\nnew text embedding approach, namely the Guided Transition Probability Matrix\n(GTPM) model is proposed. The model uses the graph structure of sentences to\ncapture different types of information from text data, such as syntactic,\nsemantic, and hidden content. Using random walks on a weighted word graph, GTPM\ncalculates transition probabilities to derive text embedding vectors. The\nproposed method is tested with real-world data sets and eight well-known and\nsuccessful embedding algorithms. 
GTPM shows significantly better classification\nperformance for binary and multi-class datasets than well-known algorithms.\nAdditionally, the proposed method demonstrates superior robustness, maintaining\nperformance with limited (only $10\\%$) training data, showing an $8\\%$ decline\ncompared to $15-20\\%$ for baseline methods.\n","authors":["Sarmad N. Mohammed","Semra Gündüç"],"pdf_url":"https://arxiv.org/pdf/2404.18942v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08606v2","updated":"2024-09-08T12:24:11Z","published":"2024-06-12T19:22:29Z","title":"A Generative Marker Enhanced End-to-End Framework for Argument Mining","summary":" Argument Mining (AM) involves identifying and extracting Argumentative\nComponents (ACs) and their corresponding Argumentative Relations (ARs). Most of\nthe prior works have broken down these tasks into multiple sub-tasks. Existing\nend-to-end setups primarily use the dependency parsing approach. This work\nintroduces a generative paradigm-based end-to-end framework argTANL. argTANL\nframes the argumentative structures into label-augmented text, called Augmented\nNatural Language (ANL). This framework jointly extracts both ACs and ARs from a\ngiven argumentative text. Additionally, this study explores the impact of\nArgumentative and Discourse markers on enhancing the model's performance within\nthe proposed framework. Two distinct frameworks, Marker-Enhanced argTANL\n(ME-argTANL) and argTANL with specialized Marker-Based Fine-Tuning, are\nproposed to achieve this. Extensive experiments are conducted on three standard\nAM benchmarks to demonstrate the superior performance of the ME-argTANL.\n","authors":["Nilmadhab Das","Vishal Choudhary","V. Vijaya Saradhi","Ashish Anand"],"pdf_url":"https://arxiv.org/pdf/2406.08606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07644v3","updated":"2024-09-08T09:50:13Z","published":"2023-10-11T16:40:57Z","title":"Toward Understanding BERT-Like Pre-Training for DNA Foundation Models","summary":" With the success of large-scale pre-training in language tasks, there is an\nincreasing trend of applying it to the domain of life sciences. In particular,\npre-training methods based on DNA sequences have received increasing attention\nbecause of their potential to capture general information about genes. However,\nexisting pre-training methods for DNA sequences largely rely on direct\nadoptions of BERT pre-training from NLP, lacking a comprehensive understanding\nand a specifically tailored approach. To address this research gap, we provide\nthe first empirical study with three insightful observations. Based on the\nempirical study, we notice that overlapping tokenizer can benefit the\nfine-tuning of downstream tasks but leads to inadequate pre-training with fast\nconvergence. To unleash the pre-training potential, we introduce a novel\napproach called RandomMask, which gradually increases the task difficulty of\nBERT-like pre-training by continuously expanding its mask boundary, forcing the\nmodel to learn more knowledge. RandomMask is simple but effective, achieving\nstate-of-the-art performance across 6 downstream tasks. 
RandomMask achieves a\nstaggering 68.16\\% in Matthew's correlation coefficient for Epigenetic Mark\nPrediction, a groundbreaking increase of 19.85\\% over the baseline and a\nremarkable 3.69\\% improvement over the previous state-of-the-art result.\n","authors":["Chaoqi Liang","Lifeng Qiao","Peng Ye","Nanqing Dong","Jianle Sun","Weiqiang Bai","Yuchen Ren","Xinzhu Ma","Hongliang Yan","Chunfeng Song","Wanli Ouyang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2310.07644v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15639v2","updated":"2024-09-08T08:50:38Z","published":"2024-04-24T04:25:04Z","title":"CodeIP: A Grammar-Guided Multi-Bit Watermark for Large Language Models\n of Code","summary":" Large Language Models (LLMs) have achieved remarkable progress in code\ngeneration. It now becomes crucial to identify whether the code is AI-generated\nand to determine the specific model used, particularly for purposes such as\nprotecting Intellectual Property (IP) in industry and preventing cheating in\nprogramming exercises. To this end, several attempts have been made to insert\nwatermarks into machine-generated code. However, existing approaches are\nlimited to inserting only a single bit of information or overly depending on\nparticular code patterns. In this paper, we introduce CodeIP, a novel multi-bit\nwatermarking technique that embeds additional information to preserve crucial\nprovenance details, such as the vendor ID of an LLM, thereby safeguarding the\nIPs of LLMs in code generation. Furthermore, to ensure the syntactical\ncorrectness of the generated code, we propose constraining the sampling process\nfor predicting the next token by training a type predictor. Experiments\nconducted on a real-world dataset across five programming languages demonstrate\nthe effectiveness of CodeIP in watermarking LLMs for code generation while\nmaintaining the syntactical correctness of code.\n","authors":["Batu Guan","Yao Wan","Zhangqian Bi","Zheng Wang","Hongyu Zhang","Pan Zhou","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2404.15639v2.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2409.05028v1","updated":"2024-09-08T08:46:05Z","published":"2024-09-08T08:46:05Z","title":"LLM-based Abstraction and Concretization for GUI Test Migration","summary":" GUI test migration aims to produce test cases with events and assertions to\ntest specific functionalities of a target app. Existing migration approaches\ntypically focus on the widget-mapping paradigm that maps widgets from source\napps to target apps. However, since different apps may implement the same\nfunctionality in different ways, direct mapping may result in incomplete or\nbuggy test cases, thus significantly impacting the effectiveness of testing\ntarget functionality and the practical applicability.\n In this paper, we propose a new migration paradigm (i.e.,\nabstraction-concretization paradigm) that first abstracts the test logic for\nthe target functionality and then utilizes this logic to generate the concrete\nGUI test case. Furthermore, we introduce MACdroid, the first approach that\nmigrates GUI test cases based on this paradigm. Specifically, we propose an\nabstraction technique that utilizes source test cases from source apps\ntargeting the same functionality to extract a general test logic for that\nfunctionality. 
Then, we propose a concretization technique that utilizes the\ngeneral test logic to guide an LLM in generating the corresponding GUI test\ncase (including events and assertions) for the target app. We evaluate MACdroid\non two widely-used datasets (including 31 apps, 34 functionalities, and 123\ntest cases). On the FrUITeR dataset, the test cases generated by MACdroid\nsuccessfully test 64% of the target functionalities, improving the baselines by\n191%. On the Lin dataset, MACdroid successfully tests 75% of the target\nfunctionalities, outperforming the baselines by 42%. These results underscore\nthe effectiveness of MACdroid in GUI test migration.\n","authors":["Yakun Zhang","Chen Liu","Xiaofei Xie","Yun Lin","Jin Song Dong","Dan Hao","Lu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05021v1","updated":"2024-09-08T08:22:17Z","published":"2024-09-08T08:22:17Z","title":"Vision-fused Attack: Advancing Aggressive and Stealthy Adversarial Text\n against Neural Machine Translation","summary":" While neural machine translation (NMT) models achieve success in our daily\nlives, they show vulnerability to adversarial attacks. Despite being harmful,\nthese attacks also offer benefits for interpreting and enhancing NMT models,\nthus drawing increased research attention. However, existing studies on\nadversarial attacks are insufficient in both attacking ability and human\nimperceptibility due to their sole focus on the scope of language. This paper\nproposes a novel vision-fused attack (VFA) framework to acquire powerful\nadversarial text, i.e., more aggressive and stealthy. Regarding the attacking\nability, we design the vision-merged solution space enhancement strategy to\nenlarge the limited semantic solution space, which enables us to search for\nadversarial candidates with higher attacking ability. For human\nimperceptibility, we propose the perception-retained adversarial text selection\nstrategy to align the human text-reading mechanism. Thus, the finally selected\nadversarial text could be more deceptive. Extensive experiments on various\nmodels, including large language models (LLMs) like LLaMA and GPT-3.5, strongly\nsupport that VFA outperforms the comparisons by large margins (up to 81%/14%\nimprovements on ASR/SSIM).\n","authors":["Yanni Xue","Haojie Hao","Jiakai Wang","Qiang Sheng","Renshuai Tao","Yu Liang","Pu Feng","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2409.05021v1.pdf","comment":"IJCAI 2024"},{"id":"http://arxiv.org/abs/2408.13985v3","updated":"2024-09-08T07:44:45Z","published":"2024-08-26T02:35:37Z","title":"TF-Attack: Transferable and Fast Adversarial Attacks on Large Language\n Models","summary":" With the great advancements in large language models (LLMs), adversarial\nattacks against LLMs have recently attracted increasing attention. We found\nthat pre-existing adversarial attack methodologies exhibit limited\ntransferability and are notably inefficient, particularly when applied to LLMs.\nIn this paper, we analyze the core mechanisms of previous predominant\nadversarial attack methods, revealing that 1) the distributions of importance\nscore differ markedly among victim models, restricting the transferability; 2)\nthe sequential attack processes induces substantial time overheads. Based on\nthe above two insights, we introduce a new scheme, named TF-Attack, for\nTransferable and Fast adversarial attacks on LLMs. 
TF-Attack employs an\nexternal LLM as a third-party overseer rather than the victim model to identify\ncritical units within sentences. Moreover, TF-Attack introduces the concept of\nImportance Level, which allows for parallel substitutions of attacks. We\nconduct extensive experiments on 6 widely adopted benchmarks, evaluating the\nproposed method through both automatic and human metrics. Results show that our\nmethod consistently surpasses previous methods in transferability and delivers\nsignificant speed improvements, up to 20 times faster than earlier attack\nstrategies.\n","authors":["Zelin Li","Kehai Chen","Lemao Liu","Xuefeng Bai","Mingming Yang","Yang Xiang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.13985v3.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.05005v1","updated":"2024-09-08T07:26:13Z","published":"2024-09-08T07:26:13Z","title":"Towards Patronizing and Condescending Language in Chinese Videos: A\n Multimodal Dataset and Detector","summary":" Patronizing and Condescending Language (PCL) is a form of discriminatory\ntoxic speech targeting vulnerable groups, threatening both online and offline\nsafety. While toxic speech research has mainly focused on overt toxicity, such\nas hate speech, microaggressions in the form of PCL remain underexplored.\nAdditionally, dominant groups' discriminatory facial expressions and attitudes\ntoward vulnerable communities can be more impactful than verbal cues, yet these\nframe features are often overlooked. In this paper, we introduce the PCLMM\ndataset, the first Chinese multimodal dataset for PCL, consisting of 715\nannotated videos from Bilibili, with high-quality PCL facial frame spans. We\nalso propose the MultiPCL detector, featuring a facial expression detection\nmodule for PCL recognition, demonstrating the effectiveness of modality\ncomplementarity in this challenging task. Our work makes an important\ncontribution to advancing microaggression detection within the domain of toxic\nspeech.\n","authors":["Hongbo Wang","Junyu Lu","Yan Han","Liang Yang","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2409.05005v1.pdf","comment":"Under review in ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.04992v1","updated":"2024-09-08T06:06:44Z","published":"2024-09-08T06:06:44Z","title":"InstInfer: In-Storage Attention Offloading for Cost-Effective\n Long-Context LLM Inference","summary":" The widespread of Large Language Models (LLMs) marks a significant milestone\nin generative AI. Nevertheless, the increasing context length and batch size in\noffline LLM inference escalate the memory requirement of the key-value (KV)\ncache, which imposes a huge burden on the GPU VRAM, especially for\nresource-constraint scenarios (e.g., edge computing and personal devices).\nSeveral cost-effective solutions leverage host memory or SSDs to reduce storage\ncosts for offline inference scenarios and improve the throughput. Nevertheless,\nthey suffer from significant performance penalties imposed by intensive KV\ncache accesses due to limited PCIe bandwidth. To address these issues, we\npropose InstInfer, a novel LLM inference system that offloads the most\nperformance-critical computation (i.e., attention in decoding phase) and data\n(i.e., KV cache) parts to Computational Storage Drives (CSDs), which minimize\nthe enormous KV transfer overheads. 
InstInfer designs a dedicated flash-aware\nin-storage attention engine with KV cache management mechanisms to exploit the\nhigh internal bandwidths of CSDs instead of being limited by the PCIe\nbandwidth. The optimized P2P transmission between GPU and CSDs further reduces\ndata migration overheads. Experimental results demonstrate that for a 13B model\nusing an NVIDIA A6000 GPU, InstInfer improves throughput for long-sequence\ninference by up to 11.1$\\times$, compared to existing SSD-based solutions such\nas FlexGen.\n","authors":["Xiurui Pan","Endian Li","Qiao Li","Shengwen Liang","Yizhou Shan","Ke Zhou","Yingwei Luo","Xiaolin Wang","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.04992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04964v1","updated":"2024-09-08T04:03:55Z","published":"2024-09-08T04:03:55Z","title":"Evaluation of Google Translate for Mandarin Chinese translation using\n sentiment and semantic analysis","summary":" Machine translation using large language models (LLMs) is having a\nsignificant global impact, making communication easier. Mandarin Chinese is the\nofficial language used for communication by the government, education\ninstitutes, and media in China. In this study, we provide an automated\nassessment of machine translation models with human experts using sentiment and\nsemantic analysis. In order to demonstrate our framework, we select the classic\nearly twentieth-century novel 'The True Story of Ah Q' with selected Mandarin\nChinese to English translations. We also use Google Translate to translate the\ngiven text into English and then conduct a chapter-wise sentiment analysis and\nsemantic analysis to compare the extracted sentiments across the different\ntranslations. We utilise LLMs for semantic and sentiment analysis. Our results\nindicate that the precision of Google Translate differs both in terms of\nsemantic and sentiment analysis when compared to human expert translations. We\nfind that Google Translate is unable to translate some of the specific words or\nphrases in Chinese, such as Chinese traditional allusions. The mistranslations\nare due to its lack of contextual significance and historical knowledge of China.\nThus, this framework brought us some new insights about machine translation for\nChinese Mandarin. Future work can explore other languages or types of texts\nwith this framework.\n","authors":["Xuechun Wang","Rodney Beard","Rohitash Chandra"],"pdf_url":"https://arxiv.org/pdf/2409.04964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00727v2","updated":"2024-09-08T02:42:07Z","published":"2024-08-01T17:18:17Z","title":"Improving Retrieval-Augmented Generation in Medicine with Iterative\n Follow-up Questions","summary":" The emergent abilities of large language models (LLMs) have demonstrated\ngreat potential in solving medical questions. They can possess considerable\nmedical knowledge, but may still hallucinate and are inflexible in the\nknowledge updates. While Retrieval-Augmented Generation (RAG) has been proposed\nto enhance the medical question-answering capabilities of LLMs with external\nknowledge bases, it may still fail in complex cases where multiple rounds of\ninformation-seeking are required. To address such an issue, we propose\niterative RAG for medicine (i-MedRAG), where LLMs can iteratively ask follow-up\nqueries based on previous information-seeking attempts. 
In each iteration of\ni-MedRAG, the follow-up queries will be answered by a conventional RAG system\nand they will be further used to guide the query generation in the next\niteration. Our experiments show the improved performance of various LLMs\nbrought by i-MedRAG compared with conventional RAG on complex questions from\nclinical vignettes in the United States Medical Licensing Examination (USMLE),\nas well as various knowledge tests in the Massive Multitask Language\nUnderstanding (MMLU) dataset. Notably, our zero-shot i-MedRAG outperforms all\nexisting prompt engineering and fine-tuning methods on GPT-3.5, achieving an\naccuracy of 69.68\\% on the MedQA dataset. In addition, we characterize the\nscaling properties of i-MedRAG with different iterations of follow-up queries\nand different numbers of queries per iteration. Our case studies show that\ni-MedRAG can flexibly ask follow-up queries to form reasoning chains, providing\nan in-depth analysis of medical questions. To the best of our knowledge, this\nis the first-of-its-kind study on incorporating follow-up queries into medical\nRAG.\n","authors":["Guangzhi Xiong","Qiao Jin","Xiao Wang","Minjia Zhang","Zhiyong Lu","Aidong Zhang"],"pdf_url":"https://arxiv.org/pdf/2408.00727v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15928v2","updated":"2024-09-08T00:54:58Z","published":"2024-04-24T15:38:22Z","title":"Generalization Measures for Zero-Shot Cross-Lingual Transfer","summary":" A model's capacity to generalize its knowledge to interpret unseen inputs\nwith different characteristics is crucial to build robust and reliable machine\nlearning systems. Language model evaluation tasks lack information metrics\nabout model generalization and their applicability in a new setting is measured\nusing task and language-specific downstream performance, which is often lacking\nin many languages and tasks. In this paper, we explore a set of efficient and\nreliable measures that could aid in computing more information related to the\ngeneralization capability of language models in cross-lingual zero-shot\nsettings. In addition to traditional measures such as variance in parameters\nafter training and distance from initialization, we also measure the\neffectiveness of sharpness in loss landscape in capturing the success in\ncross-lingual transfer and propose a novel and stable algorithm to reliably\ncompute the sharpness of a model optimum that correlates to generalization.\n","authors":["Saksham Bassi","Duygu Ataman","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2404.15928v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.05243v1","updated":"2024-09-08T23:09:22Z","published":"2024-09-08T23:09:22Z","title":"Mamba-Enhanced Text-Audio-Video Alignment Network for Emotion\n Recognition in Conversations","summary":" Emotion Recognition in Conversations (ERCs) is a vital area within multimodal\ninteraction research, dedicated to accurately identifying and classifying the\nemotions expressed by speakers throughout a conversation. Traditional ERC\napproaches predominantly rely on unimodal cues\\-such as text, audio, or visual\ndata\\-leading to limitations in their effectiveness. These methods encounter\ntwo significant challenges: 1) Consistency in multimodal information. Before\nintegrating various modalities, it is crucial to ensure that the data from\ndifferent sources is aligned and coherent. 
2) Contextual information capture.\nSuccessfully fusing multimodal features requires a keen understanding of the\nevolving emotional tone, especially in lengthy dialogues where emotions may\nshift and develop over time. To address these limitations, we propose a novel\nMamba-enhanced Text-Audio-Video alignment network (MaTAV) for the ERC task.\nMaTAV is with the advantages of aligning unimodal features to ensure\nconsistency across different modalities and handling long input sequences to\nbetter capture contextual multimodal information. The extensive experiments on\nthe MELD and IEMOCAP datasets demonstrate that MaTAV significantly outperforms\nexisting state-of-the-art methods on the ERC task with a big margin.\n","authors":["Xinran Li","Xiaomao Fan","Qingyang Wu","Xiaojiang Peng","Ye Li"],"pdf_url":"https://arxiv.org/pdf/2409.05243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05230v1","updated":"2024-09-08T22:08:36Z","published":"2024-09-08T22:08:36Z","title":"A Low-Computational Video Synopsis Framework with a Standard Dataset","summary":" Video synopsis is an efficient method for condensing surveillance videos.\nThis technique begins with the detection and tracking of objects, followed by\nthe creation of object tubes. These tubes consist of sequences, each containing\nchronologically ordered bounding boxes of a unique object. To generate a\ncondensed video, the first step involves rearranging the object tubes to\nmaximize the number of non-overlapping objects in each frame. Then, these tubes\nare stitched to a background image extracted from the source video. The lack of\na standard dataset for the video synopsis task hinders the comparison of\ndifferent video synopsis models. This paper addresses this issue by introducing\na standard dataset, called SynoClip, designed specifically for the video\nsynopsis task. SynoClip includes all the necessary features needed to evaluate\nvarious models directly and effectively. Additionally, this work introduces a\nvideo synopsis model, called FGS, with low computational cost. The model\nincludes an empty-frame object detector to identify frames empty of any\nobjects, facilitating efficient utilization of the deep object detector.\nMoreover, a tube grouping algorithm is proposed to maintain relationships among\ntubes in the synthesized video. This is followed by a greedy tube rearrangement\nalgorithm, which efficiently determines the start time of each tube. Finally,\nthe proposed model is evaluated using the proposed dataset. The source code,\nfine-tuned object detection model, and tutorials are available at\nhttps://github.com/Ramtin-ma/VideoSynopsis-FGS.\n","authors":["Ramtin Malekpour","M. Mehrdad Morsali","Hoda Mohammadzade"],"pdf_url":"https://arxiv.org/pdf/2409.05230v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.05225v1","updated":"2024-09-08T21:43:54Z","published":"2024-09-08T21:43:54Z","title":"Comparison of Two Augmentation Methods in Improving Detection Accuracy\n of Hemarthrosis","summary":" With the increase of computing power, machine learning models in medical\nimaging have been introduced to help in rending medical diagnosis and\ninspection, like hemophilia, a rare disorder in which blood cannot clot\nnormally. Often, one of the bottlenecks of detecting hemophilia is the lack of\ndata available to train the algorithm to increase the accuracy. 
As a possible\nsolution, this research investigated whether introducing augmented data by data\nsynthesis or traditional augmentation techniques can improve model accuracy,\nhelping to diagnose the diseases. To tackle this research, features of\nultrasound images were extracted by the pre-trained VGG-16, and similarities\nwere compared by cosine similarity measure based on extracted features in\ndifferent distributions among real images, synthetic images, and augmentation\nimages (Real vs. Real, Syn vs. Syn, Real vs. Different Batches of Syn, Real vs.\nAugmentation Techniques). Model testing performance was investigated using\nEfficientNet-B4 to recognize \"blood\" images with two augmentation methods. In\naddition, a gradient-weighted class activation mapping (Grad-CAM) visualization\nwas used to interpret the unexpected results like loss of accuracy. Synthetic\nand real images do not show high similarity, with a mean similarity score of\n0.4737. Synthetic batch 1 dataset and images by horizontal flip are more\nsimilar to the original images. Classic augmentation techniques and data\nsynthesis can improve model accuracy, and data by traditional augmentation\ntechniques have a better performance than synthetic data. In addition, the\nGrad-CAM heatmap showed that the loss of accuracy is due to a shift in the\ndomain. Overall, this research found that two augmentation methods, data\nsynthesis and traditional augmentation techniques, both can improve accuracy to\na certain extent to help to diagnose rare diseases.\n","authors":["Qianyu Fan","Pascal N. Tyrrell"],"pdf_url":"https://arxiv.org/pdf/2409.05225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05202v1","updated":"2024-09-08T19:32:22Z","published":"2024-09-08T19:32:22Z","title":"A Survey on Mixup Augmentations and Beyond","summary":" As Deep Neural Networks have achieved thrilling breakthroughs in the past\ndecade, data augmentations have garnered increasing attention as regularization\ntechniques when massive labeled data are unavailable. Among existing\naugmentations, Mixup and relevant data-mixing methods that convexly combine\nselected samples and the corresponding labels are widely adopted because they\nyield high performances by generating data-dependent virtual data while easily\nmigrating to various domains. This survey presents a comprehensive review of\nfoundational mixup methods and their applications. We first elaborate on the\ntraining pipeline with mixup augmentations as a unified framework containing\nmodules. A reformulated framework could contain various mixup methods and give\nintuitive operational procedures. Then, we systematically investigate the\napplications of mixup augmentations on vision downstream tasks, various data\nmodalities, and some analysis \\& theorems of mixup. Meanwhile, we conclude the\ncurrent status and limitations of mixup research and point out further work for\neffective and efficient mixup augmentations. This survey can provide\nresearchers with the current state of the art in mixup methods and provide some\ninsights and guidance roles in the mixup arena. An online project with this\nsurvey is available at \\url{https://github.com/Westlake-AI/Awesome-Mixup}.\n","authors":["Xin Jin","Hongyu Zhu","Siyuan Li","Zedong Wang","Zicheng Liu","Chang Yu","Huafeng Qin","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2409.05202v1.pdf","comment":"Preprint V1 with 27 pages main text. 
Online project at\n https://github.com/Westlake-AI/Awesome-Mixup"},{"id":"http://arxiv.org/abs/2409.05200v1","updated":"2024-09-08T19:24:38Z","published":"2024-09-08T19:24:38Z","title":"Lung-DETR: Deformable Detection Transformer for Sparse Lung Nodule\n Anomaly Detection","summary":" Accurate lung nodule detection for computed tomography (CT) scan imagery is\nchallenging in real-world settings due to the sparse occurrence of nodules and\nsimilarity to other anatomical structures. In a typical positive case, nodules\nmay appear in as few as 3% of CT slices, complicating detection. To address\nthis, we reframe the problem as an anomaly detection task, targeting rare\nnodule occurrences in a predominantly normal dataset. We introduce a novel\nsolution leveraging custom data preprocessing and Deformable Detection\nTransformer (Deformable- DETR). A 7.5mm Maximum Intensity Projection (MIP) is\nutilized to combine adjacent lung slices into single images, reducing the slice\ncount and decreasing nodule sparsity. This enhances spatial context, allowing\nfor better differentiation between nodules and other structures such as complex\nvascular structures and bronchioles. Deformable-DETR is employed to detect\nnodules, with a custom focal loss function to better handle the imbalanced\ndataset. Our model achieves state-of-the-art performance on the LUNA16 dataset\nwith an F1 score of 94.2% (95.2% recall, 93.3% precision) on a dataset sparsely\npopulated with lung nodules that is reflective of real-world clinical data.\n","authors":["Hooman Ramezani","Dionne Aleman","Daniel Létourneau"],"pdf_url":"https://arxiv.org/pdf/2409.05200v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2405.19612v2","updated":"2024-09-08T20:32:47Z","published":"2024-05-30T02:00:03Z","title":"Keyword-driven Retrieval-Augmented Large Language Models for Cold-start\n User Recommendations","summary":" Recent advancements in Large Language Models (LLMs) have shown significant\npotential in enhancing recommender systems. However, addressing the cold-start\nrecommendation problem, where users lack historical data, remains a\nconsiderable challenge. In this paper, we introduce KALM4Rec (Keyword-driven\nRetrieval-Augmented Large Language Models for Cold-start User Recommendations),\na novel framework specifically designed to tackle this problem by requiring\nonly a few input keywords from users in a practical scenario of cold-start user\nrestaurant recommendations. KALM4Rec operates in two main stages: candidates\nretrieval and LLM-based candidates re-ranking. In the first stage,\nkeyword-driven retrieval models are used to identify potential candidates,\naddressing LLMs' limitations in processing extensive tokens and reducing the\nrisk of generating misleading information. In the second stage, we employ LLMs\nwith various prompting strategies, including zero-shot and few-shot techniques,\nto re-rank these candidates by integrating multiple examples directly into the\nLLM prompts. Our evaluation, using a Yelp restaurant dataset with user reviews\nfrom three English-speaking cities, shows that our proposed framework\nsignificantly improves recommendation quality. Specifically, the integration of\nin-context instructions with LLMs for re-ranking markedly enhances the\nperformance of the cold-start user recommender system.\n","authors":["Hai-Dang Kieu","Minh Duc Nguyen","Thanh-Son Nguyen","Dung D. 
Le"],"pdf_url":"https://arxiv.org/pdf/2405.19612v2.pdf","comment":"10 pages, 10 figures, 4 tables"},{"id":"http://arxiv.org/abs/2409.05152v1","updated":"2024-09-08T16:35:19Z","published":"2024-09-08T16:35:19Z","title":"OneGen: Efficient One-Pass Unified Generation and Retrieval for LLMs","summary":" Despite the recent advancements in Large Language Models (LLMs), which have\nsignificantly enhanced the generative capabilities for various NLP tasks, LLMs\nstill face limitations in directly handling retrieval tasks. However, many\npractical applications demand the seamless integration of both retrieval and\ngeneration. This paper introduces a novel and efficient One-pass Generation and\nretrieval framework (OneGen), designed to improve LLMs' performance on tasks\nthat require both generation and retrieval. The proposed framework bridges the\ntraditionally separate training approaches for generation and retrieval by\nincorporating retrieval tokens generated autoregressively. This enables a\nsingle LLM to handle both tasks simultaneously in a unified forward pass. We\nconduct experiments on two distinct types of composite tasks, RAG and Entity\nLinking, to validate the pluggability, effectiveness, and efficiency of OneGen\nin training and inference. Furthermore, our results show that integrating\ngeneration and retrieval within the same context preserves the generative\ncapabilities of LLMs while improving retrieval performance. To the best of our\nknowledge, OneGen is the first to enable LLMs to conduct vector retrieval\nduring the generation.\n","authors":["Jintian Zhang","Cheng Peng","Mengshu Sun","Xiang Chen","Lei Liang","Zhiqiang Zhang","Jun Zhou","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05152v1.pdf","comment":"Work in progress; code is available at\n https://github.com/zjunlp/OneGen"},{"id":"http://arxiv.org/abs/2409.05033v1","updated":"2024-09-08T08:57:12Z","published":"2024-09-08T08:57:12Z","title":"A Survey on Diffusion Models for Recommender Systems","summary":" While traditional recommendation techniques have made significant strides in\nthe past decades, they still suffer from limited generalization performance\ncaused by factors like inadequate collaborative signals, weak latent\nrepresentations, and noisy data. In response, diffusion models (DMs) have\nemerged as promising solutions for recommender systems due to their robust\ngenerative capabilities, solid theoretical foundations, and improved training\nstability. To this end, in this paper, we present the first comprehensive\nsurvey on diffusion models for recommendation, and draw a bird's-eye view from\nthe perspective of the whole pipeline in real-world recommender systems. We\nsystematically categorize existing research works into three primary domains:\n(1) diffusion for data engineering & encoding, focusing on data augmentation\nand representation enhancement; (2) diffusion as recommender models, employing\ndiffusion models to directly estimate user preferences and rank items; and (3)\ndiffusion for content presentation, utilizing diffusion models to generate\npersonalized content such as fashion and advertisement creatives. Our taxonomy\nhighlights the unique strengths of diffusion models in capturing complex data\ndistributions and generating high-quality, diverse samples that closely align\nwith user preferences. 
We also summarize the core characteristics of the\nadapting diffusion models for recommendation, and further identify key areas\nfor future exploration, which helps establish a roadmap for researchers and\npractitioners seeking to advance recommender systems through the innovative\napplication of diffusion models. To further facilitate the research community\nof recommender systems based on diffusion models, we actively maintain a GitHub\nrepository for papers and other related resources in this rising direction\nhttps://github.com/CHIANGEL/Awesome-Diffusion-for-RecSys.\n","authors":["Jianghao Lin","Jiaqi Liu","Jiachen Zhu","Yunjia Xi","Chengkai Liu","Yangtian Zhang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.05033v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2409.05022v1","updated":"2024-09-08T08:27:22Z","published":"2024-09-08T08:27:22Z","title":"Sequential Recommendation via Adaptive Robust Attention with\n Multi-dimensional Embeddings","summary":" Sequential recommendation models have achieved state-of-the-art performance\nusing self-attention mechanism. It has since been found that moving beyond only\nusing item ID and positional embeddings leads to a significant accuracy boost\nwhen predicting the next item. In recent literature, it was reported that a\nmulti-dimensional kernel embedding with temporal contextual kernels to capture\nusers' diverse behavioral patterns results in a substantial performance\nimprovement. In this study, we further improve the sequential recommender\nmodel's robustness and generalization by introducing a mix-attention mechanism\nwith a layer-wise noise injection (LNI) regularization. We refer to our\nproposed model as adaptive robust sequential recommendation framework (ADRRec),\nand demonstrate through extensive experiments that our model outperforms\nexisting self-attention architectures.\n","authors":["Linsey Pang","Amir Hossein Raffiee","Wei Liu","Keld Lundgaard"],"pdf_url":"https://arxiv.org/pdf/2409.05022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11119v2","updated":"2024-09-08T04:25:32Z","published":"2024-04-17T07:07:41Z","title":"DREAM: A Dual Representation Learning Model for Multimodal\n Recommendation","summary":" Multimodal recommendation focuses primarily on effectively exploiting both\nbehavioral and multimodal information for the recommendation task. However,\nmost existing models suffer from the following issues when fusing information\nfrom two different domains: (1) Previous works do not pay attention to the\nsufficient utilization of modal information by only using direct concatenation,\naddition, or simple linear layers for modal information extraction. (2)\nPrevious works treat modal features as learnable embeddings, which causes the\nmodal embeddings to gradually deviate from the original modal features during\nlearning. We refer to this issue as Modal Information Forgetting. (3) Previous\napproaches fail to account for the significant differences in the distribution\nbetween behavior and modality, leading to the issue of representation\nmisalignment. To address these challenges, this paper proposes a novel Dual\nREpresentAtion learning model for Multimodal Recommendation called DREAM. For\nsufficient information extraction, we introduce separate dual lines, including\nBehavior Line and Modal Line, in which the Modal-specific Encoder is applied to\nempower modal representations. 
To address the issue of Modal Information\nForgetting, we introduce the Similarity Supervised Signal to constrain the\nmodal representations. Additionally, we design a Behavior-Modal Alignment\nmodule to fuse the dual representations through Intra-Alignment and\nInter-Alignment. Extensive experiments on three public datasets demonstrate\nthat the proposed DREAM method achieves state-of-the-art (SOTA) results. The\nsource code will be available upon acceptance.\n","authors":["Kangning Zhang","Yingjie Qin","Jiarui Jin","Yifan Liu","Ruilong Su","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2404.11119v2.pdf","comment":"10 pages, 11 figures"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.03253v2","updated":"2024-09-08T23:58:44Z","published":"2024-09-05T05:13:28Z","title":"SpinMultiNet: Neural Network Potential Incorporating Spin Degrees of\n Freedom with Multi-Task Learning","summary":" Neural Network Potentials (NNPs) have attracted significant attention as a\nmethod for accelerating density functional theory (DFT) calculations. However,\nconventional NNP models typically do not incorporate spin degrees of freedom,\nlimiting their applicability to systems where spin states critically influence\nmaterial properties, such as transition metal oxides. This study introduces\nSpinMultiNet, a novel NNP model that integrates spin degrees of freedom through\nmulti-task learning. SpinMultiNet achieves accurate predictions without relying\non correct spin values obtained from DFT calculations. Instead, it utilizes\ninitial spin estimates as input and leverages multi-task learning to optimize\nthe spin latent representation while maintaining both $E(3)$ and time-reversal\nequivariance. Validation on a dataset of transition metal oxides demonstrates\nthe high predictive accuracy of SpinMultiNet. The model successfully reproduces\nthe energy ordering of stable spin configurations originating from\nsuperexchange interactions and accurately captures the rhombohedral distortion\nof the rocksalt structure. These results pave the way for new possibilities in\nmaterials simulations that consider spin degrees of freedom, promising future\napplications in large-scale simulations of various material systems, including\nmagnetic materials.\n","authors":["Koki Ueno","Satoru Ohuchi","Kazuhide Ichikawa","Kei Amii","Kensuke Wakasugi"],"pdf_url":"https://arxiv.org/pdf/2409.03253v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2407.11039v3","updated":"2024-09-08T23:10:59Z","published":"2024-07-06T02:04:17Z","title":"Balancing Immediate Revenue and Future Off-Policy Evaluation in Coupon\n Allocation","summary":" Coupon allocation drives customer purchases and boosts revenue. However, it\npresents a fundamental trade-off between exploiting the current optimal policy\nto maximize immediate revenue and exploring alternative policies to collect\ndata for future policy improvement via off-policy evaluation (OPE). To balance\nthis trade-off, we propose a novel approach that combines a model-based revenue\nmaximization policy and a randomized exploration policy for data collection.\nOur framework enables flexible adjustment of the mixture ratio between these\ntwo policies to optimize the balance between short-term revenue and future\npolicy improvement. We formulate the problem of determining the optimal mixture\nratio as multi-objective optimization, enabling quantitative evaluation of this\ntrade-off. 
We empirically verified the effectiveness of the proposed mixed\npolicy using synthetic data. Our main contributions are: (1) Demonstrating a\nmixed policy combining deterministic and probabilistic policies, flexibly\nadjusting the data collection vs. revenue trade-off. (2) Formulating the\noptimal mixture ratio problem as multi-objective optimization, enabling\nquantitative evaluation of this trade-off.\n","authors":["Naoki Nishimura","Ken Kobayashi","Kazuhide Nakata"],"pdf_url":"https://arxiv.org/pdf/2407.11039v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12839v3","updated":"2024-09-08T22:48:07Z","published":"2024-06-18T17:56:10Z","title":"Evaluating the design space of diffusion-based generative models","summary":" Most existing theoretical investigations of the accuracy of diffusion models,\nalbeit significant, assume the score function has been approximated to a\ncertain accuracy, and then use this a priori bound to control the error of\ngeneration. This article instead provides a first quantitative understanding of\nthe whole generation process, i.e., both training and sampling. More precisely,\nit conducts a non-asymptotic convergence analysis of denoising score matching\nunder gradient descent. In addition, a refined sampling error analysis for\nvariance exploding models is also provided. The combination of these two\nresults yields a full error analysis, which elucidates (again, but this time\ntheoretically) how to design the training and sampling processes for effective\ngeneration. For instance, our theory implies a preference toward noise\ndistribution and loss weighting in training that qualitatively agree with the\nones used in [Karras et al. 2022]. It also provides perspectives on the choices\nof time and variance schedules in sampling: when the score is well trained, the\ndesign in [Song et al. 2020] is more preferable, but when it is less trained,\nthe design in [Karras et al. 2022] becomes more preferable.\n","authors":["Yuqing Wang","Ye He","Molei Tao"],"pdf_url":"https://arxiv.org/pdf/2406.12839v3.pdf","comment":"Comments are welcome. Out of admiration we titled our paper after\n EDM, and hoped theorists' humor is not too corny"},{"id":"http://arxiv.org/abs/2409.05234v1","updated":"2024-09-08T22:27:50Z","published":"2024-09-08T22:27:50Z","title":"Empowering Bayesian Neural Networks with Functional Priors through\n Anchored Ensembling for Mechanics Surrogate Modeling Applications","summary":" In recent years, neural networks (NNs) have become increasingly popular for\nsurrogate modeling tasks in mechanics and materials modeling applications.\nWhile traditional NNs are deterministic functions that rely solely on data to\nlearn the input--output mapping, casting NN training within a Bayesian\nframework allows to quantify uncertainties, in particular epistemic\nuncertainties that arise from lack of training data, and to integrate a priori\nknowledge via the Bayesian prior. However, the high dimensionality and\nnon-physicality of the NN parameter space, and the complex relationship between\nparameters (NN weights) and predicted outputs, renders both prior design and\nposterior inference challenging. In this work we present a novel BNN training\nscheme based on anchored ensembling that can integrate a priori information\navailable in the function space, from e.g. low-fidelity models. The anchoring\nscheme makes use of low-rank correlations between NN parameters, learnt from\npre-training to realizations of the functional prior. 
We also perform a study\nto demonstrate how correlations between NN weights, which are often neglected\nin existing BNN implementations, is critical to appropriately transfer\nknowledge between the function-space and parameter-space priors. Performance of\nour novel BNN algorithm is first studied on a small 1D example to illustrate\nthe algorithm's behavior in both interpolation and extrapolation settings.\nThen, a thorough assessment is performed on a multi--input--output materials\nsurrogate modeling example, where we demonstrate the algorithm's capabilities\nboth in terms of accuracy and quality of the uncertainty estimation, for both\nin-distribution and out-of-distribution data.\n","authors":["Javad Ghorbanian","Nicholas Casaprima","Audrey Olivier"],"pdf_url":"https://arxiv.org/pdf/2409.05234v1.pdf","comment":"24 pages, 14 figures"},{"id":"http://arxiv.org/abs/2309.10831v4","updated":"2024-09-08T22:01:53Z","published":"2023-09-18T18:05:35Z","title":"Actively Learning Reinforcement Learning: A Stochastic Optimal Control\n Approach","summary":" In this paper we propose a framework towards achieving two intertwined\nobjectives: (i) equipping reinforcement learning with active exploration and\ndeliberate information gathering, such that it regulates state and parameter\nuncertainties resulting from modeling mismatches and noisy sensory; and (ii)\novercoming the computational intractability of stochastic optimal control. We\napproach both objectives by using reinforcement learning to compute the\nstochastic optimal control law. On one hand, we avoid the curse of\ndimensionality prohibiting the direct solution of the stochastic dynamic\nprogramming equation. On the other hand, the resulting stochastic optimal\ncontrol reinforcement learning agent admits caution and probing, that is,\noptimal online exploration and exploitation. Unlike fixed exploration and\nexploitation balance, caution and probing are employed automatically by the\ncontroller in real-time, even after the learning process is terminated. We\nconclude the paper with a numerical simulation, illustrating how a Linear\nQuadratic Regulator with the certainty equivalence assumption may lead to poor\nperformance and filter divergence, while our proposed approach is stabilizing,\nof an acceptable performance, and computationally convenient.\n","authors":["Mohammad S. Ramadan","Mahmoud A. Hayajnh","Michael T. Tolley","Kyriakos G. Vamvoudakis"],"pdf_url":"https://arxiv.org/pdf/2309.10831v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13440v2","updated":"2024-09-08T21:47:29Z","published":"2024-08-24T02:40:28Z","title":"Knowledge-Aware Conversation Derailment Forecasting Using Graph\n Convolutional Networks","summary":" Online conversations are particularly susceptible to derailment, which can\nmanifest itself in the form of toxic communication patterns including\ndisrespectful comments and abuse. Forecasting conversation derailment predicts\nsigns of derailment in advance enabling proactive moderation of conversations.\nState-of-the-art approaches to conversation derailment forecasting sequentially\nencode conversations and use graph neural networks to model dialogue user\ndynamics. However, existing graph models are not able to capture complex\nconversational characteristics such as context propagation and emotional\nshifts. The use of common sense knowledge enables a model to capture such\ncharacteristics, thus improving performance. 
Following this approach, here we\nderive commonsense statements from a knowledge base of dialogue contextual\ninformation to enrich a graph neural network classification architecture. We\nfuse the multi-source information on utterance into capsules, which are used by\na transformer-based forecaster to predict conversation derailment. Our model\ncaptures conversation dynamics and context propagation, outperforming the\nstate-of-the-art models on the CGA and CMV benchmark datasets\n","authors":["Enas Altarawneh","Ameeta Agrawal","Michael Jenkin","Manos Papagelis"],"pdf_url":"https://arxiv.org/pdf/2408.13440v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2306.12982;\n text overlap with arXiv:2106.01071 by other authors"},{"id":"http://arxiv.org/abs/2409.05227v1","updated":"2024-09-08T21:45:12Z","published":"2024-09-08T21:45:12Z","title":"BBS: Bi-directional Bit-level Sparsity for Deep Learning Acceleration","summary":" Bit-level sparsity methods skip ineffectual zero-bit operations and are\ntypically applicable within bit-serial deep learning accelerators. This type of\nsparsity at the bit-level is especially interesting because it is both\northogonal and compatible with other deep neural network (DNN) efficiency\nmethods such as quantization and pruning. In this work, we improve the\npracticality and efficiency of bit-level sparsity through a novel algorithmic\nbit-pruning, averaging, and compression method, and a co-designed efficient\nbit-serial hardware accelerator. On the algorithmic side, we introduce\nbidirectional bit sparsity (BBS). The key insight of BBS is that we can\nleverage bit sparsity in a symmetrical way to prune either zero-bits or\none-bits. This significantly improves the load balance of bit-serial computing\nand guarantees the level of sparsity to be more than 50%. On top of BBS, we\nfurther propose two bit-level binary pruning methods that require no\nretraining, and can be seamlessly applied to quantized DNNs. Combining binary\npruning with a new tensor encoding scheme, BBS can both skip computation and\nreduce the memory footprint associated with bi-directional sparse bit columns.\nOn the hardware side, we demonstrate the potential of BBS through BitVert, a\nbit-serial architecture with an efficient PE design to accelerate DNNs with low\noverhead, exploiting our proposed binary pruning. Evaluation on seven\nrepresentative DNN models shows that our approach achieves: (1) on average\n1.66$\\times$ reduction in model size with negligible accuracy loss of < 0.5%;\n(2) up to 3.03$\\times$ speedup and 2.44$\\times$ energy saving compared to prior\nDNN accelerators.\n","authors":["Yuzong Chen","Jian Meng","Jae-sun Seo","Mohamed S. Abdelfattah"],"pdf_url":"https://arxiv.org/pdf/2409.05227v1.pdf","comment":"Accepted by IEEE/ACM MICRO 2024"},{"id":"http://arxiv.org/abs/2409.05215v1","updated":"2024-09-08T20:08:09Z","published":"2024-09-08T20:08:09Z","title":"Synthetic Tabular Data Generation for Class Imbalance and Fairness: A\n Comparative Study","summary":" Due to their data-driven nature, Machine Learning (ML) models are susceptible\nto bias inherited from data, especially in classification problems where class\nand group imbalances are prevalent. Class imbalance (in the classification\ntarget) and group imbalance (in protected attributes like sex or race) can\nundermine both ML utility and fairness. Although class and group imbalances\ncommonly coincide in real-world tabular datasets, limited methods address this\nscenario. 
While most methods use oversampling techniques, like interpolation,\nto mitigate imbalances, recent advancements in synthetic tabular data\ngeneration offer promise but have not been adequately explored for this\npurpose. To this end, this paper conducts a comparative analysis to address\nclass and group imbalances using state-of-the-art models for synthetic tabular\ndata generation and various sampling strategies. Experimental results on four\ndatasets, demonstrate the effectiveness of generative models for bias\nmitigation, creating opportunities for further exploration in this direction.\n","authors":["Emmanouil Panagiotou","Arjun Roy","Eirini Ntoutsi"],"pdf_url":"https://arxiv.org/pdf/2409.05215v1.pdf","comment":"Accepted at the ECML PKDD 2024, 4th Workshop on Bias and Fairness in\n AI"},{"id":"http://arxiv.org/abs/2409.05211v1","updated":"2024-09-08T19:59:53Z","published":"2024-09-08T19:59:53Z","title":"ICML Topological Deep Learning Challenge 2024: Beyond the Graph Domain","summary":" This paper describes the 2nd edition of the ICML Topological Deep Learning\nChallenge that was hosted within the ICML 2024 ELLIS Workshop on\nGeometry-grounded Representation Learning and Generative Modeling (GRaM). The\nchallenge focused on the problem of representing data in different discrete\ntopological domains in order to bridge the gap between Topological Deep\nLearning (TDL) and other types of structured datasets (e.g. point clouds,\ngraphs). Specifically, participants were asked to design and implement\ntopological liftings, i.e. mappings between different data structures and\ntopological domains --like hypergraphs, or simplicial/cell/combinatorial\ncomplexes. The challenge received 52 submissions satisfying all the\nrequirements. This paper introduces the main scope of the challenge, and\nsummarizes the main results and findings.\n","authors":["Guillermo Bernárdez","Lev Telyatnikov","Marco Montagna","Federica Baccini","Mathilde Papillon","Miquel Ferriol-Galmés","Mustafa Hajij","Theodore Papamarkou","Maria Sofia Bucarelli","Olga Zaghen","Johan Mathe","Audun Myers","Scott Mahan","Hansen Lillemark","Sharvaree Vadgama","Erik Bekkers","Tim Doster","Tegan Emerson","Henry Kvinge","Katrina Agate","Nesreen K Ahmed","Pengfei Bai","Michael Banf","Claudio Battiloro","Maxim Beketov","Paul Bogdan","Martin Carrasco","Andrea Cavallo","Yun Young Choi","George Dasoulas","Matouš Elphick","Giordan Escalona","Dominik Filipiak","Halley Fritze","Thomas Gebhart","Manel Gil-Sorribes","Salvish Goomanee","Victor Guallar","Liliya Imasheva","Andrei Irimia","Hongwei Jin","Graham Johnson","Nikos Kanakaris","Boshko Koloski","Veljko Kovač","Manuel Lecha","Minho Lee","Pierrick Leroy","Theodore Long","German Magai","Alvaro Martinez","Marissa Masden","Sebastian Mežnar","Bertran Miquel-Oliver","Alexis Molina","Alexander Nikitin","Marco Nurisso","Matt Piekenbrock","Yu Qin","Patryk Rygiel","Alessandro Salatiello","Max Schattauer","Pavel Snopov","Julian Suk","Valentina Sánchez","Mauricio Tec","Francesco Vaccarino","Jonas Verhellen","Frederic Wantiez","Alexander Weers","Patrik Zajec","Blaž Škrlj","Nina Miolane"],"pdf_url":"https://arxiv.org/pdf/2409.05211v1.pdf","comment":"Proceedings of the Geometry-grounded Representation Learning and\n Generative Modeling Workshop (GRaM) at ICML 2024"},{"id":"http://arxiv.org/abs/2408.01527v2","updated":"2024-09-08T19:59:06Z","published":"2024-08-02T18:40:10Z","title":"Using LLMs to Establish Implicit User Sentiment of Software Desirability","summary":" This study explores the use of LLMs for 
providing quantitative zero-shot\nsentiment analysis of implicit software desirability, addressing a critical\nchallenge in product evaluation where traditional review scores, though\nconvenient, fail to capture the richness of qualitative user feedback.\nInnovations include establishing a method that 1) works with qualitative user\nexperience data without the need for explicit review scores, 2) focuses on\nimplicit user satisfaction, and 3) provides scaled numerical sentiment\nanalysis, offering a more nuanced understanding of user sentiment, instead of\nsimply classifying sentiment as positive, neutral, or negative.\n Data is collected using the Microsoft Product Desirability Toolkit (PDT), a\nwell-known qualitative user experience analysis tool. For initial exploration,\nthe PDT metric was given to users of two software systems. PDT data was fed\nthrough several LLMs (Claude Sonnet 3 and 3.5, GPT4, and GPT4o) and through a\nleading transfer learning technique, Twitter-Roberta-Base-Sentiment, and Vader,\na leading sentiment analysis tool. Each system was asked to evaluate the data\nin two ways, by looking at the sentiment expressed in the PDT word/explanation\npairs; and by looking at the sentiment expressed by the users in their grouped\nselection of five words and explanations, as a whole. Each LLM provided a\nsentiment score, its confidence (low, medium, high) in the score, and an\nexplanation of the score.\n All LLMs tested were able to statistically detect user sentiment from the\nusers' grouped data, whereas TRBS and Vader were not. The confidence and\nexplanation of confidence provided by the LLMs assisted in understanding user\nsentiment. This study adds deeper understanding of evaluating user experiences,\ntoward the goal of creating a universal tool that quantifies implicit\nsentiment.\n","authors":["Sherri Weitl-Harms","John D. Hastings","Jonah Lum"],"pdf_url":"https://arxiv.org/pdf/2408.01527v2.pdf","comment":"6 pages, 2 figures, 2 tables, updated to incorporate feedback"},{"id":"http://arxiv.org/abs/2409.05208v1","updated":"2024-09-08T19:52:00Z","published":"2024-09-08T19:52:00Z","title":"Influence-based Attributions can be Manipulated","summary":" Influence Functions are a standard tool for attributing predictions to\ntraining data in a principled manner and are widely used in applications such\nas data valuation and fairness. In this work, we present realistic incentives\nto manipulate influence-based attributions and investigate whether these\nattributions can be systematically tampered by an adversary. We show that this\nis indeed possible and provide efficient attacks with backward-friendly\nimplementations. Our work raises questions on the reliability of\ninfluence-based attributions under adversarial circumstances.\n","authors":["Chhavi Yadav","Ruihan Wu","Kamalika Chaudhuri"],"pdf_url":"https://arxiv.org/pdf/2409.05208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05207v1","updated":"2024-09-08T19:50:25Z","published":"2024-09-08T19:50:25Z","title":"Low Latency Transformer Inference on FPGAs for Physics Applications with\n hls4ml","summary":" This study presents an efficient implementation of transformer architectures\nin Field-Programmable Gate Arrays (FPGAs) using hls4ml. We demonstrate the\nstrategy for implementing the multi-head attention, softmax, and normalization\nlayer and evaluate three distinct models. Their deployment on VU13P FPGA chip\nachieved latency less than 2us, demonstrating the potential for real-time\napplications. 
HLS4ML compatibility with any TensorFlow-built transformer model\nfurther enhances the scalability and applicability of this work. Index Terms:\nFPGAs, machine learning, transformers, high energy physics, LIGO\n","authors":["Zhixing Jiang","Dennis Yin","Yihui Chen","Elham E Khoda","Scott Hauck","Shih-Chieh Hsu","Ekaterina Govorkova","Philip Harris","Vladimir Loncar","Eric A. Moreno"],"pdf_url":"https://arxiv.org/pdf/2409.05207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05206v1","updated":"2024-09-08T19:46:45Z","published":"2024-09-08T19:46:45Z","title":"SEF: A Method for Computing Prediction Intervals by Shifting the Error\n Function in Neural Networks","summary":" In today's era, Neural Networks (NN) are applied in various scientific fields\nsuch as robotics, medicine, engineering, etc. However, the predictions of\nneural networks themselves contain a degree of uncertainty that must always be\ntaken into account before any decision is made. This is why many researchers\nhave focused on developing different ways to quantify the uncertainty of neural\nnetwork predictions. Some of these methods are based on generating prediction\nintervals (PI) via neural networks for the requested target values. The SEF\n(Shifting the Error Function) method presented in this paper is a new method\nthat belongs to this category of methods. The proposed approach involves\ntraining a single neural network three times, thus generating an estimate along\nwith the corresponding upper and lower bounds for a given problem. A pivotal\naspect of the method is the calculation of a parameter from the initial\nnetwork's estimates, which is then integrated into the loss functions of the\nother two networks. This innovative process effectively produces PIs, resulting\nin a robust and efficient technique for uncertainty quantification. To evaluate\nthe effectiveness of our method, a comparison in terms of successful PI\ngeneration between the SEF, PI3NN and PIVEN methods was made using two\nsynthetic datasets.\n","authors":["E. V. Aretos","D. G. Sotiropoulos"],"pdf_url":"https://arxiv.org/pdf/2409.05206v1.pdf","comment":"The paper has been accepted at the 2024 International Conference on\n Computer and Applications (ICCA24), Cairo, Egypt, December 17-19, 2024.\n https://icca-conf.info/icca-2024"},{"id":"http://arxiv.org/abs/2409.05202v1","updated":"2024-09-08T19:32:22Z","published":"2024-09-08T19:32:22Z","title":"A Survey on Mixup Augmentations and Beyond","summary":" As Deep Neural Networks have achieved thrilling breakthroughs in the past\ndecade, data augmentations have garnered increasing attention as regularization\ntechniques when massive labeled data are unavailable. Among existing\naugmentations, Mixup and relevant data-mixing methods that convexly combine\nselected samples and the corresponding labels are widely adopted because they\nyield high performances by generating data-dependent virtual data while easily\nmigrating to various domains. This survey presents a comprehensive review of\nfoundational mixup methods and their applications. We first elaborate on the\ntraining pipeline with mixup augmentations as a unified framework containing\nmodules. A reformulated framework could contain various mixup methods and give\nintuitive operational procedures. Then, we systematically investigate the\napplications of mixup augmentations on vision downstream tasks, various data\nmodalities, and some analysis \\& theorems of mixup. 
Meanwhile, we conclude the\ncurrent status and limitations of mixup research and point out further work for\neffective and efficient mixup augmentations. This survey can provide\nresearchers with the current state of the art in mixup methods and provide some\ninsights and guidance roles in the mixup arena. An online project with this\nsurvey is available at \\url{https://github.com/Westlake-AI/Awesome-Mixup}.\n","authors":["Xin Jin","Hongyu Zhu","Siyuan Li","Zedong Wang","Zicheng Liu","Chang Yu","Huafeng Qin","Stan Z. Li"],"pdf_url":"https://arxiv.org/pdf/2409.05202v1.pdf","comment":"Preprint V1 with 27 pages main text. Online project at\n https://github.com/Westlake-AI/Awesome-Mixup"},{"id":"http://arxiv.org/abs/2409.05200v1","updated":"2024-09-08T19:24:38Z","published":"2024-09-08T19:24:38Z","title":"Lung-DETR: Deformable Detection Transformer for Sparse Lung Nodule\n Anomaly Detection","summary":" Accurate lung nodule detection for computed tomography (CT) scan imagery is\nchallenging in real-world settings due to the sparse occurrence of nodules and\nsimilarity to other anatomical structures. In a typical positive case, nodules\nmay appear in as few as 3% of CT slices, complicating detection. To address\nthis, we reframe the problem as an anomaly detection task, targeting rare\nnodule occurrences in a predominantly normal dataset. We introduce a novel\nsolution leveraging custom data preprocessing and Deformable Detection\nTransformer (Deformable- DETR). A 7.5mm Maximum Intensity Projection (MIP) is\nutilized to combine adjacent lung slices into single images, reducing the slice\ncount and decreasing nodule sparsity. This enhances spatial context, allowing\nfor better differentiation between nodules and other structures such as complex\nvascular structures and bronchioles. Deformable-DETR is employed to detect\nnodules, with a custom focal loss function to better handle the imbalanced\ndataset. Our model achieves state-of-the-art performance on the LUNA16 dataset\nwith an F1 score of 94.2% (95.2% recall, 93.3% precision) on a dataset sparsely\npopulated with lung nodules that is reflective of real-world clinical data.\n","authors":["Hooman Ramezani","Dionne Aleman","Daniel Létourneau"],"pdf_url":"https://arxiv.org/pdf/2409.05200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05192v1","updated":"2024-09-08T18:59:52Z","published":"2024-09-08T18:59:52Z","title":"Bellwether Trades: Characteristics of Trades influential in Predicting\n Future Price Movements in Markets","summary":" In this study, we leverage powerful non-linear machine learning methods to\nidentify the characteristics of trades that contain valuable information.\nFirst, we demonstrate the effectiveness of our optimized neural network\npredictor in accurately predicting future market movements. Then, we utilize\nthe information from this successful neural network predictor to pinpoint the\nindividual trades within each data point (trading window) that had the most\nimpact on the optimized neural network's prediction of future price movements.\nThis approach helps us uncover important insights about the heterogeneity in\ninformation content provided by trades of different sizes, venues, trading\ncontexts, and over time.\n","authors":["Tejas Ramdas","Martin T. 
Wells"],"pdf_url":"https://arxiv.org/pdf/2409.05192v1.pdf","comment":"49 Pages"},{"id":"http://arxiv.org/abs/2409.05191v1","updated":"2024-09-08T18:55:57Z","published":"2024-09-08T18:55:57Z","title":"Generalization of Geometric Graph Neural Networks","summary":" In this paper, we study the generalization capabilities of geometric graph\nneural networks (GNNs). We consider GNNs over a geometric graph constructed\nfrom a finite set of randomly sampled points over an embedded manifold with\ntopological information captured. We prove a generalization gap between the\noptimal empirical risk and the optimal statistical risk of this GNN, which\ndecreases with the number of sampled points from the manifold and increases\nwith the dimension of the underlying manifold. This generalization gap ensures\nthat the GNN trained on a graph on a set of sampled points can be utilized to\nprocess other unseen graphs constructed from the same underlying manifold. The\nmost important observation is that the generalization capability can be\nrealized with one large graph instead of being limited to the size of the graph\nas in previous results. The generalization gap is derived based on the\nnon-asymptotic convergence result of a GNN on the sampled graph to the\nunderlying manifold neural networks (MNNs). We verify this theoretical result\nwith experiments on both Arxiv dataset and Cora dataset.\n","authors":["Zhiyang Wang","Juan Cervino","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2409.05191v1.pdf","comment":"12 pages, 4 figures. arXiv admin note: text overlap with\n arXiv:2406.05225"},{"id":"http://arxiv.org/abs/2409.05188v1","updated":"2024-09-08T18:52:34Z","published":"2024-09-08T18:52:34Z","title":"Learning to Classify Quantum Phases of Matter with a Few Measurements","summary":" We study the identification of quantum phases of matter, at zero temperature,\nwhen only part of the phase diagram is known in advance. Following a supervised\nlearning approach, we show how to use our previous knowledge to construct an\nobservable capable of classifying the phase even in the unknown region. By\nusing a combination of classical and quantum techniques, such as tensor\nnetworks, kernel methods, generalization bounds, quantum algorithms, and shadow\nestimators, we show that, in some cases, the certification of new ground states\ncan be obtained with a polynomial number of measurements. An important\napplication of our findings is the classification of the phases of matter\nobtained in quantum simulators, e.g., cold atom experiments, capable of\nefficiently preparing ground states of complex many-particle systems and\napplying simple measurements, e.g., single qubit measurements, but unable to\nperform a universal set of gates.\n","authors":["Mehran Khosrojerdi","Jason L. Pereira","Alessandro Cuccoli","Leonardo Banchi"],"pdf_url":"https://arxiv.org/pdf/2409.05188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05181v1","updated":"2024-09-08T18:37:08Z","published":"2024-09-08T18:37:08Z","title":"Sliding-Window Thompson Sampling for Non-Stationary Settings","summary":" $\\textit{Restless Bandits}$ describe sequential decision-making problems in\nwhich the rewards evolve with time independently from the actions taken by the\npolicy-maker. It has been shown that classical Bandit algorithms fail when the\nunderlying environment is changing, making clear that in order to tackle more\nchallenging scenarios specifically crafted algorithms are needed. 
In this\npaper, extending and correcting the work by \cite{trovo2020sliding}, we analyze\ntwo Thompson-Sampling inspired algorithms, namely $\texttt{BETA-SWTS}$ and\n$\texttt{$\gamma$-SWGTS}$, introduced to face the additional complexity given\nby the non-stationary nature of the settings; in particular we derive a general\nformulation for the regret in $\textit{any}$ arbitrary restless environment for\nboth Bernoulli and Subgaussian rewards, and, through the introduction of new\nquantities, we delve in what contribution lays the deeper foundations of the\nerror made by the algorithms. Finally, we infer from the general formulation\nthe regret for two of the most common non-stationary settings: the\n$\textit{Abruptly Changing}$ and the $\textit{Smoothly Changing}$ environments.\n","authors":["Marco Fiandri","Alberto Maria Metelli","Francesco Trovò"],"pdf_url":"https://arxiv.org/pdf/2409.05181v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.11672v4","updated":"2024-09-08T18:14:11Z","published":"2024-05-19T20:39:46Z","title":"Review of Interpretable Machine Learning Models for Disease Prognosis","summary":" In response to the COVID-19 pandemic, the integration of interpretable\nmachine learning techniques has garnered significant attention, offering\ntransparent and understandable insights crucial for informed clinical decision\nmaking. This literature review delves into the applications of interpretable\nmachine learning in predicting the prognosis of respiratory diseases,\nparticularly focusing on COVID-19 and its implications for future research and\nclinical practice. We reviewed various machine learning models that are not\nonly capable of incorporating existing clinical domain knowledge but also have\nthe learning capability to explore new information from the data. These models\nand experiences not only aid in managing the current crisis but also hold\npromise for addressing future disease outbreaks. By harnessing interpretable\nmachine learning, healthcare systems can enhance their preparedness and\nresponse capabilities, thereby improving patient outcomes and mitigating the\nimpact of respiratory diseases in the years to come.\n","authors":["Jinzhi Shen","Ke Ma"],"pdf_url":"https://arxiv.org/pdf/2405.11672v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00817v3","updated":"2024-09-08T18:09:23Z","published":"2023-11-29T19:09:28Z","title":"TimelyGPT: Extrapolatable Transformer Pre-training for Long-term\n Time-Series Forecasting in Healthcare","summary":" Large-scale pre-trained models (PTMs) such as BERT and GPT have recently\nachieved great success in Natural Language Processing and Computer Vision\ndomains. However, the development of PTMs on healthcare time-series data is\nlagging behind. This underscores the limitations of the existing\ntransformer-based architectures, particularly their scalability to handle\nlarge-scale time series and ability to capture long-term temporal dependencies.\nIn this study, we present Timely Generative Pre-trained Transformer\n(TimelyGPT). TimelyGPT employs an extrapolatable position (xPos) embedding to\nencode trend and periodic patterns into time-series representations. It also\nintegrates recurrent attention and temporal convolution modules to effectively\ncapture global-local temporal dependencies. We evaluated TimelyGPT on two\nlarge-scale healthcare time series datasets corresponding to continuous\nbiosignals and irregularly-sampled time series, respectively. 
Our experiments\nshow that during pre-training, TimelyGPT excels in learning time-series\nrepresentations from continuously monitored biosignals and irregularly-sampled\ntime series data commonly observed in longitudinal electronic health records\n(EHRs). In forecasting continuous biosignals, TimelyGPT achieves accurate\nextrapolation up to 6,000 timesteps of body temperature during the sleep stage\ntransition, given a short look-up window (i.e., prompt) containing only 2,000\ntimesteps. For irregularly-sampled time series, TimelyGPT with a proposed\ntime-specific inference demonstrates high top recall scores in predicting\nfuture diagnoses using early diagnostic records, effectively handling irregular\nintervals between clinical records. Together, we envision TimelyGPT to be\nuseful in a broad spectrum of health domains, including long-term patient\nhealth state forecasting and patient risk trajectory prediction.\n","authors":["Ziyang Song","Qincheng Lu","Hao Xu","He Zhu","David L. Buckeridge","Yue Li"],"pdf_url":"https://arxiv.org/pdf/2312.00817v3.pdf","comment":"17 pages"}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.03336v2","updated":"2024-09-08T14:21:13Z","published":"2024-09-05T08:28:36Z","title":"Estimating Indoor Scene Depth Maps from Ultrasonic Echoes","summary":" Measuring 3D geometric structures of indoor scenes requires dedicated depth\nsensors, which are not always available. Echo-based depth estimation has\nrecently been studied as a promising alternative solution. All previous studies\nhave assumed the use of echoes in the audible range. However, one major problem\nis that audible echoes cannot be used in quiet spaces or other situations where\nproducing audible sounds is prohibited. In this paper, we consider echo-based\ndepth estimation using inaudible ultrasonic echoes. While ultrasonic waves\nprovide high measurement accuracy in theory, the actual depth estimation\naccuracy when ultrasonic echoes are used has remained unclear, due to its\ndisadvantage of being sensitive to noise and susceptible to attenuation. We\nfirst investigate the depth estimation accuracy when the frequency of the sound\nsource is restricted to the high-frequency band, and found that the accuracy\ndecreased when the frequency was limited to ultrasonic ranges. Based on this\nobservation, we propose a novel deep learning method to improve the accuracy of\nultrasonic echo-based depth estimation by using audible echoes as auxiliary\ndata only during training. Experimental results with a public dataset\ndemonstrate that our method improves the estimation accuracy.\n","authors":["Junpei Honma","Akisato Kimura","Go Irie"],"pdf_url":"https://arxiv.org/pdf/2409.03336v2.pdf","comment":"ICIP 2024"},{"id":"http://arxiv.org/abs/2404.13306v2","updated":"2024-09-08T12:07:52Z","published":"2024-04-20T07:28:55Z","title":"FakeBench: Probing Explainable Fake Image Detection via Large Multimodal\n Models","summary":" The ability to distinguish whether an image is generated by artificial\nintelligence (AI) is a crucial ingredient in human intelligence, usually\naccompanied by a complex and dialectical forensic and reasoning process.\nHowever, current fake image detection models and databases focus on binary\nclassification without understandable explanations for the general populace.\nThis weakens the credibility of authenticity judgment and may conceal potential\nmodel biases. 
Meanwhile, large multimodal models (LMMs) have exhibited immense\nvisual-text capabilities on various tasks, bringing the potential for\nexplainable fake image detection. Therefore, we pioneer the probe of LMMs for\nexplainable fake image detection by presenting a multimodal database\nencompassing textual authenticity descriptions, the FakeBench. For\nconstruction, we first introduce a fine-grained taxonomy of generative visual\nforgery concerning human perception, based on which we collect forgery\ndescriptions in human natural language with a human-in-the-loop strategy.\nFakeBench examines LMMs with four evaluation criteria: detection, reasoning,\ninterpretation and fine-grained forgery analysis, to obtain deeper insights\ninto image authenticity-relevant capabilities. Experiments on various LMMs\nconfirm their merits and demerits in different aspects of fake image detection\ntasks. This research presents a paradigm shift towards transparency for the\nfake image detection area and reveals the need for greater emphasis on forensic\nelements in visual-language research and AI risk control. FakeBench will be\navailable at https://github.com/Yixuan423/FakeBench.\n","authors":["Yixuan Li","Xuelin Liu","Xiaoyang Wang","Bu Sung Lee","Shiqi Wang","Anderson Rocha","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2404.13306v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04999v1","updated":"2024-09-08T07:08:58Z","published":"2024-09-08T07:08:58Z","title":"Visual Grounding with Multi-modal Conditional Adaptation","summary":" Visual grounding is the task of locating objects specified by natural\nlanguage expressions. Existing methods extend generic object detection\nframeworks to tackle this task. They typically extract visual and textual\nfeatures separately using independent visual and textual encoders, then fuse\nthese features in a multi-modal decoder for final prediction. However, visual\ngrounding presents unique challenges. It often involves locating objects with\ndifferent text descriptions within the same image. Existing methods struggle\nwith this task because the independent visual encoder produces identical visual\nfeatures for the same image, limiting detection performance. Some recent\napproaches propose various language-guided visual encoders to address this\nissue, but they mostly rely solely on textual information and require\nsophisticated designs. In this paper, we introduce Multi-modal Conditional\nAdaptation (MMCA), which enables the visual encoder to adaptively update\nweights, directing its focus towards text-relevant regions. Specifically, we\nfirst integrate information from different modalities to obtain multi-modal\nembeddings. Then we utilize a set of weighting coefficients, which are generated\nfrom the multimodal embeddings, to reorganize the weight update matrices and\napply them to the visual encoder of the visual grounding model. Extensive\nexperiments on four widely used datasets demonstrate that MMCA achieves\nsignificant improvements and state-of-the-art results. Ablation experiments\nfurther demonstrate the lightweight design and efficiency of our method. 
Our source\ncode is available at: https://github.com/Mr-Bigworth/MMCA.\n","authors":["Ruilin Yao","Shengwu Xiong","Yichen Zhao","Yi Rong"],"pdf_url":"https://arxiv.org/pdf/2409.04999v1.pdf","comment":"Accepted by ACM MM 2024 [Oral]"},{"id":"http://arxiv.org/abs/2404.11119v2","updated":"2024-09-08T04:25:32Z","published":"2024-04-17T07:07:41Z","title":"DREAM: A Dual Representation Learning Model for Multimodal\n Recommendation","summary":" Multimodal recommendation focuses primarily on effectively exploiting both\nbehavioral and multimodal information for the recommendation task. However,\nmost existing models suffer from the following issues when fusing information\nfrom two different domains: (1) Previous works do not pay attention to the\nsufficient utilization of modal information by only using direct concatenation,\naddition, or simple linear layers for modal information extraction. (2)\nPrevious works treat modal features as learnable embeddings, which causes the\nmodal embeddings to gradually deviate from the original modal features during\nlearning. We refer to this issue as Modal Information Forgetting. (3) Previous\napproaches fail to account for the significant differences in the distribution\nbetween behavior and modality, leading to the issue of representation\nmisalignment. To address these challenges, this paper proposes a novel Dual\nREpresentAtion learning model for Multimodal Recommendation called DREAM. For\nsufficient information extraction, we introduce separate dual lines, including\nBehavior Line and Modal Line, in which the Modal-specific Encoder is applied to\nempower modal representations. To address the issue of Modal Information\nForgetting, we introduce the Similarity Supervised Signal to constrain the\nmodal representations. Additionally, we design a Behavior-Modal Alignment\nmodule to fuse the dual representations through Intra-Alignment and\nInter-Alignment. Extensive experiments on three public datasets demonstrate\nthat the proposed DREAM method achieves state-of-the-art (SOTA) results. The\nsource code will be available upon acceptance.\n","authors":["Kangning Zhang","Yingjie Qin","Jiarui Jin","Yifan Liu","Ruilong Su","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2404.11119v2.pdf","comment":"10 pages, 11 figures"}]},"2024-09-07T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.05034v3","updated":"2024-09-07T23:45:10Z","published":"2023-07-11T06:18:07Z","title":"Synthetic Dataset for Evaluating Complex Compositional Knowledge for\n Natural Language Inference","summary":" We introduce a synthetic dataset called Sentences Involving Complex\nCompositional Knowledge (SICCK) and a novel analysis that investigates the\nperformance of Natural Language Inference (NLI) models to understand\ncompositionality in logic. We produce 1,304 sentence pairs by modifying 15\nexamples from the SICK dataset (Marelli et al., 2014). To this end, we modify\nthe original texts using a set of phrases - modifiers that correspond to\nuniversal quantifiers, existential quantifiers, negation, and other concept\nmodifiers in Natural Logic (NL) (MacCartney, 2009). We use these phrases to\nmodify the subject, verb, and object parts of the premise and hypothesis.\nLastly, we annotate these modified texts with the corresponding entailment\nlabels following NL rules. We conduct a preliminary verification of how well\nthe change in the structural and semantic composition is captured by neural NLI\nmodels, in both zero-shot and fine-tuned scenarios. 
We found that the\nperformance of NLI models under the zero-shot setting is poor, especially for\nmodified sentences with negation and existential quantifiers. After fine-tuning\nthis dataset, we observe that models continue to perform poorly over negation,\nexistential and universal modifiers.\n","authors":["Sushma Anand Akoju","Robert Vacareanu","Haris Riaz","Eduardo Blanco","Mihai Surdeanu"],"pdf_url":"https://arxiv.org/pdf/2307.05034v3.pdf","comment":"Accepted to Natural Language Reasoning and Structured Explanations\n (NLRSE) Workshop, ACL 2023. For dataset, please refer\n https://github.com/sushmaakoju/clulab-releases/blob/master/acl2023-nlrse-sicck/README.md\n and https://github.com/sushmaakoju/acl2023-nlrse-clulab-SICCK-dataset"},{"id":"http://arxiv.org/abs/2409.04934v1","updated":"2024-09-07T23:40:47Z","published":"2024-09-07T23:40:47Z","title":"Maximizing Relation Extraction Potential: A Data-Centric Study to Unveil\n Challenges and Opportunities","summary":" Relation extraction is a Natural Language Processing task aiming to extract\nrelationships from textual data. It is a critical step for information\nextraction. Due to its wide-scale applicability, research in relation\nextraction has rapidly scaled to using highly advanced neural networks. Despite\ntheir computational superiority, modern relation extractors fail to handle\ncomplicated extraction scenarios. However, a comprehensive performance analysis\nof the state-of-the-art relation extractors that compile these challenges has\nbeen missing from the literature, and this paper aims to bridge this gap. The\ngoal has been to investigate the possible data-centric characteristics that\nimpede neural relation extraction. Based on extensive experiments conducted\nusing 15 state-of-the-art relation extraction algorithms ranging from recurrent\narchitectures to large language models and seven large-scale datasets, this\nresearch suggests that modern relation extractors are not robust to complex\ndata and relation characteristics. It emphasizes pivotal issues, such as\ncontextual ambiguity, correlating relations, long-tail data, and fine-grained\nrelation distributions. In addition, it sets a marker for future directions to\nalleviate these issues, thereby proving to be a critical resource for novice\nand advanced researchers. Efficient handling of the challenges described can\nhave significant implications for the field of information extraction, which is\na critical part of popular systems such as search engines and chatbots. Data\nand relevant code can be found at https://github.com/anushkasw/MaxRE.\n","authors":["Anushka Swarup","Avanti Bhandarkar","Olivia P. Dizon-Paradis","Ronald Wilson","Damon L. Woodard"],"pdf_url":"https://arxiv.org/pdf/2409.04934v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2409.04927v1","updated":"2024-09-07T22:54:47Z","published":"2024-09-07T22:54:47Z","title":"Just ASR + LLM? A Study on Speech Large Language Models' Ability to\n Identify and Understand Speaker in Spoken Dialogue","summary":" In recent years, we have observed a rapid advancement in speech language\nmodels (SpeechLLMs), catching up with humans' listening and reasoning\nabilities. 
Remarkably, SpeechLLMs have demonstrated impressive spoken dialogue\nquestion-answering (SQA) performance in benchmarks like Gaokao, the English\nlistening test of the college entrance exam in China, which seemingly requires\nunderstanding both the spoken content and voice characteristics of speakers in\na conversation. However, after carefully examining Gaokao's questions, we find\nthe correct answers to many questions can be inferred from the conversation\ncontext alone without identifying the speaker asked about in the question. Our\nevaluation of state-of-the-art models Qwen-Audio and WavLLM in both Gaokao and\nour proposed \"What Do You Like?\" dataset shows a significantly higher accuracy\nin these context-based questions than in identity-critical questions, which can\nonly be answered correctly with correct speaker identification. Our results and\nanalysis suggest that when solving SQA, the current SpeechLLMs exhibit limited\nspeaker awareness from the audio and behave similarly to an LLM reasoning from\nthe conversation transcription without sound. We propose that our definitions\nand automated classification of context-based and identity-critical questions\ncould offer a more accurate evaluation framework of SpeechLLMs in SQA tasks.\n","authors":["Junkai Wu","Xulin Fan","Bo-Ru Lu","Xilin Jiang","Nima Mesgarani","Mark Hasegawa-Johnson","Mari Ostendorf"],"pdf_url":"https://arxiv.org/pdf/2409.04927v1.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2409.01835v2","updated":"2024-09-07T22:51:50Z","published":"2024-09-03T12:34:21Z","title":"Towards Generative Class Prompt Learning for Fine-grained Visual\n Recognition","summary":" Although foundational vision-language models (VLMs) have proven to be very\nsuccessful for various semantic discrimination tasks, they still struggle to\nperform faithfully for fine-grained categorization. Moreover, foundational\nmodels trained on one domain do not generalize well on a different domain\nwithout fine-tuning. We attribute these to the limitations of the VLM's\nsemantic representations and attempt to improve their fine-grained visual\nawareness using generative modeling. Specifically, we propose two novel\nmethods: Generative Class Prompt Learning (GCPL) and Contrastive Multi-class\nPrompt Learning (CoMPLe). Utilizing text-to-image diffusion models, GCPL\nsignificantly improves the visio-linguistic synergy in class embeddings by\nconditioning on few-shot exemplars with learnable class prompts. CoMPLe builds\non this foundation by introducing a contrastive learning component that\nencourages inter-class separation during the generative optimization process.\nOur empirical results demonstrate that such a generative class prompt learning\napproach substantially outperforms existing methods, offering a better\nalternative for few-shot image recognition challenges. The source code will be\nmade available at: https://github.com/soumitri2001/GCPL.\n","authors":["Soumitri Chattopadhyay","Sanket Biswas","Emanuele Vivoli","Josep Lladós"],"pdf_url":"https://arxiv.org/pdf/2409.01835v2.pdf","comment":"Accepted in BMVC 2024"},{"id":"http://arxiv.org/abs/2402.04854v6","updated":"2024-09-07T21:38:49Z","published":"2024-02-07T13:54:06Z","title":"Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey","summary":" Research surveys have always posed a challenge for beginner researchers who\nlack research training. 
These researchers struggle to understand the\ndirections within their research topic and to discover new research\nfindings within a short time. One way to provide intuitive assistance to\nbeginner researchers is by offering relevant knowledge graphs (KGs) and\nrecommending related academic papers. However, existing navigation knowledge\ngraphs primarily rely on keywords in the research field and often fail to\npresent the logical hierarchy among multiple related papers clearly. Moreover,\nmost recommendation systems for academic papers simply rely on high text\nsimilarity, which can leave researchers confused as to why a particular article\nis being recommended. They may fail to grasp important information about the\ninsight connection between \"Issue resolved\" and \"Issue finding\" that they hope\nto obtain. To address these issues, this study aims to support research insight\nsurveys for beginner researchers by establishing a hierarchical tree-structured\nknowledge graph that reflects the inheritance insight of research topics and\nthe relevance insight among the academic papers.\n","authors":["Jinghong Li","Huy Phan","Wen Gu","Koichi Ota","Shinobu Hasegawa"],"pdf_url":"https://arxiv.org/pdf/2402.04854v6.pdf","comment":"This paper has been accepted by 'The 18TH International Conference on\n INnovations in Intelligent SysTems and Applications (INISTA 2024)'"},{"id":"http://arxiv.org/abs/2408.08688v3","updated":"2024-09-07T15:39:49Z","published":"2024-08-16T12:01:55Z","title":"The Fellowship of the LLMs: Multi-Agent Workflows for Synthetic\n Preference Optimization Dataset Generation","summary":" This paper presents synthetic Preference Optimization (PO) datasets generated\nusing multi-agent workflows and evaluates the effectiveness and potential of\nthese workflows in the dataset generation process. PO dataset generation\nrequires two modules: (1) response evaluation, and (2) response generation. In\nthe response evaluation module, the responses from Large Language Models (LLMs)\nare evaluated and ranked - a task typically carried out by human annotators\nthat we automate using LLMs. We assess the response evaluation module in a 2-step\nprocess. In step 1, we assess LLMs as evaluators using three distinct\nprompting strategies. In step 2, we apply the winning prompting strategy to\ncompare the performance of LLM-as-a-Judge, LLMs-as-a-Jury, and LLM Debate. In\neach step, we measure inter-rater agreement using Cohen's Kappa between human\nannotators and LLMs. For the response generation module, we compare different\nconfigurations for the LLM Feedback Loop using the identified LLM evaluator\nconfiguration. We use the win rate (the fraction of times a generation\nframework is selected as the best by an LLM evaluator) to determine the best\nmulti-agent configuration for generation. After identifying the best\nconfigurations for both modules, we use models from the GPT, Gemma, and Llama\nfamilies to generate our PO datasets using the above pipeline. We generate two\ntypes of PO datasets, one to improve the generation capabilities of individual\nLLMs and the other to improve the multi-agent workflow. Our evaluation shows\nthat GPT-4o-as-a-Judge is more consistent across datasets when the candidate\nresponses do not include responses from the GPT family. 
Additionally, we find\nthat the LLM Feedback Loop, with Llama as the generator and Gemma as the\nreviewer, achieves a notable 71.8% and 73.8% win rate over single-agent Llama\nand Gemma, respectively.\n","authors":["Samee Arif","Sualeha Farid","Abdul Hameed Azeemi","Awais Athar","Agha Ali Raza"],"pdf_url":"https://arxiv.org/pdf/2408.08688v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04459v2","updated":"2024-09-07T15:33:38Z","published":"2024-07-05T12:09:40Z","title":"Generalists vs. Specialists: Evaluating Large Language Models for Urdu","summary":" In this paper, we compare general-purpose pretrained models, GPT-4-Turbo and\nLlama-3-8b-Instruct with special-purpose models fine-tuned on specific tasks,\nXLM-Roberta-large, mT5-large, and Llama-3-8b-Instruct. We focus on seven\nclassification and six generation tasks to evaluate the performance of these\nmodels on Urdu language. Urdu has 70 million native speakers, yet it remains\nunderrepresented in Natural Language Processing (NLP). Despite the frequent\nadvancements in Large Language Models (LLMs), their performance in low-resource\nlanguages, including Urdu, still needs to be explored. We also conduct a human\nevaluation for the generation tasks and compare the results with the\nevaluations performed by GPT-4-Turbo and Llama-3-8b-Instruct. We find that\nspecial-purpose models consistently outperform general-purpose models across\nvarious tasks. We also find that the evaluation done by GPT-4-Turbo for\ngeneration tasks aligns more closely with human evaluation compared to the\nevaluation by Llama-3-8b-Instruct. This paper contributes to the NLP community\nby providing insights into the effectiveness of general and specific-purpose\nLLMs for low-resource languages.\n","authors":["Samee Arif","Abdul Hameed Azeemi","Agha Ali Raza","Awais Athar"],"pdf_url":"https://arxiv.org/pdf/2407.04459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15915v2","updated":"2024-09-07T15:12:51Z","published":"2024-08-28T16:28:07Z","title":"Leveraging Open Knowledge for Advancing Task Expertise in Large Language\n Models","summary":" The cultivation of expertise for large language models (LLMs) to solve tasks\nof specific areas often requires special-purpose tuning with calibrated\nbehaviors on the expected stable outputs. To avoid huge cost brought by manual\npreparation of instruction datasets and training resources up to hundreds of\nhours, the exploitation of open knowledge including a wealth of low rank\nadaptation (LoRA) models and instruction datasets serves as a good starting\npoint. However, existing methods on model and data selection focus on the\nperformance of general-purpose capabilities while neglecting the knowledge gap\nexposed in domain-specific deployment. In the present study, we propose to\nbridge such gap by introducing few human-annotated samples (i.e., K-shot) for\nadvancing task expertise of LLMs with open knowledge. Specifically, we develop\nan efficient and scalable pipeline to cost-efficiently produce task experts\nwhere K-shot data intervene in selecting the most promising expert candidates\nand the task-relevant instructions. A mixture-of-expert (MoE) system is built\nto make the best use of individual-yet-complementary knowledge between multiple\nexperts. We unveil the two keys to the success of a MoE system, 1) the abidance\nby K-shot, and 2) the insistence on diversity. For the former, we ensure that\nmodels that truly possess problem-solving abilities on K-shot are selected\nrather than those blind guessers. 
Besides, during data selection, instructions\nthat share task-relevant contexts with K-shot are prioritized. For the latter,\nwe highlight the diversity of constituting experts and that of the fine-tuning\ninstructions throughout the model and data selection process. Extensive\nexperimental results confirm the superiority of our approach over existing\nmethods on utilization of open knowledge across various tasks. Our codes will\nbe available at https://github.com/Yaphabates/Rocket.\n","authors":["Yuncheng Yang","Yulei Qin","Tong Wu","Zihan Xu","Gang Li","Pengcheng Guo","Hang Shao","Yuchen Shi","Ke Li","Xing Sun","Jie Yang","Yun Gu"],"pdf_url":"https://arxiv.org/pdf/2408.15915v2.pdf","comment":"29 pages, 12 tables, 10 figures"},{"id":"http://arxiv.org/abs/2407.05361v3","updated":"2024-09-07T15:08:24Z","published":"2024-07-07T13:24:54Z","title":"Emilia: An Extensive, Multilingual, and Diverse Speech Dataset for\n Large-Scale Speech Generation","summary":" Recent advancements in speech generation models have been significantly\ndriven by the use of large-scale training data. However, producing highly\nspontaneous, human-like speech remains a challenge due to the scarcity of\nlarge, diverse, and spontaneous speech datasets. In response, we introduce\nEmilia, the first large-scale, multilingual, and diverse speech generation\ndataset. Emilia starts with over 101k hours of speech across six languages,\ncovering a wide range of speaking styles to enable more natural and spontaneous\nspeech generation. To facilitate the scale-up of Emilia, we also present\nEmilia-Pipe, the first open-source preprocessing pipeline designed to\nefficiently transform raw, in-the-wild speech data into high-quality training\ndata with speech annotations. Experimental results demonstrate the\neffectiveness of both Emilia and Emilia-Pipe. Demos are available at:\nhttps://emilia-dataset.github.io/Emilia-Demo-Page/.\n","authors":["Haorui He","Zengqiang Shang","Chaoren Wang","Xuyuan Li","Yicheng Gu","Hua Hua","Liwei Liu","Chen Yang","Jiaqi Li","Peiyang Shi","Yuancheng Wang","Kai Chen","Pengyuan Zhang","Zhizheng Wu"],"pdf_url":"https://arxiv.org/pdf/2407.05361v3.pdf","comment":"Accepted in SLT 2024. Dataset available:\n https://huggingface.co/datasets/amphion/Emilia-Dataset"},{"id":"http://arxiv.org/abs/2409.04833v1","updated":"2024-09-07T13:57:41Z","published":"2024-09-07T13:57:41Z","title":"Achieving Peak Performance for Large Language Models: A Systematic\n Review","summary":" In recent years, large language models (LLMs) have achieved remarkable\nsuccess in natural language processing (NLP). LLMs require an extreme amount of\nparameters to attain high performance. As models grow into the\ntrillion-parameter range, computational and memory costs increase\nsignificantly. This makes it difficult for many researchers to access the\nresources needed to train or apply these models. Optimizing LLM performance\ninvolves two main approaches: fine-tuning pre-trained models for specific tasks\nto achieve state-of-the-art performance, and reducing costs or improving\ntraining time while maintaining similar performance. This paper presents a\nsystematic literature review (SLR) following the Preferred Reporting Items for\nSystematic Reviews and Meta-Analyses (PRISMA) statement. We reviewed 65\npublications out of 983 from 2017 to December 2023, retrieved from 5 databases.\nThe study presents methods to optimize and accelerate LLMs while achieving\ncutting-edge results without sacrificing accuracy. 
We begin with an overview of\nthe development of language modeling, followed by a detailed explanation of\ncommonly used frameworks and libraries, and a taxonomy for improving and\nspeeding up LLMs based on three classes: LLM training, LLM inference, and\nsystem serving. We then delve into recent optimization and acceleration\nstrategies such as training optimization, hardware optimization, scalability\nand reliability, accompanied by the taxonomy and categorization of these\nstrategies. Finally, we provide an in-depth comparison of each class and\nstrategy, with two case studies on optimizing model training and enhancing\ninference efficiency. These case studies showcase practical approaches to\naddress LLM resource limitations while maintaining performance.\n","authors":["Zhyar Rzgar K Rostam","Sándor Szénási","Gábor Kertész"],"pdf_url":"https://arxiv.org/pdf/2409.04833v1.pdf","comment":"34 pages, 7 figures, 8 tables. Journal Article: IEEE Access"},{"id":"http://arxiv.org/abs/2409.04831v1","updated":"2024-09-07T13:51:42Z","published":"2024-09-07T13:51:42Z","title":"MILE: A Mutation Testing Framework of In-Context Learning Systems","summary":" In-context Learning (ICL) has achieved notable success in the applications of\nlarge language models (LLMs). By adding only a few input-output pairs that\ndemonstrate a new task, the LLM can efficiently learn the task during inference\nwithout modifying the model parameters. Such mysterious ability of LLMs has\nattracted great research interests in understanding, formatting, and improving\nthe in-context demonstrations, while still suffering from drawbacks like\nblack-box mechanisms and sensitivity against the selection of examples. In this\nwork, inspired by the foundations of adopting testing techniques in machine\nlearning (ML) systems, we propose a mutation testing framework designed to\ncharacterize the quality and effectiveness of test data for ICL systems. First,\nwe propose several mutation operators specialized for ICL demonstrations, as\nwell as corresponding mutation scores for ICL test sets. With comprehensive\nexperiments, we showcase the effectiveness of our framework in evaluating the\nreliability and quality of ICL test suites. Our code is available at\nhttps://github.com/weizeming/MILE.\n","authors":["Zeming Wei","Yihao Zhang","Meng Sun"],"pdf_url":"https://arxiv.org/pdf/2409.04831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04822v1","updated":"2024-09-07T13:28:01Z","published":"2024-09-07T13:28:01Z","title":"Exploring Straightforward Conversational Red-Teaming","summary":" Large language models (LLMs) are increasingly used in business dialogue\nsystems but they pose security and ethical risks. Multi-turn conversations,\nwhere context influences the model's behavior, can be exploited to produce\nundesired responses. In this paper, we examine the effectiveness of utilizing\noff-the-shelf LLMs in straightforward red-teaming approaches, where an attacker\nLLM aims to elicit undesired output from a target LLM, comparing both\nsingle-turn and conversational red-teaming tactics. Our experiments offer\ninsights into various usage strategies that significantly affect their\nperformance as red teamers. 
They suggest that off-the-shelf models can act as\neffective red teamers and even adjust their attack strategy based on past\nattempts, although their effectiveness decreases with greater alignment.\n","authors":["George Kour","Naama Zwerdling","Marcel Zalmanovici","Ateret Anaby-Tavor","Ora Nova Fandina","Eitan Farchi"],"pdf_url":"https://arxiv.org/pdf/2409.04822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03662v2","updated":"2024-09-07T12:47:54Z","published":"2024-09-05T16:15:12Z","title":"The representation landscape of few-shot learning and fine-tuning in\n large language models","summary":" In-context learning (ICL) and supervised fine-tuning (SFT) are two common\nstrategies for improving the performance of modern large language models (LLMs)\non specific tasks. Despite their different natures, these strategies often lead\nto comparable performance gains. However, little is known about whether they\ninduce similar representations inside LLMs. We approach this problem by\nanalyzing the probability landscape of their hidden representations in the two\ncases. More specifically, we compare how LLMs solve the same question-answering\ntask, finding that ICL and SFT create very different internal structures, in\nboth cases undergoing a sharp transition in the middle of the network. In the\nfirst half of the network, ICL shapes interpretable representations\nhierarchically organized according to their semantic content. In contrast, the\nprobability landscape obtained with SFT is fuzzier and semantically mixed. In\nthe second half of the model, the fine-tuned representations develop\nprobability modes that better encode the identity of answers, while the\nlandscape of ICL representations is characterized by less defined peaks. Our\napproach reveals the diverse computational strategies developed inside LLMs to\nsolve the same task across different conditions, allowing us to make a step\ntowards designing optimal methods to extract information from language models.\n","authors":["Diego Doimo","Alessandro Serra","Alessio Ansuini","Alberto Cazzaniga"],"pdf_url":"https://arxiv.org/pdf/2409.03662v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.03941v2","updated":"2024-09-07T11:40:47Z","published":"2024-07-04T13:54:24Z","title":"Narrow Transformer: StarCoder-Based Java-LM For Desktop","summary":" This paper presents NT-Java-1.1B, an open-source specialized code language\nmodel built on StarCoderBase-1.1B, designed for coding tasks in Java\nprogramming. NT-Java-1.1B achieves state-of-the-art performance, surpassing its\nbase model and majority of other models of similar size on MultiPL-E Java code\nbenchmark. While there have been studies on extending large, generic\npre-trained models to improve proficiency in specific programming languages\nlike Python, similar investigations on small code models for other programming\nlanguages are lacking. Large code models require specialized hardware like GPUs\nfor inference, highlighting the need for research into building small code\nmodels that can be deployed on developer desktops. This paper addresses this\nresearch gap by focusing on the development of a small Java code model,\nNT-Java-1.1B, and its quantized versions, which performs comparably to open\nmodels around 1.1B on MultiPL-E Java code benchmarks, making them ideal for\ndesktop deployment. 
This paper establishes the foundation for specialized\nmodels across languages and sizes for a family of NT Models.\n","authors":["Kamalkumar Rathinasamy","Balaji A J","Ankush Kumar","Gagan Gayari","Harshini K","Rajab Ali Mondal","Sreenivasa Raghavan K S","Swayam Singh","Mohammed Rafee Tarafdar"],"pdf_url":"https://arxiv.org/pdf/2407.03941v2.pdf","comment":"Updated Authors list"},{"id":"http://arxiv.org/abs/2409.04795v1","updated":"2024-09-07T11:22:35Z","published":"2024-09-07T11:22:35Z","title":"Phrase-Level Adversarial Training for Mitigating Bias in Neural\n Network-based Automatic Essay Scoring","summary":" Automatic Essay Scoring (AES) is widely used to evaluate candidates for\neducational purposes. However, due to the lack of representative data, most\nexisting AES systems are not robust, and their scoring predictions are biased\ntowards the most represented data samples. In this study, we propose a\nmodel-agnostic phrase-level method to generate an adversarial essay set to\naddress the biases and robustness of AES models. Specifically, we construct an\nattack test set comprising samples from the original test set and adversarially\ngenerated samples using our proposed method. To evaluate the effectiveness of\nthe attack strategy and data augmentation, we conducted a comprehensive\nanalysis utilizing various neural network scoring models. Experimental results\nshow that the proposed approach significantly improves AES model performance in\nthe presence of adversarial examples and scenarios without such attacks.\n","authors":["Haddad Philip","Tsegaye Misikir Tashu"],"pdf_url":"https://arxiv.org/pdf/2409.04795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09296v2","updated":"2024-09-07T10:27:35Z","published":"2024-04-14T16:34:31Z","title":"Cross-Data Knowledge Graph Construction for LLM-enabled Educational\n Question-Answering System: A Case Study at HCMUT","summary":" In today's rapidly evolving landscape of Artificial Intelligence, large\nlanguage models (LLMs) have emerged as a vibrant research topic. LLMs find\napplications in various fields and contribute significantly. Despite their\npowerful language capabilities, similar to pre-trained language models (PLMs),\nLLMs still face challenges in remembering events, incorporating new\ninformation, and addressing domain-specific issues or hallucinations. To\novercome these limitations, researchers have proposed Retrieval-Augmented\nGeneration (RAG) techniques, some others have proposed the integration of LLMs\nwith Knowledge Graphs (KGs) to provide factual context, thereby improving\nperformance and delivering more accurate feedback to user queries.\n Education plays a crucial role in human development and progress. With the\ntechnology transformation, traditional education is being replaced by digital\nor blended education. Therefore, educational data in the digital environment is\nincreasing day by day. Data in higher education institutions are diverse,\ncomprising various sources such as unstructured/structured text, relational\ndatabases, web/app-based API access, etc. Constructing a Knowledge Graph from\nthese cross-data sources is not a simple task. 
This article proposes a method\nfor automatically constructing a Knowledge Graph from multiple data sources and\ndiscusses some initial applications (experimental trials) of KG in conjunction\nwith LLMs for question-answering tasks.\n","authors":["Tuan Bui","Oanh Tran","Phuong Nguyen","Bao Ho","Long Nguyen","Thang Bui","Tho Quan"],"pdf_url":"https://arxiv.org/pdf/2404.09296v2.pdf","comment":"8 pages, 7 figures, Accepted at AIQAM '24: Proceedings of the 1st ACM\n Workshop on AI-Powered Q&A Systems for Multimedia"},{"id":"http://arxiv.org/abs/2409.04787v1","updated":"2024-09-07T10:21:03Z","published":"2024-09-07T10:21:03Z","title":"Selective Self-Rehearsal: A Fine-Tuning Approach to Improve\n Generalization in Large Language Models","summary":" Fine-tuning Large Language Models (LLMs) on specific datasets is a common\npractice to improve performance on target tasks. However, this performance gain\noften leads to overfitting, where the model becomes too specialized in either\nthe task or the characteristics of the training data, resulting in a loss of\ngeneralization. This paper introduces Selective Self-Rehearsal (SSR), a\nfine-tuning approach that achieves performance comparable to the standard\nsupervised fine-tuning (SFT) while improving generalization. SSR leverages the\nfact that there can be multiple valid responses to a query. By utilizing the\nmodel's correct responses, SSR reduces model specialization during the\nfine-tuning stage. SSR first identifies the correct model responses from the\ntraining set by deploying an appropriate LLM as a judge. Then, it fine-tunes\nthe model using the correct model responses and the gold response for the\nremaining samples. The effectiveness of SSR is demonstrated through experiments\non the task of identifying unanswerable queries across various datasets. The\nresults show that standard SFT can lead to an average performance drop of up to\n$16.7\\%$ on multiple benchmarks, such as MMLU and TruthfulQA. In contrast, SSR\nresults in close to $2\\%$ drop on average, indicating better generalization\ncapabilities compared to standard SFT.\n","authors":["Sonam Gupta","Yatin Nandwani","Asaf Yehudai","Mayank Mishra","Gaurav Pandey","Dinesh Raghu","Sachindra Joshi"],"pdf_url":"https://arxiv.org/pdf/2409.04787v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2404.15166v2","updated":"2024-09-07T10:19:15Z","published":"2024-04-22T17:36:33Z","title":"Pixels and Predictions: Potential of GPT-4V in Meteorological Imagery\n Analysis and Forecast Communication","summary":" Generative AI, such as OpenAI's GPT-4V large-language model, has rapidly\nentered mainstream discourse. Novel capabilities in image processing and\nnatural-language communication may augment existing forecasting methods. Large\nlanguage models further display potential to better communicate weather hazards\nin a style honed for diverse communities and different languages. This study\nevaluates GPT-4V's ability to interpret meteorological charts and communicate\nweather hazards appropriately to the user, despite challenges of\nhallucinations, where generative AI delivers coherent, confident, but incorrect\nresponses. We assess GPT-4V's competence via its web interface ChatGPT in two\ntasks: (1) generating a severe-weather outlook from weather-chart analysis and\nconducting self-evaluation, revealing an outlook that corresponds well with a\nStorm Prediction Center human-issued forecast; and (2) producing hazard\nsummaries in Spanish and English from weather charts. 
Responses in Spanish,\nhowever, resemble direct (not idiomatic) translations from English to Spanish,\nyielding poorly translated summaries that lose critical idiomatic precision\nrequired for optimal communication. Our findings advocate for cautious\nintegration of tools like GPT-4V in meteorology, underscoring the necessity of\nhuman oversight and development of trustworthy, explainable AI.\n","authors":["John R. Lawson","Joseph E. Trujillo-Falcón","David M. Schultz","Montgomery L. Flora","Kevin H. Goebbert","Seth N. Lyman","Corey K. Potvin","Adam J. Stepanek"],"pdf_url":"https://arxiv.org/pdf/2404.15166v2.pdf","comment":"Supplementary material PDF attached. Submitted to Artificial\n Intelligence for the Earth Systems (American Meteorological Society) on 18\n April 2024"},{"id":"http://arxiv.org/abs/2408.13745v2","updated":"2024-09-07T10:09:31Z","published":"2024-08-25T07:10:36Z","title":"DOCE: Finding the Sweet Spot for Execution-Based Code Generation","summary":" Recently, a diverse set of decoding and reranking procedures have been shown\neffective for LLM-based code generation. However, a comprehensive framework\nthat links and experimentally compares these methods is missing. We address\nthis by proposing Decoding Objectives for Code Execution, a comprehensive\nframework that includes candidate generation, $n$-best reranking, minimum Bayes\nrisk (MBR) decoding, and self-debugging as the core components. We then study\nthe contributions of these components through execution-based evaluation\nmetrics. Our findings highlight the importance of execution-based methods and\nthe difference gap between execution-based and execution-free methods.\nFurthermore, we assess the impact of filtering based on trial unit tests, a\nsimple and effective strategy that has been often overlooked in prior works. We\nalso propose self-debugging on multiple candidates, obtaining state-of-the-art\nperformance on reranking for code generation. We expect our framework to\nprovide a solid guideline for future research on code generation.\n","authors":["Haau-Sing Li","Patrick Fernandes","Iryna Gurevych","André F. T. Martins"],"pdf_url":"https://arxiv.org/pdf/2408.13745v2.pdf","comment":"10 pages (32 including appendix), 5 figures, 25 tables. To ensure\n reproducibility, we present some prompts used in our experiments from\n arXiv:2304.05128 that leads to text overlap"},{"id":"http://arxiv.org/abs/2409.04778v1","updated":"2024-09-07T09:38:36Z","published":"2024-09-07T09:38:36Z","title":"LoCa: Logit Calibration for Knowledge Distillation","summary":" Knowledge Distillation (KD), aiming to train a better student model by\nmimicking the teacher model, plays an important role in model compression. One\ntypical way is to align the output logits. However, we find a common issue\nnamed mis-instruction, that the student would be misled when the predictions\nbased on teacher logits do not follow the labels. Meanwhile, there is other\nuseful dark knowledge in the logits such as the class discriminability, which\nis vital for distillation. In this paper, we propose a simple yet effective\nLogit Calibration (LoCa) method, which calibrates the logits from the teacher\nmodel based on the ground-truth labels. The key insight is to correct the\nprediction (to address the mis-instruction issue) and maintain useful dark\nknowledge simultaneously. Our proposed LoCa does not require any additional\nparameters. 
Empirical results on image classification and text generation tasks\ndemonstrate that LoCa can effectively improve the performance of baselines.\n","authors":["Runming Yang","Taiqiang Wu","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2409.04778v1.pdf","comment":"Accepted by ECAI 2024"},{"id":"http://arxiv.org/abs/2409.04774v1","updated":"2024-09-07T09:28:55Z","published":"2024-09-07T09:28:55Z","title":"Untie the Knots: An Efficient Data Augmentation Strategy for\n Long-Context Pre-Training in Language Models","summary":" Large language models (LLMs) have prioritized expanding the context window\nso that models can incorporate more information. However, training models to\nhandle long contexts presents significant challenges. These include the\nscarcity of high-quality natural long-context data, the potential for\nperformance degradation on short-context tasks, and the reduced training\nefficiency associated with attention mechanisms. In this paper, we introduce\nUntie the Knots (\textbf{UtK}), a novel data augmentation strategy employed\nduring the continued pre-training phase, designed to efficiently enable LLMs to\ngain long-context capabilities without the need to modify the existing data\nmixture. In particular, we chunk the documents, shuffle the chunks, and create\na complex and knotted structure of long texts; LLMs are then trained to untie\nthese knots and identify relevant segments within seemingly chaotic token\nsequences. This approach greatly improves the model's performance by accurately\nattending to relevant information in long contexts, and the training efficiency\nis also largely increased. We conduct extensive experiments on models with 7B\nand 72B parameters, trained on 20 billion tokens, demonstrating that UtK\nachieves 75\% and 84.5\% accuracy on RULER at 128K context length,\nsignificantly outperforming other long-context strategies. The trained models\nwill be open-sourced for further research.\n","authors":["Junfeng Tian","Da Zheng","Yang Cheng","Rui Wang","Colin Zhang","Debing Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.04774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19097v2","updated":"2024-09-07T09:25:09Z","published":"2024-06-27T11:26:17Z","title":"Fairness and Bias in Multimodal AI: A Survey","summary":" The importance of addressing fairness and bias in artificial intelligence\n(AI) systems cannot be over-emphasized. Mainstream media has been awash with\nnews of incidents around stereotypes and other types of bias in many of these\nsystems in recent years. In this survey, we fill a gap with regard to the\nrelatively minimal study of fairness and bias in Large Multimodal Models (LMMs)\ncompared to Large Language Models (LLMs), providing 50 examples of datasets and\nmodels related to both types of AI along with the challenges of bias affecting\nthem. We discuss the less-mentioned category of mitigating bias, preprocessing\n(with particular attention to the first part of it, which we call preuse). The\nmethod is less mentioned compared to the two well-known ones in the literature:\nintrinsic and extrinsic mitigation methods. We critically discuss the various\nways researchers are addressing these challenges. 
Our method involved two\nslightly different search queries on two reputable search engines, Google\nScholar and Web of Science (WoS), which revealed that for the queries 'Fairness\nand bias in Large Multimodal Models' and 'Fairness and bias in Large Language\nModels', 33,400 and 538,000 links are the initial results, respectively, for\nScholar while 4 and 50 links are the initial results, respectively, for WoS.\nFor reproducibility and verification, we provide links to the search results\nand the citations to all the final reviewed papers. We believe this work\ncontributes to filling this gap and providing insight to researchers and other\nstakeholders on ways to address the challenges of fairness and bias in\nmultimodal and language AI.\n","authors":["Tosin Adewumi","Lama Alkhaled","Namrata Gurung","Goya van Boven","Irene Pagliai"],"pdf_url":"https://arxiv.org/pdf/2406.19097v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2404.07234v2","updated":"2024-09-07T07:28:00Z","published":"2024-04-06T06:17:10Z","title":"Goal-guided Generative Prompt Injection Attack on Large Language Models","summary":" Current large language models (LLMs) provide a strong foundation for\nlarge-scale user-oriented natural language tasks. A large number of users can\neasily inject adversarial text or instructions through the user interface, thus\ncausing LLMs model security challenges. Although there is currently a large\namount of research on prompt injection attacks, most of these black-box attacks\nuse heuristic strategies. It is unclear how these heuristic strategies relate\nto the success rate of attacks and thus effectively improve model robustness.\nTo solve this problem, we redefine the goal of the attack: to maximize the KL\ndivergence between the conditional probabilities of the clean text and the\nadversarial text. Furthermore, we prove that maximizing the KL divergence is\nequivalent to maximizing the Mahalanobis distance between the embedded\nrepresentation $x$ and $x'$ of the clean text and the adversarial text when the\nconditional probability is a Gaussian distribution and gives a quantitative\nrelationship on $x$ and $x'$. Then we designed a simple and effective\ngoal-guided generative prompt injection strategy (G2PIA) to find an injection\ntext that satisfies specific constraints to achieve the optimal attack effect\napproximately. It is particularly noteworthy that our attack method is a\nquery-free black-box attack method with low computational cost. Experimental\nresults on seven LLM models and four datasets show the effectiveness of our\nattack method.\n","authors":["Chong Zhang","Mingyu Jin","Qinkai Yu","Chengzhi Liu","Haochen Xue","Xiaobo Jin"],"pdf_url":"https://arxiv.org/pdf/2404.07234v2.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2404.01067v2","updated":"2024-09-07T06:03:06Z","published":"2024-04-01T12:01:06Z","title":"Exploring the Mystery of Influential Data for Mathematical Reasoning","summary":" Selecting influential data for fine-tuning on downstream tasks is a key\nfactor for both performance and computation efficiency. Recent works have shown\nthat training with only limited data can show a superior performance on general\ntasks. However, the feasibility on mathematical reasoning tasks has not been\nvalidated. To go further, there exist two open questions for mathematical\nreasoning: how to select influential data and what is an influential data\ncomposition. 
For the former one, we propose a Quality-aware Diverse Selection\n(QaDS) strategy adaptable for mathematical reasoning. A comparison with other\nselection strategies validates the superiority of QaDS. For the latter one, we\nfirst enlarge our setting and explore the influential data composition. We\nconduct a series of experiments and highlight: scaling up reasoning data, and\ntraining with general data selected by QaDS is helpful. Then, we define our\noptimal mixture as OpenMathMix, an influential data mixture with open-source\ndata selected by QaDS. With OpenMathMix, we achieve a state-of-the-art 48.8%\naccuracy on MATH with 7B base model. Additionally, we showcase the use of QaDS\nin creating efficient fine-tuning mixtures with various selection ratios, and\nanalyze the quality of a wide range of open-source datasets, which can perform\nas a reference for future works on mathematical reasoning tasks.\n","authors":["Xinzhe Ni","Yeyun Gong","Zhibin Gou","Yelong Shen","Yujiu Yang","Nan Duan","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01067v2.pdf","comment":"Accepted by COLM 2024"},{"id":"http://arxiv.org/abs/2409.04701v1","updated":"2024-09-07T03:54:46Z","published":"2024-09-07T03:54:46Z","title":"Late Chunking: Contextual Chunk Embeddings Using Long-Context Embedding\n Models","summary":" Many use cases require retrieving smaller portions of text, and dense\nvector-based retrieval systems often perform better with shorter text segments,\nas the semantics are less likely to be \"over-compressed\" in the embeddings.\nConsequently, practitioners often split text documents into smaller chunks and\nencode them separately. However, chunk embeddings created in this way can lose\ncontextual information from surrounding chunks, resulting in suboptimal\nrepresentations. In this paper, we introduce a novel method called \"late\nchunking,\" which leverages long context embedding models to first embed all\ntokens of the long text, with chunking applied after the transformer model and\njust before mean pooling. The resulting chunk embeddings capture the full\ncontextual information, leading to superior results across various retrieval\ntasks without the need for additional training. Moreover, our method is generic\nenough to be applied to any long-context embedding model.\n","authors":["Michael Günther","Isabelle Mohr","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.04701v1.pdf","comment":"4 pages, early draft"},{"id":"http://arxiv.org/abs/2405.14366v2","updated":"2024-09-07T02:52:29Z","published":"2024-05-23T09:43:52Z","title":"MiniCache: KV Cache Compression in Depth Dimension for Large Language\n Models","summary":" A critical approach for efficiently deploying computationally demanding large\nlanguage models (LLMs) is Key-Value (KV) caching. The KV cache stores key-value\nstates of previously generated tokens, significantly reducing the need for\nrepetitive computations and thereby lowering latency in autoregressive\ngeneration. However, the size of the KV cache grows linearly with sequence\nlength, posing challenges for applications requiring long context input and\nextensive sequence generation. In this paper, we present a simple yet effective\napproach, called MiniCache, to compress the KV cache across layers from a novel\ndepth perspective, significantly reducing the memory footprint for LLM\ninference. Our approach is based on the observation that KV cache states\nexhibit high similarity between the adjacent layers in the middle-to-deep\nportion of LLMs. 
To facilitate merging, we propose disentangling the states\ninto the magnitude and direction components, interpolating the directions of\nthe state vectors while preserving their lengths unchanged. Furthermore, we\nintroduce a token retention strategy to keep highly distinct state pairs\nunmerged, thus preserving the information with minimal additional storage\noverhead. Our MiniCache is training-free and general, complementing existing KV\ncache compression strategies, such as quantization and sparsity. We conduct a\ncomprehensive evaluation of MiniCache utilizing various models including\nLLaMA-2, LLaMA-3, Phi-3, Mistral, and Mixtral across multiple benchmarks,\ndemonstrating its exceptional performance in achieving superior compression\nratios and high throughput. On the ShareGPT dataset, LLaMA-2-7B with 4-bit\nMiniCache achieves a remarkable compression ratio of up to 5.02x, enhances\ninference throughput by approximately 5x, and reduces the memory footprint by\n41% compared to the FP16 full cache baseline, all while maintaining\nnear-lossless performance.\n","authors":["Akide Liu","Jing Liu","Zizheng Pan","Yefei He","Gholamreza Haffari","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2405.14366v2.pdf","comment":"Project is available at https://minicache.vmv.re"},{"id":"http://arxiv.org/abs/2311.08389v2","updated":"2024-09-07T02:49:37Z","published":"2023-11-14T18:50:51Z","title":"PSST: A Benchmark for Evaluation-driven Text Public-Speaking Style\n Transfer","summary":" Language style is necessary for AI systems to understand and generate diverse\nhuman language accurately. However, previous text style transfer primarily\nfocused on sentence-level data-driven approaches, limiting exploration of\npotential problems in large language models (LLMs) and the ability to meet\ncomplex application needs. To overcome these limitations, we introduce a novel\ntask called Public-Speaking Style Transfer (PSST), which aims to simulate\nhumans to transform passage-level, official texts into a public-speaking style.\nGrounded in the analysis of real-world data from a linguistic perspective, we\ndecompose public-speaking style into key sub-styles to pose challenges and\nquantify the style modeling capability of LLMs. For such intricate text style\ntransfer, we further propose a fine-grained evaluation framework to analyze the\ncharacteristics and identify the problems of stylized texts. Comprehensive\nexperiments suggest that current LLMs struggle to generate public speaking\ntexts that align with human preferences, primarily due to excessive stylization\nand loss of semantic information.\n","authors":["Huashan Sun","Yixiao Wu","Yuhao Ye","Yizhe Yang","Yinghao Li","Jiawei Li","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2311.08389v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04667v1","updated":"2024-09-07T00:46:58Z","published":"2024-09-07T00:46:58Z","title":"QueryBuilder: Human-in-the-Loop Query Development for Information\n Retrieval","summary":" Frequently, users of an Information Retrieval (IR) system start with an\noverarching information need (a.k.a., an analytic task) and proceed to define\nfiner-grained queries covering various important aspects (i.e., sub-topics) of\nthat analytic task. 
We present a novel, interactive system called\n$\\textit{QueryBuilder}$, which allows a novice, English-speaking user to create\nqueries with a small amount of effort, through efficient exploration of an\nEnglish development corpus in order to rapidly develop cross-lingual\ninformation retrieval queries corresponding to the user's information needs.\nQueryBuilder performs near real-time retrieval of documents based on\nuser-entered search terms; the user looks through the retrieved documents and\nmarks sentences as relevant to the information needed. The marked sentences are\nused by the system as additional information in query formation and refinement:\nquery terms (and, optionally, event features, which capture event $'triggers'$\n(indicator terms) and agent/patient roles) are appropriately weighted, and a\nneural-based system, which better captures textual meaning, retrieves other\nrelevant content. The process of retrieval and marking is repeated as many\ntimes as desired, giving rise to increasingly refined queries in each\niteration. The final product is a fine-grained query used in Cross-Lingual\nInformation Retrieval (CLIR). Our experiments using analytic tasks and requests\nfrom the IARPA BETTER IR datasets show that with a small amount of effort (at\nmost 10 minutes per sub-topic), novice users can form $\\textit{useful}$\nfine-grained queries including in languages they don't understand. QueryBuilder\nalso provides beneficial capabilities to the traditional corpus exploration and\nquery formation process. A demonstration video is released at\nhttps://vimeo.com/734795835\n","authors":["Hemanth Kandula","Damianos Karakos","Haoling Qiu","Benjamin Rozonoyer","Ian Soboroff","Lee Tarlin","Bonan Min"],"pdf_url":"https://arxiv.org/pdf/2409.04667v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2404.14774v2","updated":"2024-09-07T16:11:36Z","published":"2024-04-23T06:29:48Z","title":"CoST: Contrastive Quantization based Semantic Tokenization for\n Generative Recommendation","summary":" Embedding-based retrieval serves as a dominant approach to candidate item\nmatching for industrial recommender systems. With the success of generative AI,\ngenerative retrieval has recently emerged as a new retrieval paradigm for\nrecommendation, which casts item retrieval as a generation problem. Its model\nconsists of two stages: semantic tokenization and autoregressive generation.\nThe first stage involves item tokenization that constructs discrete semantic\ntokens to index items, while the second stage autoregressively generates\nsemantic tokens of candidate items. Therefore, semantic tokenization serves as\na crucial preliminary step for training generative recommendation models.\nExisting research usually employs a vector quantizier with reconstruction loss\n(e.g., RQ-VAE) to obtain semantic tokens of items, but this method fails to\ncapture the essential neighborhood relationships that are vital for effective\nitem modeling in recommender systems. In this paper, we propose a contrastive\nquantization-based semantic tokenization approach, named CoST, which harnesses\nboth item relationships and semantic information to learn semantic tokens. 
Our\nexperimental results highlight the significant impact of semantic tokenization\non generative recommendation performance, with CoST achieving up to a 43%\nimprovement in Recall@5 and 44% improvement in NDCG@5 on the MIND dataset over\nprevious baselines.\n","authors":["Jieming Zhu","Mengqun Jin","Qijiong Liu","Zexuan Qiu","Zhenhua Dong","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2404.14774v2.pdf","comment":"Accepted by RecSys'2024"},{"id":"http://arxiv.org/abs/2407.21300v3","updated":"2024-09-07T15:02:48Z","published":"2024-07-31T03:00:59Z","title":"Implementing Streaming algorithm and k-means clusters to RAG","summary":" Retrieval-augmented generation (RAG) has achieved significant success in\ninformation retrieval to assist large language models LLMs because it builds an\nexternal knowledge database. However, it also has many problems, it consumes a\nlot of memory because of the enormous database, and it cannot update the\nestablished index database in time when confronted with massive streaming data.\nTo reduce the memory required for building the database and maintain accuracy\nsimultaneously, we proposed a new approach integrating a streaming algorithm\nwith k-means clustering into RAG. Our approach applied a streaming algorithm to\nupdate the index dynamically and reduce memory consumption. Additionally, the\nk-means algorithm clusters highly similar documents, and the query time would\nbe shortened. We conducted comparative experiments on four methods, and the\nresults indicated that RAG with streaming algorithm and k-means clusters\noutperforms traditional RAG in accuracy and memory, particularly when dealing\nwith large-scale streaming data.\n","authors":["Haoyu Kang","Yuzhou Zhu","Yukun Zhong","Ke Wang"],"pdf_url":"https://arxiv.org/pdf/2407.21300v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02335v2","updated":"2024-09-07T14:29:11Z","published":"2024-02-04T04:13:31Z","title":"Video Editing for Video Retrieval","summary":" Though pre-training vision-language models have demonstrated significant\nbenefits in boosting video-text retrieval performance from large-scale web\nvideos, fine-tuning still plays a critical role with manually annotated clips\nwith start and end times, which requires considerable human effort. To address\nthis issue, we explore an alternative cheaper source of annotations, single\ntimestamps, for video-text retrieval. We initialise clips from timestamps in a\nheuristic way to warm up a retrieval model. Then a video clip editing method is\nproposed to refine the initial rough boundaries to improve retrieval\nperformance. A student-teacher network is introduced for video clip editing.\nThe teacher model is employed to edit the clips in the training set whereas the\nstudent model trains on the edited clips. The teacher weights are updated from\nthe student's after the student's performance increases. Our method is model\nagnostic and applicable to any retrieval models. 
We conduct experiments based\non three state-of-the-art retrieval models, COOT, VideoCLIP and CLIP4Clip.\nExperiments conducted on three video retrieval datasets, YouCook2, DiDeMo and\nActivityNet-Captions show that our edited clips consistently improve retrieval\nperformance over initial clips across all the three retrieval models.\n","authors":["Bin Zhu","Kevin Flanagan","Adriano Fragomeni","Michael Wray","Dima Damen"],"pdf_url":"https://arxiv.org/pdf/2402.02335v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04827v1","updated":"2024-09-07T13:41:37Z","published":"2024-09-07T13:41:37Z","title":"Incorporate LLMs with Influential Recommender System","summary":" Recommender systems have achieved increasing accuracy over the years.\nHowever, this precision often leads users to narrow their interests, resulting\nin issues such as limited diversity and the creation of echo chambers. Current\nresearch addresses these challenges through proactive recommender systems by\nrecommending a sequence of items (called influence path) to guide user interest\nin the target item. However, existing methods struggle to construct a coherent\ninfluence path that builds up with items the user is likely to enjoy. In this\npaper, we leverage the Large Language Model's (LLMs) exceptional ability for\npath planning and instruction following, introducing a novel approach named\nLLM-based Influence Path Planning (LLM-IPP). Our approach maintains coherence\nbetween consecutive recommendations and enhances user acceptability of the\nrecommended items. To evaluate LLM-IPP, we implement various user simulators\nand metrics to measure user acceptability and path coherence. Experimental\nresults demonstrate that LLM-IPP significantly outperforms traditional\nproactive recommender systems. This study pioneers the integration of LLMs into\nproactive recommender systems, offering a reliable and user-engaging\nmethodology for future recommendation technologies.\n","authors":["Mingze Wang","Shuxian Bi","Wenjie Wang","Chongming Gao","Yangyang Li","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2409.04827v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2409.04810v1","updated":"2024-09-07T12:42:58Z","published":"2024-09-07T12:42:58Z","title":"Debias Can be Unreliable: Mitigating Bias Issue in Evaluating Debiasing\n Recommendation","summary":" Recent work has improved recommendation models remarkably by equipping them\nwith debiasing methods. Due to the unavailability of fully-exposed datasets,\nmost existing approaches resort to randomly-exposed datasets as a proxy for\nevaluating debiased models, employing traditional evaluation scheme to\nrepresent the recommendation performance. However, in this study, we reveal\nthat traditional evaluation scheme is not suitable for randomly-exposed\ndatasets, leading to inconsistency between the Recall performance obtained\nusing randomly-exposed datasets and that obtained using fully-exposed datasets.\nSuch inconsistency indicates the potential unreliability of experiment\nconclusions on previous debiasing techniques and calls for unbiased Recall\nevaluation using randomly-exposed datasets. To bridge the gap, we propose the\nUnbiased Recall Evaluation (URE) scheme, which adjusts the utilization of\nrandomly-exposed datasets to unbiasedly estimate the true Recall performance on\nfully-exposed datasets. 
We provide theoretical evidence to demonstrate the\nrationality of URE and perform extensive experiments on real-world datasets to\nvalidate its soundness.\n","authors":["Chengbing Wang","Wentao Shi","Jizhi Zhang","Wenjie Wang","Hang Pan","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2409.04810v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.06809v2","updated":"2024-09-07T09:02:29Z","published":"2024-08-13T10:58:29Z","title":"Reformulating Conversational Recommender Systems as Tri-Phase Offline\n Policy Learning","summary":" Existing Conversational Recommender Systems (CRS) predominantly utilize user\nsimulators for training and evaluating recommendation policies. These\nsimulators often oversimplify the complexity of user interactions by focusing\nsolely on static item attributes, neglecting the rich, evolving preferences\nthat characterize real-world user behavior. This limitation frequently leads to\nmodels that perform well in simulated environments but falter in actual\ndeployment. Addressing these challenges, this paper introduces the Tri-Phase\nOffline Policy Learning-based Conversational Recommender System (TCRS), which\nsignificantly reduces dependency on real-time interactions and mitigates\noverfitting issues prevalent in traditional approaches. TCRS integrates a\nmodel-based offline learning strategy with a controllable user simulation that\ndynamically aligns with both personalized and evolving user preferences.\nThrough comprehensive experiments, TCRS demonstrates enhanced robustness,\nadaptability, and accuracy in recommendations, outperforming traditional CRS\nmodels in diverse user scenarios. This approach not only provides a more\nrealistic evaluation environment but also facilitates a deeper understanding of\nuser behavior dynamics, thereby refining the recommendation process.\n","authors":["Gangyi Zhang","Chongming Gao","Hang Pan","Runzhe Teng","Ruizhe Li"],"pdf_url":"https://arxiv.org/pdf/2408.06809v2.pdf","comment":"Accepted at CIKM 2024"},{"id":"http://arxiv.org/abs/2408.11345v2","updated":"2024-09-07T07:54:32Z","published":"2024-08-21T05:09:53Z","title":"Deep Tree-based Retrieval for Efficient Recommendation: Theory and\n Method","summary":" With the development of deep learning techniques, deep recommendation models\nalso achieve remarkable improvements in terms of recommendation accuracy.\nHowever, due to the large number of candidate items in practice and the high\ncost of preference computation, these methods also suffer from low efficiency\nof recommendation. The recently proposed tree-based deep recommendation models\nalleviate the problem by directly learning tree structure and representations\nunder the guidance of recommendation objectives. However, such models have\nshortcomings. The max-heap assumption in the hierarchical tree, in which the\npreference for a parent node should be the maximum between the preferences for\nits children, is difficult to satisfy in their binary classification\nobjectives. To this end, we propose Tree-based Deep Retrieval (TDR for short)\nfor efficient recommendation. In TDR, all the trees generated during the\ntraining process are retained to form the forest. When learning the node\nrepresentation of each tree, we have to satisfy the max-heap assumption as much\nas possible and mimic beam search behavior over the tree in the training stage.\nThis is achieved by TDR to regard the training task as multi-classification\nover tree nodes at the same level. 
However, the number of tree nodes grows\nexponentially with levels, making us train the preference model with the\nguidance of the sampled-softmax technique. The experiments are conducted on\nreal-world datasets, validating the effectiveness of the proposed preference\nmodel learning method and tree learning method.\n","authors":["Ze Liu","Jin Zhang","Chao Feng","Defu Lian","Jie Wang","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2408.11345v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04701v1","updated":"2024-09-07T03:54:46Z","published":"2024-09-07T03:54:46Z","title":"Late Chunking: Contextual Chunk Embeddings Using Long-Context Embedding\n Models","summary":" Many use cases require retrieving smaller portions of text, and dense\nvector-based retrieval systems often perform better with shorter text segments,\nas the semantics are less likely to be \"over-compressed\" in the embeddings.\nConsequently, practitioners often split text documents into smaller chunks and\nencode them separately. However, chunk embeddings created in this way can lose\ncontextual information from surrounding chunks, resulting in suboptimal\nrepresentations. In this paper, we introduce a novel method called \"late\nchunking,\" which leverages long context embedding models to first embed all\ntokens of the long text, with chunking applied after the transformer model and\njust before mean pooling. The resulting chunk embeddings capture the full\ncontextual information, leading to superior results across various retrieval\ntasks without the need for additional training. Moreover, our method is generic\nenough to be applied to any long-context embedding model.\n","authors":["Michael Günther","Isabelle Mohr","Bo Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.04701v1.pdf","comment":"4 pages, early draft"},{"id":"http://arxiv.org/abs/2312.09425v3","updated":"2024-09-07T01:01:44Z","published":"2023-11-21T23:35:44Z","title":"YouTube Videos for Public Health Literacy? A Machine Learning Pipeline\n to Curate Covid-19 Videos","summary":" The COVID-19 pandemic has highlighted the dire necessity to improve public\nhealth literacy for societal resilience. YouTube, the largest video-sharing\nsocial media platform, provides a vast repository of user-generated health\ninformation in a multi-media-rich format which may be easier for the public to\nunderstand and use if major concerns about content quality and accuracy are\naddressed. This study develops an automated solution to identify, retrieve and\nshortlist medically relevant and understandable YouTube videos that domain\nexperts can subsequently review and recommend for disseminating and educating\nthe public on the COVID-19 pandemic and similar public health outbreaks. 
Our\napproach leverages domain knowledge from human experts and machine learning and\nnatural language processing methods to provide a scalable, replicable, and\ngeneralizable approach that can also be applied to enhance the management of\nmany health conditions.\n","authors":["Yawen Guo","Xiao Liu","Anjana Susarla","Rema Padman"],"pdf_url":"https://arxiv.org/pdf/2312.09425v3.pdf","comment":"Studies in health technology and informatics(MedInfo) 2023"},{"id":"http://arxiv.org/abs/2409.04667v1","updated":"2024-09-07T00:46:58Z","published":"2024-09-07T00:46:58Z","title":"QueryBuilder: Human-in-the-Loop Query Development for Information\n Retrieval","summary":" Frequently, users of an Information Retrieval (IR) system start with an\noverarching information need (a.k.a., an analytic task) and proceed to define\nfiner-grained queries covering various important aspects (i.e., sub-topics) of\nthat analytic task. We present a novel, interactive system called\n$\\textit{QueryBuilder}$, which allows a novice, English-speaking user to create\nqueries with a small amount of effort, through efficient exploration of an\nEnglish development corpus in order to rapidly develop cross-lingual\ninformation retrieval queries corresponding to the user's information needs.\nQueryBuilder performs near real-time retrieval of documents based on\nuser-entered search terms; the user looks through the retrieved documents and\nmarks sentences as relevant to the information needed. The marked sentences are\nused by the system as additional information in query formation and refinement:\nquery terms (and, optionally, event features, which capture event $'triggers'$\n(indicator terms) and agent/patient roles) are appropriately weighted, and a\nneural-based system, which better captures textual meaning, retrieves other\nrelevant content. The process of retrieval and marking is repeated as many\ntimes as desired, giving rise to increasingly refined queries in each\niteration. The final product is a fine-grained query used in Cross-Lingual\nInformation Retrieval (CLIR). Our experiments using analytic tasks and requests\nfrom the IARPA BETTER IR datasets show that with a small amount of effort (at\nmost 10 minutes per sub-topic), novice users can form $\\textit{useful}$\nfine-grained queries including in languages they don't understand. QueryBuilder\nalso provides beneficial capabilities to the traditional corpus exploration and\nquery formation process. A demonstration video is released at\nhttps://vimeo.com/734795835\n","authors":["Hemanth Kandula","Damianos Karakos","Haoling Qiu","Benjamin Rozonoyer","Ian Soboroff","Lee Tarlin","Bonan Min"],"pdf_url":"https://arxiv.org/pdf/2409.04667v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2209.03275v3","updated":"2024-09-07T20:10:07Z","published":"2022-09-07T16:27:34Z","title":"Multimodal Speech Enhancement Using Burst Propagation","summary":" This paper proposes the MBURST, a novel multimodal solution for audio-visual\nspeech enhancements that consider the most recent neurological discoveries\nregarding pyramidal cells of the prefrontal cortex and other brain regions. 
The\nso-called burst propagation implements several criteria to address the credit\nassignment problem in a more biologically plausible manner: steering the sign\nand magnitude of plasticity through feedback, multiplexing the feedback and\nfeedforward information across layers through different weight connections,\napproximating feedback and feedforward connections, and linearizing the\nfeedback signals. MBURST benefits from such capabilities to learn correlations\nbetween the noisy signal and the visual stimuli, thus attributing meaning to\nthe speech by amplifying relevant information and suppressing noise.\nExperiments conducted over a Grid Corpus and CHiME3-based dataset show that\nMBURST can reproduce similar mask reconstructions to the multimodal\nbackpropagation-based baseline while demonstrating outstanding energy\nefficiency management, reducing the neuron firing rates to values up to\n\\textbf{$70\\%$} lower. Such a feature implies more sustainable implementations,\nsuitable and desirable for hearing aids or any other similar embedded systems.\n","authors":["Mohsin Raza","Leandro A. Passos","Ahmed Khubaib","Ahsan Adeel"],"pdf_url":"https://arxiv.org/pdf/2209.03275v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04828v1","updated":"2024-09-07T13:41:37Z","published":"2024-09-07T13:41:37Z","title":"POINTS: Improving Your Vision-language Model with Affordable Strategies","summary":" In recent years, vision-language models have made significant strides,\nexcelling in tasks like optical character recognition and geometric\nproblem-solving. However, several critical issues remain: 1) Proprietary models\noften lack transparency about their architectures, while open-source models\nneed more detailed ablations of their training strategies. 2) Pre-training data\nin open-source works is under-explored, with datasets added empirically, making\nthe process cumbersome. 3) Fine-tuning often focuses on adding datasets,\nleading to diminishing returns. To address these issues, we propose the\nfollowing contributions: 1) We trained a robust baseline model using the latest\nadvancements in vision-language models, introducing effective improvements and\nconducting comprehensive ablation and validation for each technique. 2)\nInspired by recent work on large language models, we filtered pre-training data\nusing perplexity, selecting the lowest perplexity data for training. This\napproach allowed us to train on a curated 1M dataset, achieving competitive\nperformance. 3) During visual instruction tuning, we used model soup on\ndifferent datasets when adding more datasets yielded marginal improvements.\nThese innovations resulted in a 9B parameter model that performs competitively\nwith state-of-the-art models. 
Our strategies are efficient and lightweight,\nmaking them easily adoptable by the community.\n","authors":["Yuan Liu","Zhongyin Zhao","Ziyuan Zhuang","Le Tian","Xiao Zhou","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2409.04828v1.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: 
var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + 
border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..3b8117df --- /dev/null +++ b/index.html @@ -0,0 +1,74143 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 53 + +
+
+
+ + ☆ MMEvol: Empowering Multimodal Large Language Models with Evol-Instruct + + +
+ The development of Multimodal Large Language Models (MLLMs) has seen +significant advancements. However, the quantity and quality of multimodal +instruction data have emerged as significant bottlenecks in their progress. +Manually creating multimodal instruction data is both time-consuming and +inefficient, posing challenges in producing instructions of high complexity. +Moreover, distilling instruction data from black-box commercial models (e.g., +GPT-4o, GPT-4V) often results in simplistic instruction data, which constrains +performance to that of these models. The challenge of curating diverse and +complex instruction data remains substantial. We propose MMEvol, a novel +multimodal instruction data evolution framework that combines fine-grained +perception evolution, cognitive reasoning evolution, and interaction evolution. +This iterative approach breaks through data quality bottlenecks to generate a +complex and diverse image-text instruction dataset, thereby empowering MLLMs +with enhanced capabilities. Beginning with an initial set of instructions, +SEED-163K, we utilize MMEvol to systematically broaden the diversity of +instruction types, integrate reasoning steps to enhance cognitive +capabilities, and extract detailed information from images to improve visual +understanding and robustness. To comprehensively evaluate the effectiveness of +our data, we train LLaVA-NeXT using the evolved data and conduct experiments +across 13 vision-language tasks. Compared to the baseline trained with seed +data, our approach achieves an average accuracy improvement of 3.1 points and +reaches state-of-the-art (SOTA) performance on 9 of these tasks.
+
+
+
+
+ + ☆ Improving Pretraining Data Using Perplexity Correlations + + +
+ Quality pretraining data is often seen as the key to high-performance +language models. However, progress in understanding pretraining data has been +slow due to the costly pretraining runs required for data selection +experiments. We present a framework that avoids these costs and selects +high-quality pretraining data without any LLM training of our own. Our work is +based on a simple observation: LLM losses on many pretraining texts are +correlated with downstream benchmark performance, and selecting +high-correlation documents is an effective pretraining data selection method. +We build a new statistical framework for data selection centered around +estimates of perplexity-benchmark correlations and perform data selection using +a sample of 90 LLMs taken from the Open LLM Leaderboard on texts from tens of +thousands of web domains. In controlled pretraining experiments at the 160M +parameter scale on 8 benchmarks, our approach outperforms DSIR on every +benchmark, while matching the best data selector found in DataComp-LM, a +hand-engineered bigram classifier. + +
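As a rough, hedged sketch of the selection rule described above (not the paper's actual statistical estimator), one can rank web domains by how strongly lower loss on them predicts higher benchmark accuracy across a pool of existing models; the array shapes, the Spearman correlation, and the cutoff of 200 domains are illustrative assumptions.

import numpy as np
from scipy.stats import spearmanr

# losses[i, j]: log-loss of public model i on text from web domain j (placeholder data)
# bench[i]: aggregate benchmark accuracy of model i (placeholder data)
rng = np.random.default_rng(0)
losses = rng.normal(size=(90, 1000))
bench = rng.normal(size=90)

# Rank-correlate "lower loss on domain j" with "higher benchmark score", per domain.
corr = np.array([spearmanr(-losses[:, j], bench)[0] for j in range(losses.shape[1])])
selected = np.argsort(corr)[::-1][:200]   # keep the most predictive domains for pretraining
print("top domains:", selected[:10])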
+
+
+
+
+ + ☆ Benchmarking Chinese Knowledge Rectification in Large Language Models + + +
+ While Large Language Models (LLMs) exhibit remarkable generative +capabilities, they are not without flaws, particularly in the form of +hallucinations. This issue is even more pronounced when LLMs are applied to +specific languages and domains. For example, LLMs may generate nonsense +information when handling ancient Chinese poetry, proverbs, or idioms, owing to +the lack of specific knowledge. To this end, this paper introduces a benchmark +for rectifying Chinese knowledge in LLMs via knowledge editing. Specifically, +we introduce a new Chinese dataset, CKnowEdit, by collecting seven types of +knowledge from various sources, including classical texts, idioms, and content +from Baidu Tieba Ruozhiba, thereby accounting for the unique polyphony, +antithesis, and logical constructs inherent in the Chinese language. Through +the analysis of this dataset, we uncover the challenges faced by current LLMs +in mastering Chinese. Furthermore, our evaluation of state-of-the-art knowledge +editing techniques on this dataset unveils the substantial scope for advancement +in the rectification of Chinese knowledge. Code and dataset are available at +https://github.com/zjunlp/EasyEdit.
+
+ comment: Ongoing work; code and dataset are available at + https://github.com/zjunlp/EasyEdit +
+
+
+
+
+ + ☆ PDAF: A Phonetic Debiasing Attention Framework For Speaker Verification + + +
+ Speaker verification systems are crucial for authenticating identity through +voice. Traditionally, these systems focus on comparing feature vectors, +overlooking the speech's content. However, this paper challenges this by +highlighting the importance of phonetic dominance, a measure of the frequency +or duration of phonemes, as a crucial cue in speaker verification. A novel +Phoneme Debiasing Attention Framework (PDAF) is introduced, integrating with +existing attention frameworks to mitigate biases caused by phonetic dominance. +PDAF adjusts the weighting for each phoneme and influences feature extraction, +allowing for a more nuanced analysis of speech. This approach paves the way for +more accurate and reliable identity authentication through voice. Furthermore, +by employing various weighting strategies, we evaluate the influence of +phonetic features on the efficacy of the speaker verification system. + +
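A minimal, hedged sketch of phoneme-dependent reweighting before pooling; how PDAF actually integrates with the attention framework is not reproduced here, and the inverse-dominance weighting and toy shapes are assumptions.

import numpy as np

def phoneme_weighted_pool(frames, phone_ids, phone_weights):
    # frames: (T, D) frame-level features; phone_ids: (T,) phoneme index per frame;
    # phone_weights: (P,) per-phoneme weights, e.g. to downweight dominant phonemes.
    w = phone_weights[phone_ids]
    w = w / (w.sum() + 1e-8)
    return (w[:, None] * frames).sum(axis=0)   # weighted mean embedding

T, D, P = 200, 64, 40                          # toy sizes
frames = np.random.randn(T, D)
phone_ids = np.random.randint(0, P, size=T)
weights = 1.0 / (np.bincount(phone_ids, minlength=P) + 1.0)  # rough inverse phonetic dominance
embedding = phoneme_weighted_pool(frames, phone_ids, weights)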
+
+ comment: Accepted to SLT +
+
+
+
+
+ + ☆ Evidence from fMRI Supports a Two-Phase Abstraction Process in Language + Models NeurIPS + + +
+ Research has repeatedly demonstrated that intermediate hidden states +extracted from large language models are able to predict measured brain +response to natural language stimuli. Yet, very little is known about the +representation properties that enable this high prediction performance. Why is +it the intermediate layers, and not the output layers, that are most capable +for this unique and highly general transfer task? In this work, we show that +evidence from language encoding models in fMRI supports the existence of a +two-phase abstraction process within LLMs. We use manifold learning methods to +show that this abstraction process naturally arises over the course of training +a language model and that the first "composition" phase of this abstraction +process is compressed into fewer layers as training continues. Finally, we +demonstrate a strong correspondence between layerwise encoding performance and +the intrinsic dimensionality of representations from LLMs. We give initial +evidence that this correspondence primarily derives from the inherent +compositionality of LLMs and not their next-word prediction properties. + +
+
+ comment: Equal contribution from both authors. Submitted to NeurIPS NeuroAI + workshop 2024 +
+
+
+
+
+ + ☆ Towards Democratizing Multilingual Large Language Models For Medicine + Through A Two-Stage Instruction Fine-tuning Approach + + +
+ Open-source, multilingual medical large language models (LLMs) have the +potential to serve linguistically diverse populations across different regions. +Adapting generic LLMs for healthcare often requires continual pretraining, but +this approach is computationally expensive and sometimes impractical. +Instruction fine-tuning on a specific task may not always guarantee optimal +performance due to the lack of broader domain knowledge that the model needs to +understand and reason effectively in diverse scenarios. To address these +challenges, we introduce two multilingual instruction fine-tuning datasets, +MMed-IFT and MMed-IFT-MC, containing over 200k high-quality medical samples in +six languages. We propose a two-stage training paradigm: the first stage +injects general medical knowledge using MMed-IFT, while the second stage +fine-tunes task-specific multiple-choice questions with MMed-IFT-MC. Our method +achieves competitive results on both English and multilingual benchmarks, +striking a balance between computational efficiency and performance. We plan to +make our dataset and model weights public at +\url{https://github.com/SpassMed/Med-Llama3} in the future. + +
+
+ comment: Technical Report v1, work in progress +
+
+
+
+
+ + ☆ Referring Expression Generation in Visually Grounded Dialogue with + Discourse-aware Comprehension Guiding + + +
+ We propose an approach to referring expression generation (REG) in visually +grounded dialogue that is meant to produce referring expressions (REs) that are +both discriminative and discourse-appropriate. Our method constitutes a +two-stage process. First, we model REG as a text- and image-conditioned +next-token prediction task. REs are autoregressively generated based on their +preceding linguistic context and a visual representation of the referent. +Second, we propose the use of discourse-aware comprehension guiding as part of +a generate-and-rerank strategy through which candidate REs generated with our +REG model are reranked based on their discourse-dependent discriminatory power. +Results from our human evaluation indicate that our proposed two-stage approach +is effective in producing discriminative REs, with higher performance in terms +of text-image retrieval accuracy for reranked REs compared to those generated +using greedy decoding. + +
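The generate-and-rerank step could look roughly like the sketch below; listener_prob stands in for a discourse-aware comprehension model and is an assumed interface, not the authors' component.

def rerank_referring_expressions(candidates, context, referent, distractors, listener_prob):
    # Prefer the candidate RE that a listener model resolves to the intended
    # referent with the largest margin over the distractor regions.
    def margin(re):
        p_ref = listener_prob(re, context, referent)
        p_distractor = max(listener_prob(re, context, d) for d in distractors)
        return p_ref - p_distractor
    return max(candidates, key=margin)

# usage: best_re = rerank_referring_expressions(generated_res, dialogue_history,
#                                               target_region, other_regions, listener_prob)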
+
+ comment: Accepted for publication at INLG 2024 +
+
+
+
+
+ + ☆ RegNLP in Action: Facilitating Compliance Through Automated Information + Retrieval and Answer Generation + + +
+ Regulatory documents, issued by governmental regulatory bodies, establish +rules, guidelines, and standards that organizations must adhere to for legal +compliance. These documents, characterized by their length, complexity and +frequent updates, are challenging to interpret, requiring significant +allocation of time and expertise on the part of organizations to ensure ongoing +compliance. Regulatory Natural Language Processing (RegNLP) is a +multidisciplinary subfield aimed at simplifying access to and interpretation of +regulatory rules and obligations. We define an Automated Question-Passage +Generation task for RegNLP, create the ObliQA dataset containing 27,869 +questions derived from the Abu Dhabi Global Markets (ADGM) financial regulation +document collection, design a baseline Regulatory Information Retrieval and +Answer Generation system, and evaluate it with RePASs, a novel evaluation +metric that tests whether generated answers accurately capture all relevant +obligations and avoid contradictions.
+
+
+
+
+ + ☆ Evaluation of real-time transcriptions using end-to-end ASR models + + +
+ Automatic Speech Recognition (ASR) or Speech-to-text (STT) has greatly +evolved in the last few years. Traditional architectures based on pipelines +have been replaced by joint end-to-end (E2E) architectures that simplify and +streamline the model training process. In addition, new AI training methods, +such as weakly supervised learning, have reduced the need for high-quality audio +datasets for model training. However, despite all these advancements, little to +no research has been done on real-time transcription. In real-time scenarios, +the audio is not pre-recorded, and the input audio must be fragmented to be +processed by the ASR systems. To achieve real-time requirements, these +fragments must be as short as possible to reduce latency. However, audio cannot +be split at any point, as dividing an utterance into two separate fragments will +generate an incorrect transcription. Also, shorter fragments provide less +context for the ASR model. For this reason, it is necessary to design and test +different splitting algorithms to optimize the quality and delay of the +resulting transcription. In this paper, three audio splitting algorithms are +evaluated with different ASR models to determine their impact on both the +quality of the transcription and the end-to-end delay. The algorithms are +fragmentation at fixed intervals, voice activity detection (VAD), and +fragmentation with feedback. The results are compared to the performance of the +same model, without audio fragmentation, to determine the effects of this +division. The results show that VAD fragmentation provides the best quality +with the highest delay, whereas fragmentation at fixed intervals provides the +lowest quality and the lowest delay. The newly proposed feedback algorithm +trades a 2-4% increase in WER for a 1.5-2 s reduction in delay relative to the +VAD splitting.
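As a simplified, hedged example of the fixed-interval strategy, one might split the incoming stream as below; the 5 s chunk length, the 0.5 s overlap, and the asr_model object in the usage note are illustrative assumptions, not the paper's settings.

def split_fixed(samples, sample_rate, chunk_s=5.0, overlap_s=0.5):
    # Yield chunk_s-second fragments with a small overlap so that a word cut at
    # a boundary reappears at the start of the next fragment.
    size = int(chunk_s * sample_rate)
    step = int((chunk_s - overlap_s) * sample_rate)
    for start in range(0, len(samples), step):
        yield samples[start:start + size]

# usage: for chunk in split_fixed(audio, 16000): text = asr_model.transcribe(chunk)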
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ Revisiting English Winogender Schemas for Consistency, Coverage, and + Grammatical Case + + +
+ While measuring bias and robustness in coreference resolution are important +goals, such measurements are only as good as the tools we use to measure them +with. Winogender schemas (Rudinger et al., 2018) are an influential dataset +proposed to evaluate gender bias in coreference resolution, but a closer look +at the data reveals issues with the instances that compromise their use for +reliable evaluation, including treating different grammatical cases of pronouns +in the same way, violations of template constraints, and typographical errors. +We identify these issues and fix them, contributing a new dataset: Winogender +2.0. Our changes affect performance with state-of-the-art supervised +coreference resolution systems as well as all model sizes of the language model +FLAN-T5, with F1 dropping on average 0.1 points. We also propose a new method +to evaluate pronominal bias in coreference resolution that goes beyond the +binary. With this method and our new dataset which is balanced for grammatical +case, we empirically demonstrate that bias characteristics vary not just across +pronoun sets, but also across surface forms of those sets. + +
+
+
+
+
+ + ☆ Longer is (Not Necessarily) Stronger: Punctuated Long-Sequence Training + for Enhanced Speech Recognition and Translation + + +
+ This paper presents a new method for training sequence-to-sequence models for +speech recognition and translation tasks. Instead of the traditional approach +of training models on short segments containing only lowercase or partial +punctuation and capitalization (PnC) sentences, we propose training on longer +utterances that include complete sentences with proper punctuation and +capitalization. We achieve this by using the FastConformer architecture, which +allows training 1-billion-parameter models with sequences up to 60 seconds long +with full attention. However, while training with PnC enhances the overall +performance, we observed that accuracy plateaus when training on sequences +longer than 40 seconds across various evaluation settings. Our proposed method +significantly improves punctuation and capitalization accuracy, showing a 25% +relative word error rate (WER) improvement on the Earnings-21 and Earnings-22 +benchmarks. Additionally, training on longer audio segments increases the +overall model accuracy across speech recognition and translation benchmarks. +The model weights and training code are open-sourced through NVIDIA NeMo.
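One simple way to form the longer punctuated training utterances described above is to concatenate consecutive segments up to a duration cap; the (duration, text) segment format is an assumption about the data layout, and 40 s mirrors the plateau reported in the abstract.

def build_long_samples(segments, max_dur_s=40.0):
    # segments: ordered list of (duration_seconds, punctuated_and_cased_text)
    samples, cur_text, cur_dur = [], [], 0.0
    for dur, text in segments:
        if cur_text and cur_dur + dur > max_dur_s:
            samples.append(" ".join(cur_text))
            cur_text, cur_dur = [], 0.0
        cur_text.append(text)
        cur_dur += dur
    if cur_text:
        samples.append(" ".join(cur_text))
    return samples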
+
+ comment: Accepted at SLT 2024 +
+
+
+
+
+ + ☆ ExDDI: Explaining Drug-Drug Interaction Predictions with Natural + Language + + +
+ Predicting unknown drug-drug interactions (DDIs) is crucial for improving +medication safety. Previous efforts in DDI prediction have typically focused on +binary classification or predicting DDI categories, lacking the +explanatory insights that could enhance trust in these predictions. In this +work, we propose to generate natural language explanations for DDI predictions, +enabling the model to reveal the underlying pharmacodynamic and +pharmacokinetic mechanisms while making the prediction. To do +this, we have collected DDI explanations from DDInter and DrugBank and +developed various models for extensive experiments and analysis. Our models can +provide accurate explanations for unknown DDIs between known drugs. This paper +contributes new tools to the field of DDI prediction and lays a solid +foundation for further research on generating explanations for DDI predictions.
+
+ comment: 17 pages, 4 figures +
+
+
+
+
+ + ☆ MemoRAG: Moving towards Next-Gen RAG Via Memory-Inspired Knowledge + Discovery + + +
+ Retrieval-Augmented Generation (RAG) leverages retrieval tools to access +external databases, thereby enhancing the generation quality of large language +models (LLMs) through optimized context. However, the existing retrieval +methods are inherently constrained, as they can only perform relevance matching +between explicitly stated queries and well-formed knowledge, and are unable to +handle tasks involving ambiguous information needs or unstructured knowledge. +Consequently, existing RAG systems are primarily effective for straightforward +question-answering tasks. In this work, we propose \textbf{MemoRAG}, a novel +retrieval-augmented generation paradigm empowered by long-term memory. MemoRAG +adopts a dual-system architecture. On the one hand, it employs a \textit{light +but long-range} LLM to form the global memory of the database. Once a task is +presented, it generates draft answers, cluing the retrieval tools to locate +useful information within the database. On the other hand, it leverages an +\textit{expensive but expressive} LLM, which generates the ultimate answer +based on the retrieved information. Building on this general framework, we +further optimize MemoRAG's performance by enhancing its cluing mechanism and +memorization capacity. In our experiments, MemoRAG achieves superior performance +across a variety of evaluation tasks, including both complex ones where +conventional RAG fails and straightforward ones where RAG is commonly applied.
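The dual-system flow reads roughly as in the sketch below; light_llm, heavy_llm and retrieve are placeholders for the memory model, the expressive generator and the retriever, and the prompt wording is an assumption rather than MemoRAG's actual prompts.

def memory_guided_answer(task, light_llm, heavy_llm, retrieve, top_k=8):
    # 1) The light, long-range model drafts clue answers from its global memory of the corpus.
    clues = light_llm(f"From your memory of the corpus, draft clues for the task: {task}")
    # 2) The clues steer the retriever towards useful evidence in the database.
    evidence = retrieve(clues, top_k=top_k)
    # 3) The expressive model composes the final answer from the retrieved evidence.
    context = "\n\n".join(evidence)
    return heavy_llm(f"Context:\n{context}\n\nTask: {task}\nAnswer:")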
+
+ comment: Codes and models are in https://github.com/qhjqhj00/MemoRAG +
+
+
+
+
+ + ☆ Spatially-Aware Speaker for Vision-and-Language Navigation Instruction + Generation + + +
+ Embodied AI aims to develop robots that can \textit{understand} and execute +human language instructions, as well as communicate in natural languages. On +this front, we study the task of generating highly detailed navigational +instructions for the embodied robots to follow. Although recent studies have +demonstrated significant leaps in the generation of step-by-step instructions +from sequences of images, the generated instructions lack variety in terms of +their referral to objects and landmarks. Existing speaker models learn +strategies to evade the evaluation metrics and obtain higher scores even for +low-quality sentences. In this work, we propose SAS (Spatially-Aware Speaker), +an instruction generator or \textit{Speaker} model that utilises both +structural and semantic knowledge of the environment to produce richer +instructions. For training, we employ a reward learning method in an +adversarial setting to avoid systematic bias introduced by language evaluation +metrics. Empirically, our method outperforms existing instruction generation +models, evaluated using standard metrics. Our code is available at +\url{https://github.com/gmuraleekrishna/SAS}. + +
+
+
+
+
+ + ☆ SciAgents: Automating scientific discovery through multi-agent + intelligent graph reasoning + + +
+ A key challenge in artificial intelligence is the creation of systems capable +of autonomously advancing scientific understanding by exploring novel domains, +identifying complex patterns, and uncovering previously unseen connections in +vast scientific data. In this work, we present SciAgents, an approach that +leverages three core concepts: (1) the use of large-scale ontological knowledge +graphs to organize and interconnect diverse scientific concepts, (2) a suite of +large language models (LLMs) and data retrieval tools, and (3) multi-agent +systems with in-situ learning capabilities. Applied to biologically inspired +materials, SciAgents reveals hidden interdisciplinary relationships that were +previously considered unrelated, achieving a scale, precision, and exploratory +power that surpasses traditional human-driven research methods. The framework +autonomously generates and refines research hypotheses, elucidating underlying +mechanisms, design principles, and unexpected material properties. By +integrating these capabilities in a modular fashion, the intelligent system +yields material discoveries, critiques and improves existing hypotheses, retrieves +up-to-date data about existing research, and highlights their strengths and +limitations. Our case studies demonstrate scalable capabilities to combine +generative AI, ontological representations, and multi-agent modeling, +harnessing a `swarm of intelligence' similar to biological systems. This +provides new avenues for materials discovery and accelerates the development of +advanced materials by unlocking Nature's design principles.
+
+
+
+
+ + ☆ QiBERT -- Classifying Online Conversations Messages with BERT as a + Feature + + +
+ Recent developments in online communication and their usage in everyday life +have caused an explosion in the amount of a new genre of text data, short text. +Thus, the need to classify this type of text based on its content has +significant implications in many areas. Online debates are no exception, since +they provide access to information about the opinions, positions and preferences +of their users. This paper aims to use data obtained from online social +conversations in Portuguese schools (short text) to observe behavioural trends +and to see if students remain engaged in the discussion when stimulated. This +project used state-of-the-art (SoA) Machine Learning (ML) algorithms and +methods, through BERT-based models, to classify whether utterances are on or off +the debate subject. Using SBERT embeddings as a feature, with supervised +learning, the proposed model achieved results above 0.95 average accuracy for +classifying online messages. Such improvements can help social scientists +better understand human communication, behaviour, discussion and persuasion.
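A hedged sketch of the described setup (sentence embeddings as features for a supervised on/off-topic classifier); the multilingual SBERT checkpoint and logistic-regression classifier are assumptions, not necessarily the authors' exact choices.

from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression

encoder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")  # covers Portuguese

def train_topic_classifier(messages, labels):
    # labels: 1 = on the debate subject, 0 = off the debate subject
    X = encoder.encode(messages)
    return LogisticRegression(max_iter=1000).fit(X, labels)

# usage: clf = train_topic_classifier(train_msgs, train_labels)
#        preds = clf.predict(encoder.encode(new_msgs))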
+
+
+
+
+ + ☆ Harmonic Reasoning in Large Language Models + + +
+ Large Language Models (LLMs) are becoming very popular and are used for many +different purposes, including creative tasks in the arts. However, these models +sometimes have trouble with specific reasoning tasks, especially those that +involve logical thinking and counting. This paper looks at how well LLMs +understand and reason when dealing with musical tasks like figuring out notes +from intervals and identifying chords and scales. We tested GPT-3.5 and GPT-4o +to see how they handle these tasks. Our results show that while LLMs do well +with note intervals, they struggle with more complicated tasks like recognizing +chords and scales. This points out clear limits in current LLM abilities and +shows where we need to make them better, which could help improve how they +think and work in both artistic and other complex areas. We also provide an +automatically generated benchmark data set for the described tasks. + +
+
+
+
+
+ + ☆ Elsevier Arena: Human Evaluation of Chemistry/Biology/Health + Foundational Large Language Models + + +
+ The quality and capabilities of large language models cannot currently be +fully assessed with automated benchmark evaluations. Instead, human +evaluations that expand on traditional qualitative techniques from the natural +language generation literature are required. One recent best practice consists +in using A/B-testing frameworks, which capture preferences of human evaluators +for specific models. In this paper we describe a human evaluation experiment +focused on the biomedical domain (health, biology, chemistry/pharmacology) +carried out at Elsevier. In it, a large but not massive (8.8B parameter) +decoder-only foundational transformer trained on a relatively small (135B +tokens) but highly curated collection of Elsevier datasets is compared to +OpenAI's GPT-3.5-turbo and Meta's foundational 7B parameter Llama 2 model +against multiple criteria. Results indicate -- even if IRR scores were +generally low -- a preference towards GPT-3.5-turbo, and hence towards models +that possess conversational abilities, are very large, and were trained on very +large datasets. At the same time, they indicate that, for less massive models, +training on smaller but well-curated training sets can potentially give rise to +viable alternatives in the biomedical domain.
+
+ comment: 11 pages, 5 tables, 6 figures +
+
+
+
+
+ + ☆ Representational Analysis of Binding in Large Language Models + + +
+ Entity tracking is essential for complex reasoning. To perform in-context +entity tracking, language models (LMs) must bind an entity to its attribute +(e.g., bind a container to its content) to recall the attribute for a given entity. +For example, given a context mentioning ``The coffee is in Box Z, the stone is +in Box M, the map is in Box H'', to infer ``Box Z contains the coffee'' later, +LMs must bind ``Box Z'' to ``coffee''. To explain the binding behaviour of LMs, +Feng and Steinhardt (2023) introduce a Binding ID mechanism and state that LMs +use an abstract concept called Binding ID (BI) to internally mark +entity-attribute pairs. However, they have not directly captured the BI +determinant information from entity activations. In this work, we provide a +novel view of the Binding ID mechanism by localizing the prototype of BI +information. Specifically, we discover that there exists a low-rank subspace in +the hidden state (or activation) of LMs that primarily encodes the order of +entity and attribute and is used as the prototype of BI to causally +determine the binding. To identify this subspace, we choose principal component +analysis as our first attempt, and it is empirically proven to be effective. +Moreover, we also discover that when editing representations along directions +in the subspace, LMs tend to bind a given entity to other attributes +accordingly. For example, by patching activations along the BI encoding +direction, we can make the LM infer ``Box Z contains the stone'' and ``Box Z +contains the map''.
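A toy, hedged version of the subspace-finding and patching experiments; the activations below are random placeholders for real entity-token hidden states, and taking the first principal component as the BI direction only loosely follows the "first attempt" mentioned above.

import numpy as np
from sklearn.decomposition import PCA

acts = np.random.randn(512, 4096)       # placeholder entity-token activations
pca = PCA(n_components=2).fit(acts)     # PCA as the first attempt at the subspace
bi_direction = pca.components_[0]       # candidate binding-order (BI) direction

def patch_along_direction(hidden, alpha):
    # Shift a hidden state along the BI direction, as in the patching experiments.
    return hidden + alpha * bi_direction

edited = patch_along_direction(acts[0], alpha=3.0)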
+
+
+
+
+ + ☆ STLM Engineering Report: Dropout + + +
+ In this work we explore the relevance of dropout for modern language models, +particularly in the context of models on the scale of <100M parameters. We +explore its relevance first in the regime of improving the sample efficiency +of models given small, high-quality datasets, and second in the regime of +improving the quality of their fit on larger datasets where models may underfit. +We find that, concordant with conventional wisdom, dropout remains effective in +the overfitting scenario, and that, furthermore, it may have some relevance for +improving the fit of models even in the case of excess data, as suggested by +previous research. In the process we find that the existing explanation for the +mechanism behind this performance gain is not applicable in the case of +language modelling.
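For orientation, this is where dropout typically enters a sub-100M-parameter transformer block in PyTorch; the rates and sizes are illustrative, not the report's configuration.

import torch
import torch.nn as nn

class TinyBlock(nn.Module):
    def __init__(self, d_model=512, n_heads=8, p_drop=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=p_drop, batch_first=True)
        self.mlp = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(),
                                 nn.Linear(4 * d_model, d_model), nn.Dropout(p_drop))
        self.ln1, self.ln2 = nn.LayerNorm(d_model), nn.LayerNorm(d_model)
        self.drop = nn.Dropout(p_drop)      # residual-path dropout

    def forward(self, x):
        h = self.ln1(x)
        a, _ = self.attn(h, h, h, need_weights=False)
        x = x + self.drop(a)
        return x + self.mlp(self.ln2(x))

y = TinyBlock()(torch.randn(2, 16, 512))    # (batch, seq, d_model)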
+
+ comment: 6 pages, 3 figures, For code base see + https://github.com/LeonGuertler/SuperTinyLanguageModels +
+
+
+
+
+ + ☆ NLLB-E5: A Scalable Multilingual Retrieval Model + + +
+ Despite significant progress in multilingual information retrieval, the lack +of models capable of effectively supporting multiple languages, particularly +low-resource like Indic languages, remains a critical challenge. This paper +presents NLLB-E5: A Scalable Multilingual Retrieval Model. NLLB-E5 leverages +the in-built multilingual capabilities in the NLLB encoder for translation +tasks. It proposes a distillation approach from multilingual retriever E5 to +provide a zero-shot retrieval approach handling multiple languages, including +all major Indic languages, without requiring multilingual training data. We +evaluate the model on a comprehensive suite of existing benchmarks, including +Hindi-BEIR, highlighting its robust performance across diverse languages and +tasks. Our findings uncover task and domain-specific challenges, providing +valuable insights into the retrieval performance, especially for low-resource +languages. NLLB-E5 addresses the urgent need for an inclusive, scalable, and +language-agnostic text retrieval model, advancing the field of multilingual +information access and promoting digital inclusivity for millions of users +globally. + +
+
+
+
+
+ + ☆ Towards Building a Robust Knowledge Intensive Question Answering Model + with Large Language Models NLPCC-2024 + + +
+ The development of LLMs has greatly enhanced the intelligence and fluency of +question answering, while the emergence of retrieval enhancement has enabled +models to better utilize external information. However, the presence of noise +and errors in retrieved information poses challenges to the robustness of LLMs. +In this work, to evaluate the model's performance under multiple interferences, +we first construct a dataset based on machine reading comprehension datasets, +simulating various scenarios including critical information absence, noise, +and conflicts. To address the issue of model accuracy decline caused by noisy +external information, we propose a data augmentation-based fine-tuning method +to enhance the LLM's robustness against noise. Additionally, a contrastive learning +approach is utilized to preserve the model's ability to discriminate external +information. We have conducted experiments on both existing LLMs and +our approach; the results, evaluated by GPT-4, indicate that our +proposed methods improve model robustness while strengthening the model's +discrimination capability.
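One way to realise the augmentation idea is sketched below: each training example mixes gold passages with distractors or drops a key passage, mirroring the noise and absence scenarios; the field names are assumptions about the dataset format, and the conflict scenario (which needs a source of contradictory passages) is omitted.

import random

def augment_example(question, gold_passages, distractor_pool, scenario):
    # Build a perturbed retrieval context for robustness fine-tuning.
    ctx = list(gold_passages)
    if scenario == "noise":
        ctx += random.sample(distractor_pool, k=min(3, len(distractor_pool)))
    elif scenario == "absence":
        ctx = ctx[1:]                        # drop a passage carrying key information
    random.shuffle(ctx)
    return {"question": question, "context": ctx}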
+
+ comment: This paper has been accepted by NLPCC-2024 +
+
+
+
+
+ + ☆ Application Specific Compression of Deep Learning Models KDD + + +
+ Large Deep Learning models are compressed and deployed for specific +applications. However, current Deep Learning model compression methods do not +utilize the information about the target application. As a result, the +compressed models are application agnostic. Our goal is to customize the model +compression process to create a compressed model that will perform better for +the target application. Our method, Application Specific Compression (ASC), +identifies and prunes components of the large Deep Learning model that are +redundant specifically for the given target application. The intuition of our +work is to prune the parts of the network that do not contribute significantly +to updating the data representation for the given application. We have +experimented with the BERT family of models for three applications: Extractive +QA, Natural Language Inference, and Paraphrase Identification. We observe that +customized compressed models created using ASC method perform better than +existing model compression methods and off-the-shelf compressed models. + +
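The stated intuition (prune what barely updates the representation for the target application) could be approximated as below; scoring layers by the average norm of their residual update is an illustrative proxy, not the exact ASC criterion.

import numpy as np

def low_contribution_layers(hidden_states, keep_ratio=0.75):
    # hidden_states: list of (N, D) arrays, outputs of layers 0..L on application data.
    # Returns indices of layers whose update ||h_l - h_{l-1}|| is smallest on average.
    deltas = [np.linalg.norm(hidden_states[l] - hidden_states[l - 1], axis=-1).mean()
              for l in range(1, len(hidden_states))]
    order = np.argsort(deltas)               # smallest update = least contribution
    n_prune = int(len(deltas) * (1 - keep_ratio))
    return [int(order[i]) + 1 for i in range(n_prune)]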
+
+ comment: Accepted in the Proceedings of the 8th Joint International Conference + on Data Science & Management of Data (12th ACM IKDD CODS and 30th COMAD) for + the Short Research Paper track, 5 pages +
+
+
+
+
+ + ☆ Diagnostic Reasoning in Natural Language: Computational Model and + Application + + +
+ Diagnostic reasoning is a key component of expert work in many domains. It is +a hard, time-consuming activity that requires expertise, and AI research has +investigated the ways automated systems can support this process. Yet, due to +the complexity of natural language, the applications of AI for diagnostic +reasoning to language-related tasks are lacking. To close this gap, we +investigate diagnostic abductive reasoning (DAR) in the context of +language-grounded tasks (NL-DAR). We propose a novel modeling framework for +NL-DAR based on Pearl's structural causal models and instantiate it in a +comprehensive study of scientific paper assessment in the biomedical domain. We +use the resulting dataset to investigate the human decision-making process in +NL-DAR and determine the potential of LLMs to support structured +decision-making over text. Our framework, open resources and tools lay the +groundwork for the empirical study of collaborative diagnostic reasoning in the +age of LLMs, in the scholarly domain and beyond. + +
+
+
+
+
+ + ☆ IndicVoices-R: Unlocking a Massive Multilingual Multi-speaker Speech + Corpus for Scaling Indian TTS + + +
+ Recent advancements in text-to-speech (TTS) synthesis show that large-scale +models trained with extensive web data produce highly natural-sounding output. +However, such data is scarce for Indian languages due to the lack of +high-quality, manually subtitled data on platforms like LibriVox or YouTube. To +address this gap, we enhance existing large-scale ASR datasets containing +natural conversations collected in low-quality environments to generate +high-quality TTS training data. Our pipeline leverages the cross-lingual +generalization of denoising and speech enhancement models trained on English +and applied to Indian languages. This results in IndicVoices-R (IV-R), the +largest multilingual Indian TTS dataset derived from an ASR dataset, with 1,704 +hours of high-quality speech from 10,496 speakers across 22 Indian languages. +IV-R matches the quality of gold-standard TTS datasets like LJSpeech, LibriTTS, +and IndicTTS. We also introduce the IV-R Benchmark, the first to assess +zero-shot, few-shot, and many-shot speaker generalization capabilities of TTS +models on Indian voices, ensuring diversity in age, gender, and style. We +demonstrate that fine-tuning an English pre-trained model on a combined dataset +of high-quality IndicTTS and our IV-R dataset results in better zero-shot +speaker generalization compared to fine-tuning on the IndicTTS dataset alone. +Further, our evaluation reveals limited zero-shot generalization for Indian +voices in TTS models trained on prior datasets, which we improve by fine-tuning +the model on our data containing diverse set of speakers across language +families. We open-source all data and code, releasing the first TTS model for +all 22 official Indian languages. + +
+
+
+
+
+ + ☆ Mpox Narrative on Instagram: A Labeled Multilingual Dataset of Instagram + Posts on Mpox for Sentiment, Hate Speech, and Anxiety Analysis + + +
+ The world is currently experiencing an outbreak of mpox, which has been +declared a Public Health Emergency of International Concern by WHO. No prior +work related to social media mining has focused on the development of a dataset +of Instagram posts about the mpox outbreak. The work presented in this paper +aims to address this research gap and makes two scientific contributions to +this field. First, it presents a multilingual dataset of 60,127 Instagram posts +about mpox, published between July 23, 2022, and September 5, 2024. The +dataset, available at https://dx.doi.org/10.21227/7fvc-y093, contains Instagram +posts about mpox in 52 languages. For each of these posts, the Post ID, Post +Description, Date of publication, language, and translated version of the post +(translation to English was performed using the Google Translate API) are +presented as separate attributes in the dataset. After developing this dataset, +sentiment analysis, hate speech detection, and anxiety or stress detection were +performed. This process included classifying each post into (i) one of the +sentiment classes, i.e., fear, surprise, joy, sadness, anger, disgust, or +neutral, (ii) hate or not hate, and (iii) anxiety/stress detected or no +anxiety/stress detected. These results are presented as separate attributes in +the dataset. Second, this paper presents the results of performing sentiment +analysis, hate speech analysis, and anxiety or stress analysis. The distribution +of the sentiment classes - fear, surprise, joy, sadness, anger, disgust, and +neutral - was observed to be 27.95%, 2.57%, 8.69%, 5.94%, 2.69%, 1.53%, and +50.64%, respectively. In terms of hate speech detection, 95.75% of the posts +did not contain hate and the remaining 4.25% of the posts contained hate. +Finally, 72.05% of the posts did not indicate any anxiety/stress, and the +remaining 27.95% of the posts represented some form of anxiety/stress.
+
+
+
+
+ + ☆ Seek and Solve Reasoning for Table Question Answering + + +
+ Table-based Question Answering (TQA) involves answering questions based on +tabular data. The complexity of table structures and question logic makes this +task difficult even for Large Language Models (LLMs). This paper improves TQA +performance by leveraging LLMs' reasoning capabilities. Inspired by how humans +solve TQA tasks, we propose a Seek-and-Solve pipeline that instructs the LLM to +first seek relevant information and then answer questions. The two stages are +integrated at the reasoning level, and their Chain of Thought (CoT) paths are +integrated into a coherent Seek-and-Solve CoT (SS-CoT). Furthermore, we present +a compact single-stage TQA-solving prompt distilled from the pipeline. +Experiments demonstrate that under In-Context Learning settings, using samples +with SS-CoT paths as demonstrations, the TQA-solving prompt can effectively +guide the LLM to solve complex TQA tasks, resulting in improved performance and +reliability. Our results highlight the importance of properly eliciting LLMs' +reasoning capabilities in solving complex TQA tasks. + +
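A compact, hedged two-stage prompt in the spirit of the pipeline; llm is a placeholder completion function and the prompt wording is illustrative rather than the paper's distilled prompt.

def seek_and_solve(table_text, question, llm):
    seek = llm(f"Table:\n{table_text}\n\nQuestion: {question}\n"
               f"Step 1 (seek): list the rows and columns needed to answer.")
    solve = llm(f"Table:\n{table_text}\n\nRelevant information:\n{seek}\n\n"
                f"Question: {question}\n"
                f"Step 2 (solve): reason over the relevant information and give the final answer.")
    return solve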
+
+
+
+
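A minimal sketch of the seek-then-solve idea described in the abstract above: one call gathers the relevant table cells, a second call answers conditioned on that reasoning. The `llm` and `serialize_table` helpers are hypothetical placeholders, and the prompt wording is invented for illustration rather than taken from the paper.

```python
# Hypothetical seek-then-solve prompting pipeline for table QA.
# `llm` is a placeholder for any chat-completion client, not the paper's API.

def llm(prompt: str) -> str:
    raise NotImplementedError("plug in an LLM client here")

def serialize_table(table: list[list[str]]) -> str:
    # Render the table as pipe-separated rows so the model reads it as text.
    return "\n".join(" | ".join(row) for row in table)

def seek_and_solve(table: list[list[str]], question: str) -> str:
    table_text = serialize_table(table)
    # Stage 1 ("seek"): locate the rows/columns relevant to the question.
    seek_cot = llm(
        f"Table:\n{table_text}\n\nQuestion: {question}\n"
        "First, identify the rows and columns needed to answer the question "
        "and explain why they are relevant."
    )
    # Stage 2 ("solve"): answer conditioned on the seek-stage reasoning so the
    # two chains of thought form one coherent trace.
    return llm(
        f"Table:\n{table_text}\n\nQuestion: {question}\n"
        f"Relevant information from the previous step:\n{seek_cot}\n"
        "Now reason step by step over this information and give the final answer."
    )
```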
+ + ☆ On the Relationship between Truth and Political Bias in Language Models + + +
+ Language model alignment research often attempts to ensure that models are
+not only helpful and harmless, but also truthful and unbiased. However,
+optimizing these objectives simultaneously can obscure how improving one aspect
+might impact the others. In this work, we focus on analyzing the relationship
+between two concepts essential in both language model alignment and political
+science: truthfulness and political bias. We train reward models on various
+popular truthfulness datasets and subsequently evaluate their political bias.
+Our findings reveal that optimizing reward models for truthfulness on these
+datasets tends to result in a left-leaning political bias. We also find that
+existing open-source reward models (i.e. those trained on standard human
+preference datasets) already show a similar bias and that the bias is larger
+for larger models. These results raise important questions about both the
+datasets used to represent truthfulness and what language models capture about
+the relationship between truth and politics.
+
+
+
+
+
+ + ☆ RexUniNLU: Recursive Method with Explicit Schema Instructor for + Universal NLU + + +
+ Information Extraction (IE) and Text Classification (CLS) serve as the
+fundamental pillars of NLU, with both disciplines relying on analyzing input
+sequences to categorize outputs into pre-established schemas. However, there is
+no existing encoder-based model that can unify IE and CLS tasks from this
+perspective. To fully explore the foundation shared within NLU tasks, we have
+proposed a Recursive Method with Explicit Schema Instructor for Universal NLU.
+Specifically, we first redefine true universal information extraction (UIE)
+with a formal formulation that covers almost all extraction schemas, including
+quadruples and quintuples which remain unsolved for previous UIE models. Then,
+we expand the formulation to all CLS and multi-modal NLU tasks. Based on that,
+we introduce RexUniNLU, a universal NLU solution that employs explicit schema
+constraints for IE and CLS, which encompasses all IE and CLS tasks and prevents
+incorrect connections between the schema and the input sequence. To avoid
+interference between different schemas, we reset the position ids and attention
+mask matrices. Extensive experiments are conducted on IE, CLS in both English
+and Chinese, and multi-modality, revealing the effectiveness and superiority of
+our approach. Our codes are publicly released.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2304.14770 +
+
+
+
+
+ + ☆ UPCS: Unbiased Persona Construction for Dialogue Generation + + +
+ Narrative systems, such as dialogue and storytelling systems, often utilize +persona profiles to enhance personalized interactions. Existing persona +profiles frequently exhibit biases, posing risks to system integrity and +fairness. To address this, we introduce the UPCS framework, which categorizes +character descriptions into eight dimensions, including bias mitigation +strategies. Experimental results demonstrate UPCS's superiority in accuracy, +diversity, bias elimination, and user satisfaction, marking a significant +advancement in persona construction for reliable narrative systems. + +
+
+
+
+
+ + ♻ ☆ Using Natural Language Explanations to Rescale Human Judgments + + +
+ The rise of large language models (LLMs) has brought a critical need for +high-quality human-labeled data, particularly for processes like human feedback +and evaluation. A common practice is to label data via consensus annotation +over human judgments. However, annotators' judgments for subjective tasks can +differ in many ways: they may reflect different qualitative judgments about an +example, and they may be mapped to a labeling scheme in different ways. We show +that these nuances can be captured by natural language explanations, and +propose a method to rescale ordinal annotations and explanations using LLMs. +Specifically, we feed annotators' Likert ratings and corresponding explanations +into an LLM and prompt it to produce a numeric score anchored in a scoring +rubric. These scores should reflect the annotators' underlying assessments of +the example. The rubric can be designed or modified after annotation, and +include distinctions that may not have been known when the original error +taxonomy was devised. We explore our technique in the context of rating system +outputs for a document-grounded question answering task, where LLMs achieve +near-human performance. Our method rescales the raw judgments without impacting +agreement and brings the scores closer to human judgments grounded in the same +scoring rubric. + +
+
+ comment: Data available at + https://github.com/ManyaWadhwa/explanation_based_rescaling +
+
+
+
+
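The rescaling step described above can be pictured as a single prompt that combines the Likert rating, the annotator's explanation, and a scoring rubric. The sketch below assumes a generic `llm` completion function and an invented rubric; neither is the authors' actual prompt or rubric.

```python
# Illustrative rescaling of an ordinal judgment with an LLM and a rubric.
# `llm` is a placeholder client and RUBRIC is an invented example rubric.
import re

def llm(prompt: str) -> str:
    raise NotImplementedError("plug in an LLM client here")

RUBRIC = """Score the answer from 0 to 100:
90-100: fully correct and complete
60-89 : correct but missing minor details
30-59 : partially correct or partially unsupported
0-29  : mostly or entirely incorrect"""

def rescale(likert_rating: int, explanation: str) -> int:
    prompt = (
        f"An annotator rated a system answer {likert_rating}/5 and explained:\n"
        f'"{explanation}"\n\n'
        f"Using this rubric:\n{RUBRIC}\n\n"
        "Return one integer between 0 and 100 that reflects the annotator's "
        "underlying assessment."
    )
    reply = llm(prompt)
    match = re.search(r"\d+", reply)
    return int(match.group()) if match else -1
```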
+ + ♻ ☆ Bi-Directional Transformers vs. word2vec: Discovering Vulnerabilities in + Lifted Compiled Code + + +
+ Detecting vulnerabilities within compiled binaries is challenging due to lost +high-level code structures and other factors such as architectural +dependencies, compilers, and optimization options. To address these obstacles, +this research explores vulnerability detection using natural language +processing (NLP) embedding techniques with word2vec, BERT, and RoBERTa to learn +semantics from intermediate representation (LLVM IR) code. Long short-term +memory (LSTM) neural networks were trained on embeddings from encoders created +using approximately 48k LLVM functions from the Juliet dataset. This study is +pioneering in its comparison of word2vec models with multiple bidirectional +transformer (BERT, RoBERTa) embeddings built using LLVM code to train neural +networks to detect vulnerabilities in compiled binaries. word2vec Skip-Gram +models achieved 92% validation accuracy in detecting vulnerabilities, +outperforming word2vec Continuous Bag of Words (CBOW), BERT, and RoBERTa. This +suggests that complex contextual embeddings may not provide advantages over +simpler word2vec models for this task when a limited number (e.g. 48K) of data +samples are used to train the bidirectional transformer-based models. The +comparative results provide novel insights into selecting optimal embeddings +for learning compiler-independent semantic code representations to advance +machine learning detection of vulnerabilities in compiled binaries. + +
+
+ comment: Updated with improvements
+
+
+
+
+ + ♻ ☆ Instruct-SkillMix: A Powerful Pipeline for LLM Instruction Tuning + + +
+ We introduce Instruct-SkillMix, an automated approach for creating diverse, +high quality SFT data. The Instruct-SkillMix pipeline involves two stages, each +leveraging an existing powerful LLM: (1) Skill extraction: uses the LLM to +extract core "skills" for instruction-following, either from existing datasets, +or by directly prompting the model; (2) Data generation: uses the powerful LLM +to generate (instruction, response) data that exhibit a randomly chosen pair of +these skills. Here, the use of random skill combinations promotes diversity and +difficulty. + Vanilla SFT (i.e., no PPO, DPO, or RL methods) on data generated from +Instruct-SkillMix leads to strong gains on instruction following benchmarks +such as AlpacaEval 2.0, MT-Bench, and WildBench. With just $4$K examples, +LLaMA-3-8B-Base achieves 42.76% length-controlled win rate on AlpacaEval 2.0. +To our knowledge, this achieves state-of-the-art performance among all models +that have only undergone SFT (no RL methods) and competes with proprietary +models such as Claude 3 Opus and LLaMA-3.1-405B-Instruct. + Ablation studies also suggest plausible reasons for why creating open +instruction-tuning datasets via naive crowd-sourcing has proved difficult. +Introducing low quality answers ("shirkers") in $20\%$ of Instruct-SkillMix +examples causes performance to plummet, sometimes catastrophically. + The Instruct-SkillMix pipeline is flexible and is adaptable to other +settings. + +
+
+
+
+
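A rough sketch of the two-stage recipe above, under the assumption of a generic `llm` call; the prompt texts and the skill count are invented for illustration and are not Instruct-SkillMix's actual prompts.

```python
# Sketch of the two-stage data generation recipe; `llm` is a placeholder and
# the prompt texts are invented for illustration.
import random

def llm(prompt: str) -> str:
    raise NotImplementedError("plug in an LLM client here")

def extract_skills(seed_instructions: list[str], k: int = 50) -> list[str]:
    # Stage 1: ask a strong LLM to name core instruction-following "skills".
    examples = "\n".join(seed_instructions[:20])
    reply = llm(f"List {k} distinct skills needed to follow instructions like:\n{examples}")
    return [line.strip("- ").strip() for line in reply.splitlines() if line.strip()]

def generate_sft_pair(skills: list[str]) -> tuple[str, str]:
    # Stage 2: pick a random pair of skills and generate one (instruction, response).
    skill_a, skill_b = random.sample(skills, 2)
    instruction = llm(
        f"Write a challenging user instruction that requires both '{skill_a}' and '{skill_b}'."
    )
    response = llm(f"Answer the following instruction as helpfully as possible:\n{instruction}")
    return instruction, response
```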
+ + ♻ ☆ Balancing Rigor and Utility: Mitigating Cognitive Biases in Large + Language Models for Multiple-Choice Questions + + +
+ This paper examines the role of cognitive biases in the decision-making
+processes of large language models (LLMs), challenging the conventional goal of
+eliminating all biases. We show that certain cognitive biases, when properly
+balanced, can enhance decision-making efficiency through rational deviations
+and heuristic shortcuts. By introducing heuristic moderation and an abstention
+option, which allows LLMs to withhold responses when uncertain, we reduce error
+rates, improve decision accuracy, and optimize decision rates. Using the
+Balance Rigor and Utility (BRU) dataset, developed through expert
+collaboration, our findings demonstrate that targeted inspection of cognitive
+biases aligns LLM decisions more closely with human reasoning, enhancing
+reliability and suggesting strategies for future improvements. This approach
+offers a novel way to leverage cognitive biases to improve the practical
+utility of LLMs across various applications.
+
+
+ comment: This article is currently under review. All data will be open on + GitHub once the review is complete. + https://github.com/limanwang/Balancing-Rigor-and-Utility +
+
+
+
+
+ + ♻ ☆ X-InstructBLIP: A Framework for aligning X-Modal instruction-aware + representations to LLMs and Emergent Cross-modal Reasoning + + +
+ Recent research has achieved significant advancements in visual reasoning +tasks through learning image-to-language projections and leveraging the +impressive reasoning abilities of Large Language Models (LLMs). This paper +introduces an efficient and effective framework that integrates multiple +modalities (images, 3D, audio and video) to a frozen LLM and demonstrates an +emergent ability for cross-modal reasoning (2+ modality inputs). Our approach +explores two distinct projection mechanisms: Q-Formers and Linear Projections +(LPs). Through extensive experimentation across all four modalities on 16 +benchmarks, we explore both methods and assess their adaptability in integrated +and separate cross-modal reasoning. The Q-Former projection demonstrates +superior performance in single modality scenarios and adaptability in joint +versus discriminative reasoning involving two or more modalities. However, it +exhibits lower generalization capabilities than linear projection in contexts +where task-modality data are limited. To enable this framework, we devise a +scalable pipeline that automatically generates high-quality, instruction-tuning +datasets from readily available captioning data across different modalities, +and contribute 24K QA data for audio and 250K QA data for 3D. To facilitate +further research in cross-modal reasoning, we introduce the DisCRn +(Discriminative Cross-modal Reasoning) benchmark comprising 9K audio-video QA +samples and 28K image-3D QA samples that require the model to reason +discriminatively across disparate input modalities. + +
+
+
+
+
+ + ♻ ☆ PanoSent: A Panoptic Sextuple Extraction Benchmark for Multimodal + Conversational Aspect-based Sentiment Analysis ACM MM 2024 + + +
+ While existing Aspect-based Sentiment Analysis (ABSA) has received extensive +effort and advancement, there are still gaps in defining a more holistic +research target seamlessly integrating multimodality, conversation context, +fine-granularity, and also covering the changing sentiment dynamics as well as +cognitive causal rationales. This paper bridges the gaps by introducing a +multimodal conversational ABSA, where two novel subtasks are proposed: 1) +Panoptic Sentiment Sextuple Extraction, panoramically recognizing holder, +target, aspect, opinion, sentiment, rationale from multi-turn multi-party +multimodal dialogue. 2) Sentiment Flipping Analysis, detecting the dynamic +sentiment transformation throughout the conversation with the causal reasons. +To benchmark the tasks, we construct PanoSent, a dataset annotated both +manually and automatically, featuring high quality, large scale, multimodality, +multilingualism, multi-scenarios, and covering both implicit and explicit +sentiment elements. To effectively address the tasks, we devise a novel +Chain-of-Sentiment reasoning framework, together with a novel multimodal large +language model (namely Sentica) and a paraphrase-based verification mechanism. +Extensive evaluations demonstrate the superiority of our methods over strong +baselines, validating the efficacy of all our proposed methods. The work is +expected to open up a new era for the ABSA community, and thus all our codes +and data are open at https://PanoSent.github.io/ + +
+
+ comment: Accepted by ACM MM 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ LoQT: Low-Rank Adapters for Quantized Pre-Training + + +
+ Training of large neural networks requires significant computational +resources. Despite advances using low-rank adapters and quantization, +pretraining of models such as LLMs on consumer hardware has not been possible +without model sharding, offloading during training, or per-layer gradient +updates. To address these limitations, we propose LoQT, a method for +efficiently training quantized models. LoQT uses gradient-based tensor +factorization to initialize low-rank trainable weight matrices that are +periodically merged into quantized full-rank weight matrices. Our approach is +suitable for both pretraining and fine-tuning of models, which we demonstrate +experimentally for language modeling and downstream task adaptation. We find +that LoQT enables efficient training of models up to 7B parameters on a +consumer-grade 24GB GPU. We also demonstrate the feasibility of training a 13B +parameter model using per-layer gradient updates on the same hardware. + +
+
+
+
+
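The core mechanic above, training low-rank factors against a frozen quantized base and periodically folding them back in, can be illustrated as follows. The uniform quantizer, the rank, and the initialization scales are toy assumptions, not LoQT's actual choices.

```python
# Toy illustration of periodically folding a low-rank update into quantized
# weights; the uniform quantizer, rank, and scales are placeholders.
import torch

def quantize(w: torch.Tensor, bits: int = 4) -> torch.Tensor:
    # Simple symmetric uniform quantization (illustrative only).
    levels = 2 ** (bits - 1) - 1
    scale = w.abs().max() / levels
    return torch.round(w / scale).clamp(-levels, levels) * scale

d, rank = 1024, 16
W_q = quantize(torch.randn(d, d) * 0.02)            # frozen quantized full-rank weight
A = (torch.randn(d, rank) * 0.01).requires_grad_()  # trainable low-rank factors
B = torch.zeros(rank, d, requires_grad=True)

def forward(x: torch.Tensor) -> torch.Tensor:
    # Effective weight = quantized base + low-rank correction.
    return x @ (W_q + A @ B)

def merge_and_reset():
    # Periodically merge the learned update into the quantized base and restart
    # the low-rank factors, so only quantized full-rank state is kept long-term.
    global W_q, A, B
    with torch.no_grad():
        W_q = quantize(W_q + A @ B)
    A = (torch.randn(d, rank) * 0.01).requires_grad_()
    B = torch.zeros(rank, d, requires_grad=True)
```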
+ + ♻ ☆ An Empirical Study on Information Extraction using Large Language Models + + +
+ Human-like large language models (LLMs), especially the most powerful and +popular ones in OpenAI's GPT family, have proven to be very helpful for many +natural language processing (NLP) related tasks. Therefore, various attempts +have been made to apply LLMs to information extraction (IE), which is a +fundamental NLP task that involves extracting information from unstructured +plain text. To demonstrate the latest representative progress in LLMs' +information extraction ability, we assess the information extraction ability of +GPT-4 (the latest version of GPT at the time of writing this paper) from four +perspectives: Performance, Evaluation Criteria, Robustness, and Error Types. +Our results suggest a visible performance gap between GPT-4 and +state-of-the-art (SOTA) IE methods. To alleviate this problem, considering the +LLMs' human-like characteristics, we propose and analyze the effects of a +series of simple prompt-based methods, which can be generalized to other LLMs +and NLP tasks. Rich experiments show our methods' effectiveness and some of +their remaining issues in improving GPT-4's information extraction ability. + +
+
+ comment: Need to submit this paper as the replacement of arXiv:2305.14450 +
+
+
+
+
+ + ♻ ☆ Improving Factuality in Large Language Models via Decoding-Time + Hallucinatory and Truthful Comparators + + +
+ Despite their remarkable capabilities, Large Language Models (LLMs) are prone +to generate responses that contradict verifiable facts, i.e., unfaithful +hallucination content. Existing efforts generally focus on optimizing model +parameters or editing semantic representations, which compromise the internal +factual knowledge of target LLMs. In addition, hallucinations typically exhibit +multifaceted patterns in downstream tasks, limiting the model's holistic +performance across tasks. In this paper, we propose a Comparator-driven +Decoding-Time (CDT) framework to alleviate the response hallucination. Firstly, +we construct hallucinatory and truthful comparators with multi-task fine-tuning +samples. In this case, we present an instruction prototype-guided mixture of +experts strategy to enhance the ability of the corresponding comparators to +capture different hallucination or truthfulness patterns in distinct task +instructions. CDT constrains next-token predictions to factuality-robust +distributions by contrasting the logit differences between the target LLMs and +these comparators. Systematic experiments on multiple downstream tasks show +that our framework can significantly improve the model performance and response +factuality. + +
+
+ comment: Hallucination Mitigation in LLMs +
+
+
+
+
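The decoding-time contrast described above can be sketched as a per-step logit adjustment. The mixing rule and the `alpha` weight below are assumptions, since the abstract does not give the exact formula used by CDT.

```python
# Simplified decoding step: push the target model's next-token logits toward a
# truthful comparator and away from a hallucinatory one.
import torch

def contrastive_next_token(logits_target: torch.Tensor,
                           logits_truthful: torch.Tensor,
                           logits_hallucinatory: torch.Tensor,
                           alpha: float = 0.5) -> int:
    # Assumed combination rule: add the (truthful - hallucinatory) logit gap.
    adjusted = logits_target + alpha * (logits_truthful - logits_hallucinatory)
    return int(torch.argmax(torch.softmax(adjusted, dim=-1)))

# Dummy usage over a 10-token vocabulary:
vocab_size = 10
token_id = contrastive_next_token(torch.randn(vocab_size),
                                  torch.randn(vocab_size),
                                  torch.randn(vocab_size))
```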
+ + ♻ ☆ CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding + Benchmark + + +
+ As the capabilities of large multimodal models (LMMs) continue to advance, +evaluating the performance of LMMs emerges as an increasing need. Additionally, +there is an even larger gap in evaluating the advanced knowledge and reasoning +abilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU, +a new Chinese Massive Multi-discipline Multimodal Understanding benchmark +designed to evaluate LMMs on tasks demanding college-level subject knowledge +and deliberate reasoning in a Chinese context. CMMMU is inspired by and +strictly follows the annotation and analysis pattern of MMMU. CMMMU includes +12k manually collected multimodal questions from college exams, quizzes, and +textbooks, covering six core disciplines: Art & Design, Business, Science, +Health & Medicine, Humanities & Social Science, and Tech & Engineering, like +its companion, MMMU. These questions span 30 subjects and comprise 39 highly +heterogeneous image types, such as charts, diagrams, maps, tables, music +sheets, and chemical structures. CMMMU focuses on complex perception and +reasoning with domain-specific knowledge in the Chinese context. We evaluate 11 +open-source LLMs and one proprietary GPT-4V(ision). Even GPT-4V only achieves +accuracies of 42%, indicating a large space for improvement. CMMMU will boost +the community to build the next-generation LMMs towards expert artificial +intelligence and promote the democratization of LMMs by providing diverse +language contexts. + +
+
+
+
+
+ + ♻ ☆ AnyMatch -- Efficient Zero-Shot Entity Matching with a Small Language + Model + + +
+ Entity matching (EM) is the problem of determining whether two records refer
+to the same real-world entity, which is crucial in data integration, e.g., for
+product catalogs or address databases. A major drawback of many EM approaches
+is their dependence on labelled examples. We thus focus on the challenging
+setting of zero-shot entity matching where no labelled examples are available
+for an unseen target dataset. Recently, large language models (LLMs) have shown
+promising results for zero-shot EM, but their low throughput and high
+deployment cost limit their applicability and scalability.
+ We revisit the zero-shot EM problem with AnyMatch, a small language model
+fine-tuned in a transfer learning setup. We propose several novel data
+selection techniques to generate fine-tuning data for our model, e.g., by
+selecting difficult pairs to match via an AutoML filter, by generating
+additional attribute-level examples, and by controlling label imbalance in the
+data.
+ We conduct an extensive evaluation of the prediction quality and deployment
+cost of our model, in a comparison to thirteen baselines on nine benchmark
+datasets. We find that AnyMatch provides competitive prediction quality despite
+its small parameter size: it achieves the second-highest F1 score overall, and
+outperforms several other approaches that employ models with hundreds of
+billions of parameters. Furthermore, our approach exhibits major cost benefits:
+the average prediction quality of AnyMatch is within 4.4% of the
+state-of-the-art method MatchGPT with the proprietary trillion-parameter model
+GPT-4, yet AnyMatch requires four orders of magnitude fewer parameters and
+incurs a 3,899 times lower inference cost (in dollars per 1,000 tokens).
+
+
+ comment: 12 pages excluding references, 3 figures, and 5 tables +
+
+
+
+
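One way to picture the small-model setup above is a simple record-pair serialization fed to a binary classifier. The template, the `[SEP]` marker, and the 0.5 threshold are illustrative assumptions rather than AnyMatch's exact configuration.

```python
# Illustrative record-pair serialization for zero-shot entity matching with a
# small model.

def serialize_pair(rec_a: dict, rec_b: dict) -> str:
    def render(rec: dict) -> str:
        return " ; ".join(f"{k}: {v}" for k, v in sorted(rec.items()))
    return f"Record A: {render(rec_a)} [SEP] Record B: {render(rec_b)}"

def is_match(rec_a: dict, rec_b: dict, classifier) -> bool:
    # `classifier` is any callable returning P(match) for the serialized pair,
    # e.g. a fine-tuned small transformer with a binary head.
    return classifier(serialize_pair(rec_a, rec_b)) > 0.5

# Example pair from a product catalog:
a = {"title": "Logitech MX Master 3 Mouse", "price": "99.99"}
b = {"title": "MX Master 3 Wireless Mouse by Logitech", "price": "99.99"}
print(serialize_pair(a, b))
```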
+ + ♻ ☆ RAGLAB: A Modular and Research-Oriented Unified Framework for + Retrieval-Augmented Generation + + +
+ Large Language Models (LLMs) demonstrate human-level capabilities in +dialogue, reasoning, and knowledge retention. However, even the most advanced +LLMs face challenges such as hallucinations and real-time updating of their +knowledge. Current research addresses this bottleneck by equipping LLMs with +external knowledge, a technique known as Retrieval Augmented Generation (RAG). +However, two key issues constrained the development of RAG. First, there is a +growing lack of comprehensive and fair comparisons between novel RAG +algorithms. Second, open-source tools such as LlamaIndex and LangChain employ +high-level abstractions, which results in a lack of transparency and limits the +ability to develop novel algorithms and evaluation metrics. To close this gap, +we introduce RAGLAB, a modular and research-oriented open-source library. +RAGLAB reproduces 6 existing algorithms and provides a comprehensive ecosystem +for investigating RAG algorithms. Leveraging RAGLAB, we conduct a fair +comparison of 6 RAG algorithms across 10 benchmarks. With RAGLAB, researchers +can efficiently compare the performance of various algorithms and develop novel +algorithms. + +
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild + + +
+ The increasing availability of real-world conversation data offers exciting +opportunities for researchers to study user-chatbot interactions. However, the +sheer volume of this data makes manually examining individual conversations +impractical. To overcome this challenge, we introduce WildVis, an interactive +tool that enables fast, versatile, and large-scale conversation analysis. +WildVis provides search and visualization capabilities in the text and +embedding spaces based on a list of criteria. To manage million-scale datasets, +we implemented optimizations including search index construction, embedding +precomputation and compression, and caching to ensure responsive user +interactions within seconds. We demonstrate WildVis' utility through three case +studies: facilitating chatbot misuse research, visualizing and comparing topic +distributions across datasets, and characterizing user-specific conversation +patterns. WildVis is open-source and designed to be extendable, supporting +additional datasets and customized search and visualization functionalities. + +
+
+
+
+
+ + ♻ ☆ Towards a Unified View of Preference Learning for Large Language Models: + A Survey + + +
+ Large Language Models (LLMs) exhibit remarkably powerful capabilities. One of +the crucial factors to achieve success is aligning the LLM's output with human +preferences. This alignment process often requires only a small amount of data +to efficiently enhance the LLM's performance. While effective, research in this +area spans multiple domains, and the methods involved are relatively complex to +understand. The relationships between different methods have been +under-explored, limiting the development of the preference alignment. In light +of this, we break down the existing popular alignment strategies into different +components and provide a unified framework to study the current alignment +strategies, thereby establishing connections among them. In this survey, we +decompose all the strategies in preference learning into four components: +model, data, feedback, and algorithm. This unified view offers an in-depth +understanding of existing alignment algorithms and also opens up possibilities +to synergize the strengths of different strategies. Furthermore, we present +detailed working examples of prevalent existing algorithms to facilitate a +comprehensive understanding for the readers. Finally, based on our unified +perspective, we explore the challenges and future research directions for +aligning large language models with human preferences. + +
+
+ comment: 23 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ YOLO-Stutter: End-to-end Region-Wise Speech Dysfluency Detection + + +
+ Dysfluent speech detection is the bottleneck for disordered speech analysis
+and spoken language learning. Current state-of-the-art models are governed by
+rule-based systems which lack efficiency and robustness, and are sensitive to
+template design. In this paper, we propose YOLO-Stutter: the first end-to-end
+method that detects dysfluencies in a time-accurate manner. YOLO-Stutter takes
+imperfect speech-text alignment as input, followed by a spatial feature
+aggregator, and a temporal dependency extractor to perform region-wise boundary
+and class predictions. We also introduce two dysfluency corpora, VCTK-Stutter
+and VCTK-TTS, that simulate natural spoken dysfluencies including repetition,
+block, missing, replacement, and prolongation. Our end-to-end method achieves
+state-of-the-art performance with a minimal number of trainable parameters on
+both simulated data and real aphasia speech. Code and datasets are
+open-sourced at https://github.com/rorizzz/YOLO-Stutter
+
+
+ comment: Interspeech 2024 +
+
+
+
+
+ + ♻ ☆ Natural Language Processing RELIES on Linguistics + + +
+ Large Language Models (LLMs) have become capable of generating highly fluent +text in certain languages, without modules specially designed to capture +grammar or semantic coherence. What does this mean for the future of linguistic +expertise in NLP? We highlight several aspects in which NLP (still) relies on +linguistics, or where linguistic thinking can illuminate new directions. We +argue our case around the acronym RELIES that encapsulates six major facets +where linguistics contributes to NLP: Resources, Evaluation, Low-resource +settings, Interpretability, Explanation, and the Study of language. This list +is not exhaustive, nor is linguistics the main point of reference for every +effort under these themes; but at a macro level, these facets highlight the +enduring importance of studying machine systems vis-\`a-vis systems of human +language. + +
+
+
+
+
+ + ♻ ☆ CL4KGE: A Curriculum Learning Method for Knowledge Graph Embedding + + +
+ Knowledge graph embedding (KGE) constitutes a foundational task, directed
+towards learning representations for entities and relations within knowledge
+graphs (KGs), with the objective of crafting representations comprehensive
+enough to approximate the logical and symbolic interconnections among entities.
+In this paper, we define a metric Z-counts to measure the difficulty of
+training each triple (<head entity, relation, tail entity>) in KGs with
+theoretical analysis. Based on this metric, we propose CL4KGE, an efficient
+Curriculum Learning based training strategy for KGE. This method includes a
+difficulty measurer and a training scheduler that aids in the training of KGE
+models. Our approach possesses the flexibility to act as a plugin within a wide
+range of KGE models, with the added advantage of adaptability to the majority
+of KGs in existence. The proposed method has been evaluated on popular KGE
+models, and the results demonstrate that it enhances the state-of-the-art
+methods. The use of Z-counts as a metric has enabled the identification of
+challenging triples in KGs, which helps in devising effective training
+strategies.
+
+
+ comment: 16 pages, 3 figures +
+
+
+
+
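The curriculum idea above, ordering triples by difficulty and growing the training pool from easy to hard, is sketched below. Because the abstract does not define Z-counts, the `difficulty` callable is only a placeholder for that metric.

```python
# Curriculum scheduling sketch: rank training triples by a difficulty score and
# grow the training pool from easy to hard. `difficulty` stands in for Z-counts.
from typing import Callable, Sequence

Triple = tuple[str, str, str]  # (head entity, relation, tail entity)

def curriculum_pools(triples: Sequence[Triple],
                     difficulty: Callable[[Triple], float],
                     num_stages: int = 5):
    ranked = sorted(triples, key=difficulty)
    for stage in range(1, num_stages + 1):
        # At stage k, train on the easiest k/num_stages fraction of the data.
        yield ranked[:int(len(ranked) * stage / num_stages)]

# Example with a trivial stand-in difficulty (length of the relation name):
data = [("Paris", "capital_of", "France"),
        ("X", "rel", "Y"),
        ("A", "related_to", "B")]
for pool in curriculum_pools(data, difficulty=lambda t: len(t[1]), num_stages=3):
    print(len(pool), "triples in this stage")
```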
+ + ♻ ☆ ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG + Capabilities + + +
+ In this work, we introduce ChatQA 2, a Llama 3.0-based model with a 128K
+context window, designed to bridge the gap between open-source LLMs and leading
+proprietary models (e.g., GPT-4-Turbo) in long-context understanding and
+retrieval-augmented generation (RAG) capabilities. These two capabilities are
+essential for LLMs to process large volumes of information that cannot fit into
+a single prompt and are complementary to each other, depending on the
+downstream tasks and computational budgets. We present a detailed continued
+training recipe to extend the context window of Llama3-70B-base from 8K to 128K
+tokens, along with a three-stage instruction tuning process to enhance the
+model's instruction-following, RAG performance, and long-context understanding
+capabilities. Our results demonstrate that the Llama3-ChatQA-2-70B model
+outperforms most existing state-of-the-art models, including
+GPT-4-Turbo-2024-04-09, Qwen2-72B-Instruct, and Llama3.1-70B-Instruct, on
+ultra-long tasks beyond 100K tokens, as well as on the RAG benchmark using only
+a 4K context window, showing the strong long context capability across varying
+sequence lengths. We further provide extensive comparisons between direct
+long-context and RAG solutions using the same state-of-the-art long-context
+LLMs. Interestingly, we find that the performance of strong long-context LLMs
+using RAG improves when retrieving a larger number of chunks. With a large set
+of top-k chunks, RAG consistently outperforms direct long-context solutions
+using the same state-of-the-art long-context models (e.g., Llama3-ChatQA-2-70B
+and Qwen2-72B-Instruct) on both 32K benchmarks and real-world 128K tasks. To
+advance research in this field, we open-sourced the model weights, training
+data, and the evaluation setup for the community:
+https://chatqa2-project.github.io/
+
+
+ comment: v2: major update with significantly improved results +
+
+
+
+
+ + ♻ ☆ A Survey on Employing Large Language Models for Text-to-SQL Tasks + + +
+ The increasing volume of data stored in relational databases has led to the +need for efficient querying and utilization of this data in various sectors. +However, writing SQL queries requires specialized knowledge, which poses a +challenge for non-professional users trying to access and query databases. +Text-to-SQL parsing solves this issue by converting natural language queries +into SQL queries, thus making database access more accessible for non-expert +users. To take advantage of the recent developments in Large Language Models +(LLMs), a range of new methods have emerged, with a primary focus on prompt +engineering and fine-tuning. This survey provides a comprehensive overview of +LLMs in text-to-SQL tasks, discussing benchmark datasets, prompt engineering, +fine-tuning methods, and future research directions. We hope this review will +enable readers to gain a broader understanding of the recent advances in this +field and offer some insights into its future trajectory. + +
+
+
+
+
+ + ♻ ☆ Advancing Aspect-Based Sentiment Analysis through Deep Learning Models + + +
+ Aspect-based sentiment analysis predicts sentiment polarity with fine
+granularity. While graph convolutional networks (GCNs) are widely utilized for
+sentimental feature extraction, their naive application for syntactic feature
+extraction can compromise information preservation. This study introduces an
+innovative edge-enhanced GCN, named SentiSys, to navigate the syntactic graph
+while preserving intact feature information, leading to enhanced performance.
+Specifically, we first integrate a bidirectional long short-term memory
+(Bi-LSTM) network and a self-attention-based transformer. This combination
+facilitates effective text encoding, preventing the loss of information and
+predicting long-dependency text. A bidirectional GCN (Bi-GCN) with message
+passing is then employed to encode relationships between entities.
+Additionally, unnecessary information is filtered out using an aspect-specific
+masking technique. To validate the effectiveness of our proposed model, we
+conduct extensive evaluation experiments on four benchmark datasets. The
+experimental results demonstrate enhanced performance in aspect-based sentiment
+analysis with the use of SentiSys.
+
+
+ comment: This paper has already been accepted by the 20th International + Conference on Advanced Data Mining and Applications (ADMA2024) +
+
+
+
+
+ + ♻ ☆ Disentangling Length from Quality in Direct Preference Optimization + + +
+ Reinforcement Learning from Human Feedback (RLHF) has been a crucial
+component in the recent success of Large Language Models. However, RLHF is
+known to exploit biases in human preferences, such as verbosity. A
+well-formatted and eloquent answer is often more highly rated by users, even
+when it is less helpful and objective. A number of approaches have been
+developed to control those biases in the classical RLHF literature, but the
+problem remains relatively under-explored for Direct Alignment Algorithms such
+as Direct Preference Optimization (DPO). Unlike classical RLHF, DPO does not
+train a separate reward model or use reinforcement learning directly, so
+previous approaches developed to control verbosity cannot be directly applied
+to this setting. Our work makes several contributions. For the first time, we
+study the length problem in the DPO setting, showing significant exploitation
+in DPO and linking it to out-of-distribution bootstrapping. We then develop a
+principled but simple regularization strategy that prevents length
+exploitation, while still maintaining improvements in model quality. We
+demonstrate these effects across datasets on summarization and dialogue, where
+we achieve up to 20% improvement in win rates when controlling for length,
+despite the GPT-4 judge's well-known verbosity bias.
+
+
+
+
+
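The abstract does not spell out the regularizer, so the sketch below shows one plausible form: subtracting a length-difference penalty from the DPO margin so that verbosity alone cannot win a comparison. The `alpha` and `beta` values are assumed hyperparameters.

```python
# One plausible length-regularized DPO loss (assumed form, not the paper's
# exact objective): penalize the preference margin by the length difference.
import torch
import torch.nn.functional as F

def length_regularized_dpo_loss(logp_chosen, logp_rejected,          # policy log-probs
                                ref_logp_chosen, ref_logp_rejected,  # reference log-probs
                                len_chosen, len_rejected,
                                beta: float = 0.1, alpha: float = 0.01):
    margin = (logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected)
    length_penalty = alpha * (len_chosen - len_rejected)
    return -F.logsigmoid(beta * margin - length_penalty).mean()

# Dummy usage on a batch of 4 preference pairs:
batch = 4
loss = length_regularized_dpo_loss(torch.randn(batch), torch.randn(batch),
                                   torch.randn(batch), torch.randn(batch),
                                   torch.randint(10, 200, (batch,)).float(),
                                   torch.randint(10, 200, (batch,)).float())
```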
+ + ♻ ☆ MQuAKE: Assessing Knowledge Editing in Language Models via Multi-Hop + Questions EMNLP 2023 + + +
+ The information stored in large language models (LLMs) falls out of date +quickly, and retraining from scratch is often not an option. This has recently +given rise to a range of techniques for injecting new facts through updating +model weights. Current evaluation paradigms are extremely limited, mainly +validating the recall of edited facts, but changing one fact should cause +rippling changes to the model's related beliefs. If we edit the UK Prime +Minister to now be Rishi Sunak, then we should get a different answer to Who is +married to the British Prime Minister? In this work, we present a benchmark, +MQuAKE (Multi-hop Question Answering for Knowledge Editing), comprising +multi-hop questions that assess whether edited models correctly answer +questions where the answer should change as an entailed consequence of edited +facts. While we find that current knowledge-editing approaches can recall +edited facts accurately, they fail catastrophically on the constructed +multi-hop questions. We thus propose a simple memory-based approach, MeLLo, +which stores all edited facts externally while prompting the language model +iteratively to generate answers that are consistent with the edited facts. +While MQuAKE remains challenging, we show that MeLLo scales well with LLMs +(e.g., OpenAI GPT-3.5-turbo) and outperforms previous model editors by a large +margin. + +
+
+ comment: EMNLP 2023. Our code and datasets are available at + https://github.com/princeton-nlp/MQuAKE +
+
+
+
+
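The memory-based approach above can be pictured as an external store of edited facts plus a prompt that asks the frozen model to stay consistent with whatever is retrieved. The `llm` function, the keyword-overlap retriever, and the prompt wording are simplified stand-ins, not MeLLo's actual components.

```python
# Simplified memory-based editing loop: edited facts live in an external store
# and the frozen LLM is prompted to stay consistent with them.

def llm(prompt: str) -> str:
    raise NotImplementedError("plug in an LLM client here")

EDITED_FACTS = [
    "The Prime Minister of the United Kingdom is Rishi Sunak.",
]

def retrieve(question: str, facts: list[str]) -> list[str]:
    # Naive keyword-overlap retrieval for illustration only.
    question_words = set(question.lower().split())
    return [f for f in facts if question_words & set(f.lower().split())]

def answer_with_edits(question: str) -> str:
    relevant = retrieve(question, EDITED_FACTS)
    context = "\n".join(relevant) if relevant else "(no edits apply)"
    return llm(
        f"Updated facts:\n{context}\n\n"
        "Answer the question consistently with the updated facts, "
        f"reasoning hop by hop:\n{question}"
    )
```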
+ + ♻ ☆ Shared Latent Space by Both Languages in Non-Autoregressive Neural + Machine Translation + + +
+ Non-autoregressive neural machine translation (NAT) offers a substantial
+translation speed-up compared to autoregressive neural machine translation (AT)
+at the cost of translation quality. Latent variable modeling has emerged as a
+promising approach to bridge this quality gap, particularly for addressing the
+chronic multimodality problem in NAT. Previous works that used latent variable
+modeling added an auxiliary model to estimate the posterior distribution of the
+latent variable conditioned on the source and target sentences. However, this
+causes several disadvantages, such as redundant information extraction in the
+latent variable, an increased number of parameters, and a tendency to ignore
+some information from the inputs. In this paper, we propose a novel latent
+variable modeling that integrates a dual reconstruction perspective and an
+advanced hierarchical latent modeling with a shared intermediate latent space
+across languages. This latent variable modeling hypothetically alleviates or
+prevents the above disadvantages. In our experimental results, we
+comprehensively demonstrate that our proposed approach infers superior latent
+variables, which lead to better translation quality. Finally, on benchmark
+translation tasks, such as WMT, we demonstrate that our proposed method
+significantly improves translation quality compared to previous NAT baselines,
+including the state-of-the-art NAT model.
+
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 145 + +
+
+
+ + ☆ Flash Cache: Reducing Bias in Radiance Cache Based Inverse Rendering + + +
+ State-of-the-art techniques for 3D reconstruction are largely based on +volumetric scene representations, which require sampling multiple points to +compute the color arriving along a ray. Using these representations for more +general inverse rendering -- reconstructing geometry, materials, and lighting +from observed images -- is challenging because recursively path-tracing such +volumetric representations is expensive. Recent works alleviate this issue +through the use of radiance caches: data structures that store the +steady-state, infinite-bounce radiance arriving at any point from any +direction. However, these solutions rely on approximations that introduce bias +into the renderings and, more importantly, into the gradients used for +optimization. We present a method that avoids these approximations while +remaining computationally efficient. In particular, we leverage two techniques +to reduce variance for unbiased estimators of the rendering equation: (1) an +occlusion-aware importance sampler for incoming illumination and (2) a fast +cache architecture that can be used as a control variate for the radiance from +a high-quality, but more expensive, volumetric cache. We show that by removing +these biases our approach improves the generality of radiance cache based +inverse rendering, as well as increasing quality in the presence of challenging +light transport effects such as specular reflections. + +
+
+ comment: Website: https://benattal.github.io/flash-cache/ +
+
+
+
+
+ + ☆ Neural MP: A Generalist Neural Motion Planner + + +
+ The current paradigm for motion planning generates solutions from scratch for +every new problem, which consumes significant amounts of time and computational +resources. For complex, cluttered scenes, motion planning approaches can often +take minutes to produce a solution, while humans are able to accurately and +safely reach any goal in seconds by leveraging their prior experience. We seek +to do the same by applying data-driven learning at scale to the problem of +motion planning. Our approach builds a large number of complex scenes in +simulation, collects expert data from a motion planner, then distills it into a +reactive generalist policy. We then combine this with lightweight optimization +to obtain a safe path for real world deployment. We perform a thorough +evaluation of our method on 64 motion planning tasks across four diverse +environments with randomized poses, scenes and obstacles, in the real world, +demonstrating an improvement of 23%, 17% and 79% motion planning success rate +over state of the art sampling, optimization and learning based planning +methods. Video results available at mihdalal.github.io/neuralmotionplanner + +
+
+ comment: Website at mihdalal.github.io/neuralmotionplanner. Main paper: 7 + pages, 4 figures, 2 tables. Appendix: 9 pages, 5 figures, 6 tables +
+
+
+
+
+ + ☆ Promptable Closed-loop Traffic Simulation + + +
+ Simulation stands as a cornerstone for safe and efficient autonomous driving +development. At its core a simulation system ought to produce realistic, +reactive, and controllable traffic patterns. In this paper, we propose ProSim, +a multimodal promptable closed-loop traffic simulation framework. ProSim allows +the user to give a complex set of numerical, categorical or textual prompts to +instruct each agent's behavior and intention. ProSim then rolls out a traffic +scenario in a closed-loop manner, modeling each agent's interaction with other +traffic participants. Our experiments show that ProSim achieves high prompt +controllability given different user prompts, while reaching competitive +performance on the Waymo Sim Agents Challenge when no prompt is given. To +support research on promptable traffic simulation, we create +ProSim-Instruct-520k, a multimodal prompt-scenario paired driving dataset with +over 10M text prompts for over 520k real-world driving scenarios. We will +release code of ProSim as well as data and labeling tools of +ProSim-Instruct-520k at https://ariostgx.github.io/ProSim. + +
+
+ comment: Accepted to CoRL 2024. Website available at + https://ariostgx.github.io/ProSim +
+
+
+
+
+ + ☆ Evaluating Multiview Object Consistency in Humans and Image Models + + +
+ We introduce a benchmark to directly evaluate the alignment between human +observers and vision models on a 3D shape inference task. We leverage an +experimental design from the cognitive sciences which requires zero-shot visual +inferences about object shape: given a set of images, participants identify +which contain the same/different objects, despite considerable viewpoint +variation. We draw from a diverse range of images that include common objects +(e.g., chairs) as well as abstract shapes (i.e., procedurally generated +`nonsense' objects). After constructing over 2000 unique image sets, we +administer these tasks to human participants, collecting 35K trials of +behavioral data from over 500 participants. This includes explicit choice +behaviors as well as intermediate measures, such as reaction time and gaze +data. We then evaluate the performance of common vision models (e.g., DINOv2, +MAE, CLIP). We find that humans outperform all models by a wide margin. Using a +multi-scale evaluation approach, we identify underlying similarities and +differences between models and humans: while human-model performance is +correlated, humans allocate more time/processing on challenging trials. All +images, data, and code can be accessed via our project page. + +
+
+ comment: Project page: https:/tzler.github.io/MOCHI/ Code: + https://github.com/tzler/mochi_code Huggingface dataset: + https://huggingface.co/datasets/tzler/MOCHI +
+
+
+
+
+ + ☆ LSVOS Challenge Report: Large-scale Complex and Long Video Object + Segmentation ECCV 2024 + + +
+ Despite the promising performance of current video segmentation models on
+existing benchmarks, these models still struggle with complex scenes. In this
+paper, we introduce the 6th Large-scale Video Object Segmentation (LSVOS)
+challenge in conjunction with the ECCV 2024 workshop. This year's challenge
+includes two tasks: Video Object Segmentation (VOS) and Referring Video Object
+Segmentation (RVOS). This year, we replace the classic YouTube-VOS and
+YouTube-RVOS benchmarks with the latest datasets MOSE, LVOS, and MeViS to
+assess VOS under more challenging complex environments. This year's challenge
+attracted 129 registered teams from more than 20 institutes across over 8
+countries. This report includes the challenge and dataset introduction, and the
+methods used by the top 7 teams in the two tracks. More details can be found
+at our homepage https://lsvos.github.io/.
+
+
+ comment: ECCV 2024 LSVOS Challenge Report: https://lsvos.github.io/ +
+
+
+
+
+ + ☆ Vision-Driven 2D Supervised Fine-Tuning Framework for Bird's Eye View + Perception + + +
+ Visual bird's eye view (BEV) perception, due to its excellent perceptual
+capabilities, is progressively replacing costly LiDAR-based perception systems,
+especially in the realm of urban intelligent driving. However, this type of
+perception still relies on LiDAR data to construct ground truth databases, a
+process that is both cumbersome and time-consuming. Moreover, most
+mass-produced autonomous driving systems are only equipped with surround camera
+sensors and lack LiDAR data for precise annotation. To tackle this challenge,
+we propose a fine-tuning method for BEV perception networks based on visual 2D
+semantic perception, aimed at enhancing the model's generalization capabilities
+on new scene data. Considering the maturity and development of 2D perception
+technologies, our method significantly reduces the dependency on high-cost BEV
+ground truths and shows promising industrial application prospects. Extensive
+experiments and comparative analyses conducted on the nuScenes and Waymo public
+datasets demonstrate the effectiveness of our proposed method.
+
+
+
+
+
+ + ☆ GASP: Gaussian Splatting for Physic-Based Simulations + + +
+ Physics simulation is paramount for modeling and utilization of 3D scenes in +various real-world applications. However, its integration with state-of-the-art +3D scene rendering techniques such as Gaussian Splatting (GS) remains +challenging. Existing models use additional meshing mechanisms, including +triangle or tetrahedron meshing, marching cubes, or cage meshes. As an +alternative, we can modify the physics grounded Newtonian dynamics to align +with 3D Gaussian components. Current models take the first-order approximation +of a deformation map, which locally approximates the dynamics by linear +transformations. In contrast, our Gaussian Splatting for Physics-Based +Simulations (GASP) model uses such a map (without any modifications) and flat +Gaussian distributions, which are parameterized by three points (mesh faces). +Subsequently, each 3D point (mesh face node) is treated as a discrete entity +within a 3D space. Consequently, the problem of modeling Gaussian components is +reduced to working with 3D points. Additionally, the information on mesh faces +can be used to incorporate further properties into the physics model, +facilitating the use of triangles. Resulting solution can be integrated into +any physics engine that can be treated as a black box. As demonstrated in our +studies, the proposed model exhibits superior performance on a diverse range of +benchmark datasets designed for 3D object rendering. + +
+
+
+
+
+ + ☆ VFA: Vision Frequency Analysis of Foundation Models and Human + + +
+ Machine learning models often struggle with distribution shifts in real-world +scenarios, whereas humans exhibit robust adaptation. Models that better align +with human perception may achieve higher out-of-distribution generalization. In +this study, we investigate how various characteristics of large-scale computer +vision models influence their alignment with human capabilities and robustness. +Our findings indicate that increasing model and data size and incorporating +rich semantic information and multiple modalities enhance models' alignment +with human perception and their overall robustness. Our empirical analysis +demonstrates a strong correlation between out-of-distribution accuracy and +human alignment. + +
+
+
+
+
+ + ☆ A Flexible Framework for Universal Computational Aberration Correction + via Automatic Lens Library Generation and Domain Adaptation + + +
+ Emerging universal Computational Aberration Correction (CAC) paradigms +provide an inspiring solution to light-weight and high-quality imaging without +repeated data preparation and model training to accommodate new lens designs. +However, the training databases in these approaches, i.e., the lens libraries +(LensLibs), suffer from their limited coverage of real-world aberration +behaviors. In this work, we set up an OmniLens framework for universal CAC, +considering both the generalization ability and flexibility. OmniLens extends +the idea of universal CAC to a broader concept, where a base model is trained +for three cases, including zero-shot CAC with the pre-trained model, few-shot +CAC with a little lens-specific data for fine-tuning, and domain adaptive CAC +using domain adaptation for lens-descriptions-unknown lens. In terms of +OmniLens's data foundation, we first propose an Evolution-based Automatic +Optical Design (EAOD) pipeline to construct LensLib automatically, coined +AODLib, whose diversity is enriched by an evolution framework, with +comprehensive constraints and a hybrid optimization strategy for achieving +realistic aberration behaviors. For network design, we introduce the guidance +of high-quality codebook priors to facilitate zero-shot CAC and few-shot CAC, +which enhances the model's generalization ability, while also boosting its +convergence in a few-shot case. Furthermore, based on the statistical +observation of dark channel priors in optical degradation, we design an +unsupervised regularization term to adapt the base model to the target +descriptions-unknown lens using its aberration images without ground truth. We +validate OmniLens on 4 manually designed low-end lenses with various structures +and aberration behaviors. Remarkably, the base model trained on AODLib exhibits +strong generalization capabilities, achieving 97% of the lens-specific +performance in a zero-shot setting. + +
+
+
+
+
+ + ☆ Input Space Mode Connectivity in Deep Neural Networks + + +
+ We extend the concept of loss landscape mode connectivity to the input space +of deep neural networks. Mode connectivity was originally studied within +parameter space, where it describes the existence of low-loss paths between +different solutions (loss minimizers) obtained through gradient descent. We +present theoretical and empirical evidence of its presence in the input space +of deep networks, thereby highlighting the broader nature of the phenomenon. We +observe that different input images with similar predictions are generally +connected, and for trained models, the path tends to be simple, with only a +small deviation from being a linear path. Our methodology utilizes real, +interpolated, and synthetic inputs created using the input optimization +technique for feature visualization. We conjecture that input space mode +connectivity in high-dimensional spaces is a geometric effect that takes place +even in untrained models and can be explained through percolation theory. We +exploit mode connectivity to obtain new insights about adversarial examples and +demonstrate its potential for adversarial detection. Additionally, we discuss +applications for the interpretability of deep networks. + +
+
+
+
+
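The simplest probe of input-space connectivity mentioned above is to walk a straight line between two inputs that share a prediction and record the loss along the way. The sketch below assumes an arbitrary PyTorch classifier; nothing in it is specific to the paper's methodology.

```python
# Evaluate the loss along a straight line between two inputs that share a
# prediction, the simplest probe of input-space connectivity. `model` is any
# PyTorch classifier returning logits.
import torch
import torch.nn.functional as F

@torch.no_grad()
def loss_along_path(model, x_a: torch.Tensor, x_b: torch.Tensor,
                    label: int, steps: int = 20) -> list[float]:
    losses = []
    target = torch.tensor([label])
    for t in torch.linspace(0.0, 1.0, steps):
        x_t = (1 - t) * x_a + t * x_b                 # linear path in input space
        logits = model(x_t.unsqueeze(0))
        losses.append(F.cross_entropy(logits, target).item())
    return losses

# A flat, low-loss curve suggests the two inputs are (near-)linearly connected.
```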
+ + ☆ Leveraging Object Priors for Point Tracking ECCV 2024 + + +
+ Point tracking is a fundamental problem in computer vision with numerous
+applications in AR and robotics. A common failure mode in long-term point
+tracking occurs when the predicted point leaves the object it belongs to and
+lands on the background or another object. We identify this as the failure to
+correctly capture objectness properties in learning to track. To address this
+limitation of prior work, we propose a novel objectness regularization approach
+that guides points to be aware of object priors by forcing them to stay inside
+the boundaries of object instances. By capturing objectness cues at training
+time, we avoid the need to compute object masks during testing. In addition, we
+leverage contextual attention to enhance the feature representation for
+capturing objectness at the feature level more effectively. As a result, our
+approach achieves state-of-the-art performance on three point tracking
+benchmarks, and we further validate the effectiveness of our components via
+ablation studies. The source code is available at:
+https://github.com/RehgLab/tracking_objectness
+
+
+ comment: ECCV 2024 ILR Workshop +
+
+
+
+
+ + ☆ Creativity and Visual Communication from Machine to Musician: Sharing a + Score through a Robotic Camera + + +
+ This paper explores the integration of visual communication and musical +interaction by implementing a robotic camera within a "Guided Harmony" musical +game. We aim to examine co-creative behaviors between human musicians and +robotic systems. Our research explores existing methodologies like +improvisational game pieces and extends these concepts to include robotic +participation using a PTZ camera. The robotic system interprets and responds to +nonverbal cues from musicians, creating a collaborative and adaptive musical +experience. This initial case study underscores the importance of intuitive +visual communication channels. We also propose future research directions, +including parameters for refining the visual cue toolkit and data collection +methods to understand human-machine co-creativity further. Our findings +contribute to the broader understanding of machine intelligence in augmenting +human creativity, particularly in musical settings. + +
+
+
+
+
+ + ☆ Consensus-based Distributed Quantum Kernel Learning for Speech + Recognition + + +
+ This paper presents a Consensus-based Distributed Quantum Kernel Learning
+(CDQKL) framework aimed at improving speech recognition through distributed
+quantum computing. CDQKL addresses the challenges of scalability and data
+privacy in centralized quantum kernel learning. It does this by distributing
+computational tasks across quantum terminals, which are connected through
+classical channels. This approach enables the exchange of model parameters
+without sharing local training data, thereby maintaining data privacy and
+enhancing computational efficiency. Experimental evaluations on benchmark
+speech emotion recognition datasets demonstrate that CDQKL achieves competitive
+classification accuracy and scalability compared to centralized and local
+quantum kernel learning models. The distributed nature of CDQKL offers
+advantages in privacy preservation and computational efficiency, making it
+suitable for data-sensitive fields such as telecommunications, automotive, and
+finance. The findings suggest that CDQKL can effectively leverage distributed
+quantum computing for large-scale machine-learning tasks.
+
+
+
+
+
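The consensus step above, exchanging model parameters over classical channels without sharing local data, can be illustrated with plain gossip averaging; the quantum kernel itself is not modeled here. The ring topology and mixing matrix are toy assumptions.

```python
# Toy gossip-averaging round: each terminal keeps its own parameter vector and
# repeatedly averages with neighbours, never exchanging raw training data.
import numpy as np

num_nodes, dim = 4, 8
params = np.random.randn(num_nodes, dim)        # one parameter vector per terminal

# Doubly stochastic mixing matrix for a 4-node ring (self plus two neighbours).
W = np.array([[0.50, 0.25, 0.00, 0.25],
              [0.25, 0.50, 0.25, 0.00],
              [0.00, 0.25, 0.50, 0.25],
              [0.25, 0.00, 0.25, 0.50]])

for _ in range(50):
    params = W @ params                         # one consensus round

# All rows converge to the network-wide average.
print(np.allclose(params, params.mean(axis=0, keepdims=True), atol=1e-6))
```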
+ + ☆ ReL-SAR: Representation Learning for Skeleton Action Recognition with + Convolutional Transformers and BYOL + + +
+ To extract robust and generalizable skeleton action recognition features, +large amounts of well-curated data are typically required, which is a +challenging task hindered by annotation and computation costs. Therefore, +unsupervised representation learning is of prime importance to leverage +unlabeled skeleton data. In this work, we investigate unsupervised +representation learning for skeleton action recognition. For this purpose, we +designed a lightweight convolutional transformer framework, named ReL-SAR, +exploiting the complementarity of convolutional and attention layers for +jointly modeling spatial and temporal cues in skeleton sequences. We also use a +Selection-Permutation strategy for skeleton joints to ensure more informative +descriptions from skeletal data. Finally, we capitalize on Bootstrap Your Own +Latent (BYOL) to learn robust representations from unlabeled skeleton sequence +data. We achieved very competitive results on limited-size datasets: MCAD, +IXMAS, JHMDB, and NW-UCLA, showing the effectiveness of our proposed method +against state-of-the-art methods in terms of both performance and computational +efficiency. To ensure reproducibility and reusability, the source code +including all implementation parameters is provided at: +https://github.com/SafwenNaimi/Representation-Learning-for-Skeleton-Action-Recognition-with-Convolutional-Transformers-and-BYOL + +
+
+ comment: 8 pages, 4 figures, 6 tables +
+
+
+
+
+ + ☆ Robust Loss Functions for Object Grasping under Limited Ground Truth + + +
+ Object grasping is a crucial technology enabling robots to perceive and +interact with the environment sufficiently. However, in practical applications, +researchers are faced with missing or noisy ground truth while training the +convolutional neural network, which decreases the accuracy of the model. +Therefore, different loss functions are proposed to deal with these problems to +improve the accuracy of the neural network. For missing ground truth, a new +predicted category probability method is defined for unlabeled samples, which +works effectively in conjunction with the pseudo-labeling method. Furthermore, +for noisy ground truth, a symmetric loss function is introduced to resist the +corruption of label noises. The proposed loss functions are powerful, robust, +and easy to use. Experimental results based on the typical grasping neural +network show that our method can improve performance by 2 to 13 percent. + +
+
+
+
+
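The abstract above names a symmetric loss for noisy labels without specifying it, so the sketch below uses symmetric cross-entropy (forward plus reverse cross-entropy), a common noise-robust choice, as an illustrative stand-in. The `alpha`/`beta` weights and the clamp value are assumptions.

```python
# Symmetric cross-entropy (forward CE + reverse CE) as a noise-robust loss.
import torch
import torch.nn.functional as F

def symmetric_cross_entropy(logits: torch.Tensor, targets: torch.Tensor,
                            alpha: float = 0.1, beta: float = 1.0,
                            clamp_min: float = 1e-4) -> torch.Tensor:
    ce = F.cross_entropy(logits, targets)
    # Reverse cross-entropy: swap the roles of prediction and (one-hot) label,
    # clamping the label distribution so log(0) never appears.
    probs = torch.softmax(logits, dim=1)
    one_hot = F.one_hot(targets, num_classes=logits.size(1)).float().clamp(min=clamp_min)
    rce = -(probs * one_hot.log()).sum(dim=1).mean()
    return alpha * ce + beta * rce

# Dummy usage: 8 samples, 4 grasp-quality classes.
loss = symmetric_cross_entropy(torch.randn(8, 4), torch.randint(0, 4, (8,)))
```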
+ + ☆ Referring Expression Generation in Visually Grounded Dialogue with + Discourse-aware Comprehension Guiding + + +
+ We propose an approach to referring expression generation (REG) in visually +grounded dialogue that is meant to produce referring expressions (REs) that are +both discriminative and discourse-appropriate. Our method constitutes a +two-stage process. First, we model REG as a text- and image-conditioned +next-token prediction task. REs are autoregressively generated based on their +preceding linguistic context and a visual representation of the referent. +Second, we propose the use of discourse-aware comprehension guiding as part of +a generate-and-rerank strategy through which candidate REs generated with our +REG model are reranked based on their discourse-dependent discriminatory power. +Results from our human evaluation indicate that our proposed two-stage approach +is effective in producing discriminative REs, with higher performance in terms +of text-image retrieval accuracy for reranked REs compared to those generated +using greedy decoding. + +
+
+ comment: Accepted for publication at INLG 2024 +
+
+
+
+
+ + ☆ Boosting CNN-based Handwriting Recognition Systems with Learnable + Relaxation Labeling + + +
+ The primary challenge for handwriting recognition systems lies in managing long-range contextual dependencies, an issue that traditional models often struggle with. To mitigate it, attention mechanisms have recently been employed to enhance context-aware labelling, thereby achieving state-of-the-art performance. In the field of pattern recognition and image analysis, however, the use of contextual information in labelling problems has a long history and goes back at least to the early 1970s. Among the various approaches developed in those years, Relaxation Labelling (RL) processes have played a prominent role and have been the method of choice in the field for more than a decade. Contrary to recent transformer-based architectures, RL processes offer a principled approach to the use of contextual constraints, having a solid theoretical foundation grounded in variational inequality and game theory, as well as effective algorithms with convergence guarantees. In this paper, we propose a novel approach to handwriting recognition that integrates the strengths of two distinct methodologies. In particular, we propose integrating (trainable) RL processes with various well-established neural architectures, and we introduce a sparsification technique that accelerates the convergence of the algorithm and enhances the overall system's performance. Experiments over several benchmark datasets show that RL processes can improve generalisation ability, in some cases even surpassing transformer-based architectures. + +&#13;
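Since relaxation labelling may be unfamiliar to readers coming from the transformer literature, here is a compact NumPy sketch of the classical (non-trainable) update rule; the trainable and sparsified variants proposed in the paper are not reproduced here.

import numpy as np

def relaxation_step(p, r):
    # p: (n_objects, n_labels) label probabilities; r: (n, n, L, L) compatibilities.
    # Support q_i(lambda) = sum_j sum_mu r_ij(lambda, mu) * p_j(mu)
    q = np.einsum('ijlm,jm->il', r, p)
    q = np.clip(q, 0.0, None)
    new_p = p * q
    new_p /= new_p.sum(axis=1, keepdims=True) + 1e-12   # renormalize per object
    return new_p

def relax(p, r, iters=50, tol=1e-6):
    for _ in range(iters):
        new_p = relaxation_step(p, r)
        if np.abs(new_p - p).max() < tol:   # fixed point reached
            return new_p
        p = new_p
    return p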
+
+ comment: 26 pages, 3 figures +
+
+
+
+
+ + ☆ Segmentation by Factorization: Unsupervised Semantic Segmentation for + Pathology by Factorizing Foundation Model Features + + +
+ We introduce Segmentation by Factorization (F-SEG), an unsupervised +segmentation method for pathology that generates segmentation masks from +pre-trained deep learning models. F-SEG allows the use of pre-trained deep +neural networks, including recently developed pathology foundation models, for +semantic segmentation. It achieves this without requiring additional training +or finetuning, by factorizing the spatial features extracted by the models into +segmentation masks and their associated concept features. We create generic +tissue phenotypes for H&E images by training clustering models for multiple +numbers of clusters on features extracted from several deep learning models on +The Cancer Genome Atlas Program (TCGA), and then show how the clusters can be +used for factorizing corresponding segmentation masks using off-the-shelf deep +learning models. Our results show that F-SEG provides robust unsupervised +segmentation capabilities for H&E pathology images, and that the segmentation +quality is greatly improved by utilizing pathology foundation models. We +discuss and propose methods for evaluating the performance of unsupervised +segmentation in pathology. + +
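To make the factorization idea concrete, the Python sketch below clusters frozen-backbone spatial features of a single image into per-cluster masks with k-means. This is a simplification for illustration: the actual F-SEG pipeline factorizes features against cluster models trained on TCGA-wide features rather than clustering each image independently.

import numpy as np
from sklearn.cluster import KMeans

def features_to_masks(feats, n_clusters=6):
    # feats: (H, W, C) spatial features from a frozen (foundation) model.
    h, w, c = feats.shape
    flat = feats.reshape(-1, c)
    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(flat)
    masks = labels.reshape(h, w)
    # One binary mask per discovered concept / tissue cluster.
    return [(masks == k).astype(np.uint8) for k in range(n_clusters)]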
+
+
+
+
+ + ☆ LayeredFlow: A Real-World Benchmark for Non-Lambertian Multi-Layer + Optical Flow ECCV 2024 + + +
+ Achieving 3D understanding of non-Lambertian objects is an important task with many useful applications, but most existing algorithms struggle to deal with such objects. One major obstacle towards progress in this field is the lack of holistic non-Lambertian benchmarks -- most benchmarks have low scene and object diversity, and none provide multi-layer 3D annotations for objects occluded by transparent surfaces. In this paper, we introduce LayeredFlow, a real-world benchmark containing multi-layer ground truth annotation for optical flow of non-Lambertian objects. Compared to previous benchmarks, our benchmark exhibits greater scene and object diversity, with 150k high-quality optical flow and stereo pairs taken over 185 indoor and outdoor scenes and 360 unique objects. Using LayeredFlow as evaluation data, we propose a new task called multi-layer optical flow. To provide training data for this task, we introduce a large-scale densely-annotated synthetic dataset containing 60k images within 30 scenes tailored for non-Lambertian objects. Training on our synthetic dataset enables models to predict multi-layer optical flow, while fine-tuning existing optical flow methods on the dataset notably boosts their performance on non-Lambertian objects without compromising the performance on diffuse objects. Data is available at https://layeredflow.cs.princeton.edu. + +&#13;
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ SX-Stitch: An Efficient VMS-UNet Based Framework for Intraoperative + Scoliosis X-Ray Image Stitching + + +
+ In scoliosis surgery, the limited field of view of the C-arm X-ray machine restricts the surgeons' holistic analysis of spinal structures. This paper presents an end-to-end, efficient and robust intraoperative X-ray image stitching method for scoliosis surgery, named SX-Stitch. The method is divided into two stages: segmentation and stitching. In the segmentation stage, we propose a medical image segmentation model named Vision Mamba of Spine-UNet (VMS-UNet), which utilizes the state space Mamba to capture long-distance contextual information while maintaining linear computational complexity, and incorporates the SimAM attention mechanism, significantly improving the segmentation performance. In the stitching stage, we simplify the alignment process between images to the minimization of a registration energy function. The total energy function is then optimized to order unordered images, and a hybrid energy function is introduced to optimize the best seam, effectively eliminating parallax artifacts. On the clinical dataset, SX-Stitch demonstrates superiority over SOTA schemes both qualitatively and quantitatively. + +&#13;
+
+
+
+
+ + ☆ Cherenkov Imaged Bio-morphological Features Verify Patient Positioning + with Deformable Tissue Translocation in Breast Radiotherapy + + +
+ Accurate patient positioning is critical for precise radiotherapy dose delivery, as positioning errors can significantly affect treatment outcomes. This study introduces a novel method for tracking loco-regional tissue deformation through Cherenkov image analysis during fractionated breast cancer radiotherapy. The primary goal was to develop and test an algorithm for Cherenkov-based regional position accuracy quantification, specifically for loco-regional deformations, which lack ideal quantification methods in radiotherapy. Blood vessel detection and segmentation were developed in Cherenkov images using a tissue phantom with incremental movements, and later applied to images from fractionated whole breast radiotherapy in human patients (n=10). A combined rigid and non-rigid registration technique was used to detect inter- and intra-fractional positioning variations. This approach quantified positioning variations in two parts: a global shift from rigid registration and a two-dimensional variation map of loco-regional deformation from non-rigid registration. The methodology was validated using an anthropomorphic chest phantom experiment, where known treatment couch translations and respiratory motion were simulated to assess inter- and intra-fractional uncertainties, yielding an average accuracy of 0.83 mm for couch translations up to 20 mm. Analysis of clinical Cherenkov data from ten breast cancer patients showed an inter-fraction setup variation of 3.7 ± 2.4 mm relative to the first fraction and loco-regional deformations (95th percentile) of up to 3.3 ± 1.9 mm. This study presents a Cherenkov-based approach to quantify global and local positioning variations, demonstrating feasibility in addressing loco-regional deformations that conventional imaging techniques fail to capture. + +&#13;
+
+ comment: 25 pages, 4 figures, 1 table, journal under review +
+
+
+
+
+ + ☆ AnomalyCD: A benchmark for Earth anomaly change detection with + high-resolution and time-series observations + + +
+ Various Earth anomalies have destroyed the stable, balanced state of the environment, resulting in fatalities and serious destruction of property. With the advantages of large-scale and precise observation, high-resolution remote sensing images have been widely used for anomaly monitoring and localization. Powered by deep representations, existing methods have achieved remarkable advances, primarily in classification and change detection techniques. However, labeled samples are difficult to acquire due to the low probability of anomaly occurrence, and the trained models are limited to fixed anomaly categories, which hinders their application to anomalies with few samples or unknown anomalies. In this paper, to tackle this problem, we propose the anomaly change detection (AnomalyCD) technique, which accepts time-series observations and learns to identify anomalous changes by learning from the historical normal change pattern. Compared to existing techniques, AnomalyCD processes an unfixed number of time steps and can localize various anomalies in a unified manner, without human supervision. To benchmark AnomalyCD, we constructed a high-resolution dataset with time-series images dedicated to various Earth anomalies (the AnomalyCDD dataset). AnomalyCDD contains high-resolution (0.15 to 2.39 m/pixel), time-series (3 to 7 time steps), and large-scale images (1927.93 km² in total) collected globally. Furthermore, we developed a zero-shot baseline model (AnomalyCDM), which implements the AnomalyCD technique by extracting a general representation from the Segment Anything Model (SAM) and conducting temporal comparison to distinguish anomalous changes from normal changes. AnomalyCDM is designed as a two-stage workflow to enhance efficiency, and is able to process unseen images directly, without retraining for each scene. + +&#13;
+
+ comment: remote sensing benchmark +
+
+
+
+
+ + ☆ Robust Real-time Segmentation of Bio-Morphological Features in Human + Cherenkov Imaging during Radiotherapy via Deep Learning + + +
+ Cherenkov imaging enables real-time visualization of megavoltage X-ray or electron beam delivery to the patient during Radiation Therapy (RT). Bio-morphological features, such as vasculature, seen in these images are patient-specific signatures that can be used for verification of positioning and motion management, both essential to precise RT treatment. However, until now, no concerted analysis of this biological feature-based tracking had been undertaken, because conventional image processing for feature segmentation was too slow and insufficiently accurate. This study demonstrated the first deep learning framework for such an application, achieving video frame rate processing. To address the challenge of limited annotation of these features in Cherenkov images, a transfer learning strategy was applied. A fundus photography dataset including 20,529 patch retina images with ground-truth vessel annotation was used to pre-train a ResNet segmentation framework. Subsequently, a small Cherenkov dataset (1,483 images from 212 treatment fractions of 19 breast cancer patients) with known annotated vasculature masks was used to fine-tune the model for accurate segmentation prediction. This deep learning framework achieved consistent and rapid segmentation of Cherenkov-imaged bio-morphological features on another 19 patients, including subcutaneous veins, scars, and pigmented skin. Average segmentation by the model achieved a Dice score of 0.85 and required less than 0.7 milliseconds of processing time per instance. The model demonstrated outstanding consistency under input image variations and far greater speed than conventional manual segmentation methods, laying the foundation for online segmentation in real-time monitoring in a prospective setting. + +&#13;
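The transfer-learning recipe described above boils down to pre-training a segmentation network on the large fundus vessel dataset and then fine-tuning it on the small annotated Cherenkov set. The Python (PyTorch) sketch below illustrates this two-stage schedule; model, fundus_loader, and cherenkov_loader are placeholders, and the learning rates and epoch counts are assumptions, not values from the paper.

import torch
import torch.nn.functional as F

def train(model, loader, lr, epochs, device="cuda"):
    # loader yields (images, masks) with masks as float tensors shaped like the logits.
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device).train()
    for _ in range(epochs):
        for images, masks in loader:
            images, masks = images.to(device), masks.to(device)
            logits = model(images)                       # (N, 1, H, W)
            loss = F.binary_cross_entropy_with_logits(logits, masks)
            opt.zero_grad()
            loss.backward()
            opt.step()

# 1) Pre-train on the large fundus vessel dataset,
# 2) fine-tune with a smaller learning rate on the annotated Cherenkov patches.
# train(model, fundus_loader, lr=1e-3, epochs=50)
# train(model, cherenkov_loader, lr=1e-4, epochs=20)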
+
+ comment: 9 pages, 7 figures, 1 table, journal under review +
+
+
+
+
+ + ☆ Real-Time Human Action Recognition on Embedded Platforms + + +
+ With advancements in computer vision and deep learning, video-based human +action recognition (HAR) has become practical. However, due to the complexity +of the computation pipeline, running HAR on live video streams incurs excessive +delays on embedded platforms. This work tackles the real-time performance +challenges of HAR with four contributions: 1) an experimental study identifying +a standard Optical Flow (OF) extraction technique as the latency bottleneck in +a state-of-the-art HAR pipeline, 2) an exploration of the latency-accuracy +tradeoff between the standard and deep learning approaches to OF extraction, +which highlights the need for a novel, efficient motion feature extractor, 3) +the design of Integrated Motion Feature Extractor (IMFE), a novel single-shot +neural network architecture for motion feature extraction with drastic +improvement in latency, 4) the development of RT-HARE, a real-time HAR system +tailored for embedded platforms. Experimental results on an Nvidia Jetson +Xavier NX platform demonstrated that RT-HARE realizes real-time HAR at a video +frame rate of 30 frames per second while delivering high levels of recognition +accuracy. + +
+
+
+
+
+ + ☆ Replay Consolidation with Label Propagation for Continual Object + Detection + + +
+ Object Detection is a highly relevant computer vision problem with many applications such as robotics and autonomous driving. Continual Learning (CL) considers a setting where a model incrementally learns new information while retaining previously acquired knowledge. This is particularly challenging since Deep Learning models tend to catastrophically forget old knowledge while training on new data. In particular, Continual Learning for Object Detection (CLOD) poses additional difficulties compared to CL for Classification. In CLOD, images from previous tasks may contain unknown classes that could reappear labeled in future tasks. These missing annotations cause task interference issues for replay-based approaches. As a result, most works in the literature have focused on distillation-based approaches. However, these approaches are effective only when there is a strong overlap of classes across tasks. To address the issues of current methodologies, we propose a novel technique to solve CLOD called Replay Consolidation with Label Propagation for Object Detection (RCLPOD). Based on the replay method, our solution avoids task interference issues by enhancing the buffer memory samples. Our method is evaluated against existing techniques in CLOD literature, demonstrating its superior performance on established benchmarks like VOC and COCO. + +&#13;
+
+
+
+
+ + ☆ Prototype-Driven Multi-Feature Generation for Visible-Infrared Person + Re-identification + + +
+ The primary challenges in visible-infrared person re-identification arise from the differences between visible (vis) and infrared (ir) images, including inter-modal and intra-modal variations. These challenges are further complicated by varying viewpoints and irregular movements. Existing methods often rely on horizontal partitioning to align part-level features, which can introduce inaccuracies and have limited effectiveness in reducing modality discrepancies. In this paper, we propose a novel Prototype-Driven Multi-feature generation framework (PDM) aimed at mitigating cross-modal discrepancies by constructing diversified features and mining latent semantically similar features for modal alignment. PDM comprises two key components: a Multi-Feature Generation Module (MFGM) and a Prototype Learning Module (PLM). The MFGM generates diverse features, closely distributed around modality-shared features, to represent pedestrians. Additionally, the PLM utilizes learnable prototypes to excavate latent semantic similarities among local features between the visible and infrared modalities, thereby facilitating cross-modal instance-level alignment. We introduce a cosine heterogeneity loss to enhance prototype diversity for extracting rich local features. Extensive experiments conducted on the SYSU-MM01 and LLCM datasets demonstrate that our approach achieves state-of-the-art performance. Our codes are available at https://github.com/mmunhappy/ICASSP2025-PDM. + +&#13;
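The abstract does not spell out the cosine heterogeneity loss, so the Python (PyTorch) snippet below is only a plausible sketch of such a diversity penalty: it pushes down the pairwise cosine similarity between learnable prototypes so that they spread out in feature space. The paper's exact formulation may differ.

import torch
import torch.nn.functional as F

def cosine_heterogeneity_loss(prototypes):
    # prototypes: (K, D) learnable vectors.
    z = F.normalize(prototypes, dim=1)
    sim = z @ z.t()                                   # (K, K) cosine similarities
    K = sim.size(0)
    off_diag = sim - torch.eye(K, device=sim.device)  # ignore self-similarity
    # Penalize positive similarity between distinct prototypes.
    return off_diag.clamp(min=0).sum() / (K * (K - 1))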
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ 3D-SAR Tomography and Machine Learning for High-Resolution Tree Height + Estimation + + +
+ Accurately estimating forest biomass is crucial for global carbon cycle +modelling and climate change mitigation. Tree height, a key factor in biomass +calculations, can be measured using Synthetic Aperture Radar (SAR) technology. +This study applies machine learning to extract forest height data from two SAR +products: Single Look Complex (SLC) images and tomographic cubes, in +preparation for the ESA Biomass Satellite mission. We use the TomoSense +dataset, containing SAR and LiDAR data from Germany's Eifel National Park, to +develop and evaluate height estimation models. Our approach includes classical +methods, deep learning with a 3D U-Net, and Bayesian-optimized techniques. By +testing various SAR frequencies and polarimetries, we establish a baseline for +future height and biomass modelling. Best-performing models predict forest +height to be within 2.82m mean absolute error for canopies around 30m, +advancing our ability to measure global carbon stocks and support climate +action. + +
+
+
+
+
+ + ☆ Renormalized Connection for Scale-preferred Object Detection in + Satellite Imagery + + +
+ Satellite imagery, due to its long-range imaging, brings with it a variety of scale-preferred tasks, such as the detection of tiny/small objects, making the precise localization and detection of small objects of interest a challenging task. In this article, we design a Knowledge Discovery Network (KDN) to implement renormalization group theory (RGT) in terms of efficient feature extraction. A renormalized connection (RC) on the KDN enables "synergistic focusing" of multi-scale features. Based on our observations of KDN, we abstract a class of RCs with different connection strengths, called n21C, and generalize it to FPN-based multi-branch detectors. In a series of FPN experiments on scale-preferred tasks, we found that the "divide-and-conquer" idea of FPN severely hampers the detector's learning in the right direction due to the large number of large-scale negative samples and interference from background noise. Moreover, these negative samples cannot be eliminated by the focal loss function. The RCs extend the multi-level feature "divide-and-conquer" mechanism of FPN-based detectors to a wide range of scale-preferred tasks and enable synergistic effects of multi-level features on the specific learning goal. In addition, interference activations in two aspects are greatly reduced and the detector learns in a more correct direction. Extensive experiments on 17 well-designed detection architectures embedded with n21Cs, across five different levels of scale-preferred tasks, validate the effectiveness and efficiency of the RCs. In particular, E421C, the simplest linear form of RC, performs well in all tasks and satisfies the scaling property of RGT. We hope that our approach will transfer a large number of well-designed detectors from the computer vision community to the remote sensing community. + +&#13;
+
+ comment: 24 pages, 14 figures Journal +
+
+
+
+
+ + ☆ G-NeLF: Memory- and Data-Efficient Hybrid Neural Light Field for Novel + View Synthesis + + +
+ Following the burgeoning interest in implicit neural representation, Neural +Light Field (NeLF) has been introduced to predict the color of a ray directly. +Unlike Neural Radiance Field (NeRF), NeLF does not create a point-wise +representation by predicting color and volume density for each point in space. +However, the current NeLF methods face a challenge as they need to train a NeRF +model first and then synthesize over 10K views to train NeLF for improved +performance. Additionally, the rendering quality of NeLF methods is lower +compared to NeRF methods. In this paper, we propose G-NeLF, a versatile +grid-based NeLF approach that utilizes spatial-aware features to unleash the +potential of the neural network's inference capability, and consequently +overcome the difficulties of NeLF training. Specifically, we employ a +spatial-aware feature sequence derived from a meticulously crafted grid as the +ray's representation. Drawing from our empirical studies on the adaptability of +multi-resolution hash tables, we introduce a novel grid-based ray +representation for NeLF that can represent the entire space with a very limited +number of parameters. To better utilize the sequence feature, we design a +lightweight ray color decoder that simulates the ray propagation process, +enabling a more efficient inference of the ray's color. G-NeLF can be trained +without necessitating significant storage overhead and with the model size of +only 0.95 MB to surpass previous state-of-the-art NeLF. Moreover, compared with +grid-based NeRF methods, e.g., Instant-NGP, we only utilize one-tenth of its +parameters to achieve higher performance. Our code will be released upon +acceptance. + +
+
+
+
+
+ + ☆ Adapted-MoE: Mixture of Experts with Test-Time Adaption for Anomaly + Detection + + +
+ Most unsupervised anomaly detection methods based on representations of normal samples to distinguish anomalies have recently made remarkable progress. However, existing methods only learn a single decision boundary for distinguishing the samples within the training dataset, neglecting the variation in feature distribution for normal samples, even within the same category, in the real world. Furthermore, the distribution bias between the test set and the training set is not considered. Therefore, we propose Adapted-MoE, which contains a routing network and a series of expert models to handle multiple distributions of same-category samples by divide and conquer. Specifically, we propose a routing network based on representation learning to route same-category samples into subclass feature spaces. Then, a series of expert models is utilized to learn the representation of various normal samples and construct several independent decision boundaries. We propose a test-time adaption to eliminate the bias between the unseen test sample representation and the feature distribution learned by the expert model. Our experiments are conducted on a dataset that provides multiple subclasses from three categories, namely the Texture AD benchmark. Adapted-MoE significantly improves the performance of the baseline model, achieving a 2.18%-7.20% and 1.57%-16.30% increase in I-AUROC and P-AUROC, respectively, which outperforms current state-of-the-art methods. Our code is available at https://github.com/. + +&#13;
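As a rough illustration of the routing-plus-experts idea, the PyTorch sketch below routes each test feature to one expert and scores it against that expert's learned centre of normal features. The test-time adaption is sketched here as a simple running re-centring of test features, which is an assumption for illustration and not necessarily the paper's mechanism.

import torch
import torch.nn as nn

class AdaptedMoESketch(nn.Module):
    def __init__(self, feat_dim, n_experts):
        super().__init__()
        self.router = nn.Linear(feat_dim, n_experts)
        # Each expert keeps a centre of its normal-sample features.
        self.centres = nn.Parameter(torch.zeros(n_experts, feat_dim))
        self.register_buffer("test_shift", torch.zeros(feat_dim))

    def forward(self, feats):
        # feats: (N, D) representations of test samples.
        feats = feats - self.test_shift                   # test-time re-centring
        expert_idx = self.router(feats).argmax(dim=1)     # hard routing
        centres = self.centres[expert_idx]                # (N, D)
        # Anomaly score: distance to the routed expert's normal centre.
        return (feats - centres).norm(dim=1)

    @torch.no_grad()
    def update_test_shift(self, feats, momentum=0.9):
        # Running estimate of the shift between test features and training features.
        self.test_shift.mul_(momentum).add_(feats.mean(0), alpha=1 - momentum)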
+
+
+
+
+ + ☆ CustomContrast: A Multilevel Contrastive Perspective For Subject-Driven + Text-to-Image Customization + + +
+ Subject-driven text-to-image (T2I) customization has drawn significant interest in academia and industry. This task enables pre-trained models to generate novel images based on unique subjects. Existing studies adopt a self-reconstructive perspective, focusing on capturing all details of a single image, which misconstrues the specific image's irrelevant attributes (e.g., view, pose, and background) as the subject's intrinsic attributes. This misconstruction leads to overfitting or underfitting of the subject's irrelevant and intrinsic attributes, i.e., these attributes are simultaneously over- or under-represented, causing a trade-off between similarity and controllability. In this study, we argue that an ideal subject representation can be achieved by a cross-differential perspective, i.e., decoupling subject intrinsic attributes from irrelevant attributes via contrastive learning, which allows the model to focus more on intrinsic attributes through intra-consistency (features of the same subject are spatially closer) and inter-distinctiveness (features of different subjects have distinguished differences). Specifically, we propose CustomContrast, a novel framework, which includes a Multilevel Contrastive Learning (MCL) paradigm and a Multimodal Feature Injection (MFI) Encoder. The MCL paradigm is used to extract intrinsic features of subjects from high-level semantics to low-level appearance through crossmodal semantic contrastive learning and multiscale appearance contrastive learning. To facilitate contrastive learning, we introduce the MFI encoder to capture cross-modal representations. Extensive experiments show the effectiveness of CustomContrast in subject similarity and text controllability. + +&#13;
+
+
+
+
+ + ☆ SynMorph: Generating Synthetic Face Morphing Dataset with Mated Samples + + +
+ Face morphing attack detection (MAD) algorithms have become essential to overcome the vulnerability of face recognition systems. To address the lack of large-scale and publicly available datasets due to privacy concerns and restrictions, in this work we propose a new method to generate a synthetic face morphing dataset with 2450 identities and more than 100k morphs. The proposed synthetic face morphing dataset is unique for its high-quality samples, different types of morphing algorithms, and its generalization to both single and differential morphing attack detection algorithms. For experiments, we apply face image quality assessment and vulnerability analysis to evaluate the proposed synthetic face morphing dataset from the perspective of biometric sample quality and morphing attack potential on face recognition systems. The results are benchmarked against an existing SOTA synthetic dataset and a representative non-synthetic dataset, and indicate an improvement compared with the SOTA. Additionally, we design different protocols and study the applicability of using the proposed synthetic dataset for training morphing attack detection algorithms. + +&#13;
+
+
+
+
+ + ☆ DSDFormer: An Innovative Transformer-Mamba Framework for Robust + High-Precision Driver Distraction Identification + + +
+ Driver distraction remains a leading cause of traffic accidents, posing a +critical threat to road safety globally. As intelligent transportation systems +evolve, accurate and real-time identification of driver distraction has become +essential. However, existing methods struggle to capture both global contextual +and fine-grained local features while contending with noisy labels in training +datasets. To address these challenges, we propose DSDFormer, a novel framework +that integrates the strengths of Transformer and Mamba architectures through a +Dual State Domain Attention (DSDA) mechanism, enabling a balance between +long-range dependencies and detailed feature extraction for robust driver +behavior recognition. Additionally, we introduce Temporal Reasoning Confident +Learning (TRCL), an unsupervised approach that refines noisy labels by +leveraging spatiotemporal correlations in video sequences. Our model achieves +state-of-the-art performance on the AUC-V1, AUC-V2, and 100-Driver datasets and +demonstrates real-time processing efficiency on the NVIDIA Jetson AGX Orin +platform. Extensive experimental results confirm that DSDFormer and TRCL +significantly improve both the accuracy and robustness of driver distraction +detection, offering a scalable solution to enhance road safety. + +
+
+
+
+
+ + ☆ Latent 3D Brain MRI Counterfactual + + +
+ The number of samples in structural brain MRI studies is often too small to properly train deep learning models. Generative models show promise in addressing this issue by effectively learning the data distribution and generating high-fidelity MRI. However, they struggle to produce diverse, high-quality data outside the distribution defined by the training data. One way to address the issue is using causal models developed for 3D volume counterfactuals. However, accurately modeling causality in high-dimensional spaces is challenging, so these models generally generate 3D brain MRIs of lower quality. To address these challenges, we propose a two-stage method that constructs a Structural Causal Model (SCM) within the latent space. In the first stage, we employ a VQ-VAE to learn a compact embedding of the MRI volume. Subsequently, we integrate our causal model into this latent space and execute a three-step counterfactual procedure using a closed-form Generalized Linear Model (GLM). Our experiments conducted on real-world high-resolution MRI data (1mm) demonstrate that our method can generate high-quality 3D MRI counterfactuals. + +&#13;
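To make the latent-space counterfactual step concrete, here is a NumPy sketch of a closed-form linear model fitted from attributes to latent codes and used for abduction, action, and prediction. The VQ-VAE encoder/decoder and the paper's exact three-step procedure are abstracted away, so the plain least-squares fit and the variable names are illustrative assumptions.

import numpy as np

def fit_glm(latents, attrs):
    # latents: (N, D) latent codes; attrs: (N, A) attributes (e.g. age, diagnosis).
    W, *_ = np.linalg.lstsq(attrs, latents, rcond=None)   # (A, D) coefficient matrix
    return W

def counterfactual(z, a, a_cf, W):
    # Abduction: subject-specific residual not explained by the attributes.
    residual = z - a @ W
    # Action + prediction: apply the counterfactual attributes, keep the residual.
    return a_cf @ W + residual

# The counterfactual latent would then be decoded back to a 3D volume by the
# (assumed) VQ-VAE decoder: volume_cf = decoder(counterfactual(z, a, a_cf, W))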
+
+
+
+
+ + ☆ LEROjD: Lidar Extended Radar-Only Object Detection ECCV 2024 + + +
+ Accurate 3D object detection is vital for automated driving. While lidar +sensors are well suited for this task, they are expensive and have limitations +in adverse weather conditions. 3+1D imaging radar sensors offer a +cost-effective, robust alternative but face challenges due to their low +resolution and high measurement noise. Existing 3+1D imaging radar datasets +include radar and lidar data, enabling cross-modal model improvements. Although +lidar should not be used during inference, it can aid the training of +radar-only object detectors. We explore two strategies to transfer knowledge +from the lidar to the radar domain and radar-only object detectors: 1. +multi-stage training with sequential lidar point cloud thin-out, and 2. +cross-modal knowledge distillation. In the multi-stage process, three thin-out +methods are examined. Our results show significant performance gains of up to +4.2 percentage points in mean Average Precision with multi-stage training and +up to 3.9 percentage points with knowledge distillation by initializing the +student with the teacher's weights. The main benefit of these approaches is +their applicability to other 3D object detection networks without altering +their architecture, as we show by analyzing it on two different object +detectors. Our code is available at https://github.com/rst-tu-dortmund/lerojd + +
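The two transfer strategies mentioned above can be illustrated in a few lines of PyTorch: a feature-level distillation term added to the student's detection loss, and initialising the student with the teacher's weights. Matching intermediate feature maps with an MSE term and the weight lam are assumptions for illustration, not the paper's exact configuration.

import torch
import torch.nn.functional as F

def distillation_loss(student_feats, teacher_feats, student_task_loss, lam=1.0):
    # Match intermediate feature maps of the radar student to the frozen lidar teacher.
    kd = F.mse_loss(student_feats, teacher_feats.detach())
    return student_task_loss + lam * kd

def init_student_from_teacher(student, teacher):
    # Initialise matching parameters with the teacher's weights
    # (assumes the student mirrors the teacher's architecture).
    student.load_state_dict(teacher.state_dict(), strict=False)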
+
+ comment: Accepted for publication at ECCV 2024 +&#13;
+
+
+
+
+ + ☆ Seeing Through the Mask: Rethinking Adversarial Examples for CAPTCHAs + + +
+ Modern CAPTCHAs rely heavily on vision tasks that are supposedly hard for computers but easy for humans. However, advances in image recognition models pose a significant threat to such CAPTCHAs. These models can easily be fooled by generating some well-hidden "random" noise and adding it to the image, or by hiding objects in the image. However, these methods are model-specific and thus cannot help CAPTCHAs fool all models. We show in this work that by allowing for more significant changes to the images, while preserving the semantic information and keeping them solvable by humans, we can fool many state-of-the-art models. Specifically, we demonstrate that by adding masks of various intensities the Accuracy @ 1 (Acc@1) drops by more than 50 percentage points for all models, and supposedly robust models such as vision transformers see an Acc@1 drop of 80 percentage points. These masks can therefore effectively fool modern image classifiers, thus showing that machines have not caught up with humans -- yet. + +&#13;
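A minimal NumPy sketch of the kind of mask perturbation described above: blend a binary mask into the image at a chosen intensity while leaving the rest untouched. The masks used in the paper may be constructed very differently; this only illustrates the interface.

import numpy as np

def apply_mask(image, mask, intensity=0.5):
    # image: (H, W, 3) float array in [0, 1]; mask: (H, W) binary array.
    masked = image.copy()
    masked[mask.astype(bool)] *= (1.0 - intensity)   # darken masked pixels
    return np.clip(masked, 0.0, 1.0)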
+
+ comment: Under review +
+
+
+
+
+ + ☆ Seeing is Believing? Enhancing Vision-Language Navigation using Visual + Perturbations ICASSP 2025 + + +
+ Autonomous navigation for an embodied agent guided by natural language instructions remains a formidable challenge in vision-and-language navigation (VLN). Despite remarkable recent progress in learning fine-grained and multifarious visual representations, the tendency to overfit to the training environments leads to unsatisfactory generalization performance. In this work, we present a versatile Multi-Branch Architecture (MBA) aimed at exploring and exploiting diverse visual inputs. Specifically, we introduce three distinct visual variants: ground-truth depth images, visual inputs integrated with incongruent views, and those infused with random noise, to enrich the diversity of visual input representations and prevent overfitting to the original RGB observations. To adaptively fuse these varied inputs, the proposed MBA extends a base agent model into a multi-branch variant, where each branch processes a different visual input. Surprisingly, even random noise can further enhance navigation performance in unseen environments. Extensive experiments conducted on three VLN benchmarks (R2R, REVERIE, SOON) demonstrate that our proposed method equals or even surpasses state-of-the-art results. The source code will be publicly available. + +&#13;
+
+ comment: 5 pages, 2 figures, submitted to ICASSP 2025 +
+
+
+
+
+ + ☆ Exploring Rich Subjective Quality Information for Image Quality + Assessment in the Wild + + +
+ Traditional in-the-wild image quality assessment (IQA) models are generally trained with quality labels of mean opinion score (MOS), while missing the rich subjective quality information contained in the quality ratings, for example, the standard deviation of opinion scores (SOS) or even the distribution of opinion scores (DOS). In this paper, we propose a novel IQA method named RichIQA to explore the rich subjective rating information beyond MOS to predict image quality in the wild. RichIQA is characterized by two key novel designs: (1) a three-stage image quality prediction network which exploits the powerful feature representation capability of the Convolutional vision Transformer (CvT) and mimics the short-term and long-term memory mechanisms of the human brain; (2) a multi-label training strategy in which rich subjective quality information like MOS, SOS and DOS is concurrently used to train the quality prediction network. Powered by these two novel designs, RichIQA is able to predict image quality in terms of a distribution, from which the mean image quality can subsequently be obtained. Extensive experimental results verify that the three-stage network is tailored to predict rich quality information, while the multi-label training strategy can fully exploit the potential of subjective quality ratings and enhance the prediction performance and generalizability of the network. RichIQA outperforms state-of-the-art competitors on multiple large-scale in-the-wild IQA databases with rich subjective rating labels. The code of RichIQA will be made publicly available on GitHub. + +&#13;
+
+
+
+
+ + ☆ HMAFlow: Learning More Accurate Optical Flow via Hierarchical Motion + Field Alignment + + +
+ Optical flow estimation is a fundamental and long-standing visual task. In this work, we present a novel method, dubbed HMAFlow, to improve optical flow estimation in challenging scenes, especially those containing small objects. The proposed model mainly consists of two core components: a Hierarchical Motion Field Alignment (HMA) module and a Correlation Self-Attention (CSA) module. In addition, we rebuild 4D cost volumes by employing a Multi-Scale Correlation Search (MCS) layer and replacing average pooling in common cost volumes with a search strategy using multiple search ranges. Experimental results demonstrate that our model achieves the best generalization performance in comparison to other state-of-the-art methods. Specifically, compared with RAFT, our method achieves relative error reductions of 14.2% and 3.4% on the clean pass and final pass of the Sintel online benchmark, respectively. On the KITTI test benchmark, HMAFlow surpasses RAFT and GMA in the Fl-all metric by a relative margin of 6.8% and 7.7%, respectively. To facilitate future research, our code will be made available at https://github.com/BooTurbo/HMAFlow. + +&#13;
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ An Atmospheric Correction Integrated LULC Segmentation Model for + High-Resolution Satellite Imagery + + +
+ The integration of fine-scale multispectral imagery with deep learning models +has revolutionized land use and land cover (LULC) classification. However, the +atmospheric effects present in Top-of-Atmosphere sensor measured Digital Number +values must be corrected to retrieve accurate Bottom-of-Atmosphere surface +reflectance for reliable analysis. This study employs look-up-table-based +radiative transfer simulations to estimate the atmospheric path reflectance and +transmittance for atmospherically correcting high-resolution CARTOSAT-3 +Multispectral (MX) imagery for several Indian cities. The corrected surface +reflectance data were subsequently used in supervised and semi-supervised +segmentation models, demonstrating stability in multi-class (buildings, roads, +trees and water bodies) LULC segmentation accuracy, particularly in scenarios +with sparsely labelled data. + +
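For context, once the path reflectance, total transmittance, and spherical albedo for a band have been interpolated from the radiative-transfer look-up table, the TOA-to-BOA inversion reduces to a short formula. The NumPy sketch below shows the standard simplified form and leaves out the LUT construction itself, so the parameterization is an assumption for illustration rather than the study's full processing chain.

import numpy as np

def toa_to_boa(rho_toa, rho_path, transmittance, spherical_albedo=0.0):
    # Invert the simplified radiative-transfer relation
    #   rho_toa = rho_path + T * rho_boa / (1 - s * rho_boa)
    # for the surface (BOA) reflectance rho_boa.
    y = (rho_toa - rho_path) / transmittance
    return y / (1.0 + spherical_albedo * y)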
+
+
+
+
+ + ☆ A Taxonomy of Miscompressions: Preparing Image Forensics for Neural + Compression + + +
+ Neural compression has the potential to revolutionize lossy image +compression. Based on generative models, recent schemes achieve unprecedented +compression rates at high perceptual quality but compromise semantic fidelity. +Details of decompressed images may appear optically flawless but semantically +different from the originals, making compression errors difficult or impossible +to detect. We explore the problem space and propose a provisional taxonomy of +miscompressions. It defines three types of 'what happens' and has a binary +'high impact' flag indicating miscompressions that alter symbols. We discuss +how the taxonomy can facilitate risk communication and research into +mitigations. + +
+
+ comment: 6 pages, 6 figures +
+
+
+
+
+ + ☆ PVP-Recon: Progressive View Planning via Warping Consistency for + Sparse-View Surface Reconstruction + + +
+ Neural implicit representations have revolutionized dense multi-view surface +reconstruction, yet their performance significantly diminishes with sparse +input views. A few pioneering works have sought to tackle the challenge of +sparse-view reconstruction by leveraging additional geometric priors or +multi-scene generalizability. However, they are still hindered by the imperfect +choice of input views, using images under empirically determined viewpoints to +provide considerable overlap. We propose PVP-Recon, a novel and effective +sparse-view surface reconstruction method that progressively plans the next +best views to form an optimal set of sparse viewpoints for image capturing. +PVP-Recon starts initial surface reconstruction with as few as 3 views and +progressively adds new views which are determined based on a novel warping +score that reflects the information gain of each newly added view. This +progressive view planning progress is interleaved with a neural SDF-based +reconstruction module that utilizes multi-resolution hash features, enhanced by +a progressive training scheme and a directional Hessian loss. Quantitative and +qualitative experiments on three benchmark datasets show that our framework +achieves high-quality reconstruction with a constrained input budget and +outperforms existing baselines. + +
+
+
+
+
+ + ☆ Proto-OOD: Enhancing OOD Object Detection with Prototype Feature + Similarity + + +
+ The limited training samples available for object detectors commonly result in low accuracy for out-of-distribution (OOD) object detection. We have observed that feature vectors of the same class tend to cluster tightly in feature space, whereas those of different classes are more scattered. This insight motivates us to leverage feature similarity for OOD detection. Drawing on the concept of prototypes prevalent in few-shot learning, we introduce a novel network architecture, Proto-OOD, designed for this purpose. Proto-OOD enhances prototype representativeness through a contrastive loss and identifies OOD data by assessing the similarity between input features and prototypes. It employs a negative embedding generator to create negative embeddings, which are then used to train the similarity module. Proto-OOD achieves a significantly lower FPR95 on the MS-COCO dataset and a higher mAP on the Pascal VOC dataset when utilizing Pascal VOC as the ID dataset and MS-COCO as the OOD dataset. Additionally, we identify limitations in existing evaluation metrics and propose an enhanced evaluation protocol. + +&#13;
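The core scoring idea can be sketched in a few lines of PyTorch: compare a detection's feature vector to the class prototypes and flag low maximum similarity as OOD. The learned similarity module, the negative-embedding generator, and the thresholds from the paper are omitted, so this is an illustrative reduction rather than the proposed architecture.

import torch
import torch.nn.functional as F

def ood_score(feature, prototypes):
    # feature: (D,) pooled feature of one detection; prototypes: (K, D), one per ID class.
    sims = F.cosine_similarity(feature.unsqueeze(0), prototypes, dim=1)
    return 1.0 - sims.max()          # larger score = more likely out-of-distribution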
+
+ comment: 14 pages +&#13;
+
+
+
+
+ + ☆ DriveScape: Towards High-Resolution Controllable Multi-View Driving + Video Generation + + +
+ Recent advancements in generative models have provided promising solutions +for synthesizing realistic driving videos, which are crucial for training +autonomous driving perception models. However, existing approaches often +struggle with multi-view video generation due to the challenges of integrating +3D information while maintaining spatial-temporal consistency and effectively +learning from a unified model. In this paper, we propose an end-to-end +framework named DriveScape for multi-view, 3D condition-guided video +generation. DriveScape not only streamlines the process by integrating camera +data to ensure comprehensive spatial-temporal coverage, but also introduces a +Bi-Directional Modulated Transformer module to effectively align 3D road +structural information. As a result, our approach enables precise control over +video generation, significantly enhancing realism and providing a robust +solution for generating multi-view driving videos. Our framework achieves +state-of-the-art results on the nuScenes dataset, demonstrating impressive +generative quality metrics with an FID score of 8.34 and an FVD score of 76.39, +as well as superior performance across various perception tasks. This paves the +way for more accurate environmental simulations in autonomous driving. Code +will be available at our project homepage. + +
+
+
+
+
+ + ☆ EndoOmni: Zero-Shot Cross-Dataset Depth Estimation in Endoscopy by + Robust Self-Learning from Noisy Labels + + +
+ Single-image depth estimation is essential for endoscopy tasks such as localization, reconstruction, and augmented reality. Most existing methods in surgical scenes focus on in-domain depth estimation, limiting their real-world applicability. This constraint stems from the scarcity and inferior labeling quality of medical data for training. In this work, we present EndoOmni, the first foundation model for zero-shot cross-domain depth estimation for endoscopy. To harness the potential of diverse training data, we refine the advanced self-learning paradigm that employs a teacher model to generate pseudo-labels, guiding a student model trained on large-scale labeled and unlabeled data. To address training disturbance caused by inherent noise in depth labels, we propose a robust training framework that leverages both depth labels and estimated confidence from the teacher model to jointly guide the student model training. Moreover, we propose a weighted scale-and-shift invariant loss to adaptively adjust learning weights based on label confidence, thus imposing a learning bias towards cleaner label pixels while reducing the influence of highly noisy pixels. Experiments on zero-shot relative depth estimation show that our EndoOmni improves on state-of-the-art methods in medical imaging by 41% and on existing foundation models by 25% in terms of absolute relative error on specific datasets. Furthermore, our model provides strong initialization for fine-tuning to metric depth estimation, maintaining superior performance in both in-domain and out-of-domain scenarios. The source code will be publicly available. + +&#13;
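A plausible Python (PyTorch) sketch of a confidence-weighted scale-and-shift-invariant loss in the spirit described above: align the prediction to the (pseudo-)label with a least-squares scale and shift, then weight the per-pixel error by the teacher's confidence. The exact alignment and weighting used in EndoOmni may differ.

import torch

def weighted_ssi_loss(pred, target, conf):
    # pred, target, conf: (N,) flattened per-pixel tensors for one image.
    # Least-squares scale/shift aligning pred to target.
    A = torch.stack([pred, torch.ones_like(pred)], dim=1)      # (N, 2)
    sol = torch.linalg.lstsq(A, target.unsqueeze(1)).solution  # (2, 1): scale, shift
    aligned = A @ sol                                          # (N, 1)
    err = (aligned.squeeze(1) - target).abs()
    # Down-weight pixels whose pseudo-labels the teacher is unsure about.
    return (conf * err).sum() / conf.sum().clamp(min=1e-6)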
+
+
+
+
+ + ☆ TextToucher: Fine-Grained Text-to-Touch Generation + + +
+ Tactile sensation plays a crucial role in the development of multi-modal large models and embodied intelligence. To collect tactile data at as low a cost as possible, a series of studies have attempted to generate tactile images by vision-to-touch image translation. However, compared to the text modality, visual modality-driven tactile generation cannot accurately depict human tactile sensation. In this work, we analyze the characteristics of tactile images in detail from two granularities: object-level (tactile texture, tactile shape) and sensor-level (gel status). We model these granularities of information through text descriptions and propose a fine-grained Text-to-Touch generation method (TextToucher) to generate high-quality tactile samples. Specifically, we introduce a multimodal large language model to build the text sentences about object-level tactile information and employ a set of learnable text prompts to represent the sensor-level tactile information. To better guide the tactile generation process with the built text information, we fuse the dual grains of text information and explore various dual-grain text conditioning methods within the diffusion transformer architecture. Furthermore, we propose a Contrastive Text-Touch Pre-training (CTTP) metric to precisely evaluate the quality of text-driven generated tactile data. Extensive experiments demonstrate the superiority of our TextToucher method. The source codes will be available at https://github.com/TtuHamg/TextToucher. + +&#13;
+
+
+
+
+ + ☆ Distribution Discrepancy and Feature Heterogeneity for Active 3D Object + Detection + + +
+ LiDAR-based 3D object detection is a critical technology for the development +of autonomous driving and robotics. However, the high cost of data annotation +limits its advancement. We propose a novel and effective active learning (AL) +method called Distribution Discrepancy and Feature Heterogeneity (DDFH), which +simultaneously considers geometric features and model embeddings, assessing +information from both the instance-level and frame-level perspectives. +Distribution Discrepancy evaluates the difference and novelty of instances +within the unlabeled and labeled distributions, enabling the model to learn +efficiently with limited data. Feature Heterogeneity ensures the heterogeneity +of intra-frame instance features, maintaining feature diversity while avoiding +redundant or similar instances, thus minimizing annotation costs. Finally, +multiple indicators are efficiently aggregated using Quantile Transform, +providing a unified measure of informativeness. Extensive experiments +demonstrate that DDFH outperforms the current state-of-the-art (SOTA) methods +on the KITTI and Waymo datasets, effectively reducing the bounding box +annotation cost by 56.3% and showing robustness when working with both +one-stage and two-stage models. + +
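To illustrate the aggregation step, the sketch below (Python/scikit-learn) maps each informativeness criterion through a quantile transform so that differently scaled indicators become comparable, then averages them into a single score. The concrete indicators and the selection budget are placeholders, not the exact DDFH procedure.

import numpy as np
from sklearn.preprocessing import QuantileTransformer

def aggregate_scores(indicators):
    # indicators: (N, M) array, one column per criterion (e.g. distribution
    # discrepancy, feature heterogeneity); larger = more informative.
    qt = QuantileTransformer(output_distribution="uniform",
                             n_quantiles=min(1000, len(indicators)))
    ranked = qt.fit_transform(indicators)        # each column mapped to [0, 1]
    return ranked.mean(axis=1)                   # unified informativeness score

# Select the top-B frames/instances for annotation (budget B is assumed):
# chosen = np.argsort(-aggregate_scores(indicators))[:budget]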
+
+ comment: Accepted to CoRL 2024 +
+
+
+
+
+ + ☆ AD-Net: Attention-based dilated convolutional residual network with + guided decoder for robust skin lesion segmentation + + +
+ In computer-aided diagnosis tools employed for skin cancer treatment and +early diagnosis, skin lesion segmentation is important. However, achieving +precise segmentation is challenging due to inherent variations in appearance, +contrast, texture, and blurry lesion boundaries. This research presents a +robust approach utilizing a dilated convolutional residual network, which +incorporates an attention-based spatial feature enhancement block (ASFEB) and +employs a guided decoder strategy. In each dilated convolutional residual +block, dilated convolution is employed to broaden the receptive field with +varying dilation rates. To improve the spatial feature information of the +encoder, we employed an attention-based spatial feature enhancement block in +the skip connections. The ASFEB in our proposed method combines feature maps +obtained from average and maximum-pooling operations. These combined features +are then weighted using the active outcome of global average pooling and +convolution operations. Additionally, we have incorporated a guided decoder +strategy, where each decoder block is optimized using an individual loss +function to enhance the feature learning process in the proposed AD-Net. The +proposed AD-Net presents a significant benefit by necessitating fewer model +parameters compared to its peer methods. This reduction in parameters directly +impacts the number of labeled data required for training, facilitating faster +convergence during the training process. The effectiveness of the proposed +AD-Net was evaluated using four public benchmark datasets. We conducted a +Wilcoxon signed-rank test to verify the efficiency of the AD-Net. The outcomes +suggest that our method surpasses other cutting-edge methods in performance, +even without the implementation of data augmentation strategies. + +
+
+
+
+
+ + ☆ CipherDM: Secure Three-Party Inference for Diffusion Model Sampling + + +
+ Diffusion Models (DMs) achieve state-of-the-art synthesis results in image generation and have been applied to various fields. However, DMs sometimes seriously violate user privacy during usage, making the protection of privacy an urgent issue. Using traditional privacy computing schemes like Secure Multi-Party Computation (MPC) directly in DMs faces significant computation and communication challenges. To address these issues, we propose CipherDM, the first versatile and universal framework applying MPC technology to DMs for secure sampling, which can be widely implemented on multiple DM based tasks. We thoroughly analyze the sampling latency breakdown, find the time-consuming parts and design corresponding secure MPC protocols for computing nonlinear activations including SoftMax, SiLU and Mish. CipherDM is evaluated on popular architectures (DDPM, DDIM) using the MNIST dataset and on SD deployed by diffusers. Compared to direct implementation on SPU, our approach improves running time by approximately 1.084× to 2.328×, and reduces communication costs by approximately 1.212× to 1.791×. + +&#13;
+
+
+
+
+ + ☆ From Words to Poses: Enhancing Novel Object Pose Estimation with Vision + Language Models + + +
+ Robots are increasingly envisioned to interact in real-world scenarios, where they must continuously adapt to new situations. To detect and grasp novel objects, zero-shot pose estimators determine poses without prior knowledge. Recently, vision language models (VLMs) have shown considerable advances in robotics applications by establishing an understanding between language input and image input. In our work, we take advantage of VLMs' zero-shot capabilities and translate this ability to 6D object pose estimation. We propose a novel framework for promptable zero-shot 6D object pose estimation using language embeddings. The idea is to derive a coarse location of an object based on the relevancy map of a language-embedded NeRF reconstruction and to compute the pose estimate with a point cloud registration method. Additionally, we provide an analysis of LERF's suitability for open-set object pose estimation. We examine hyperparameters, such as activation thresholds for relevancy maps, and investigate the zero-shot capabilities on an instance- and category-level. Furthermore, we plan to conduct robotic grasping experiments in a real-world setting. + +&#13;
+
+
+
+
+ + ☆ KRONC: Keypoint-based Robust Camera Optimization for 3D Car + Reconstruction ECCV + + +
+ The three-dimensional representation of objects or scenes starting from a set +of images has been a widely discussed topic for years and has gained additional +attention after the diffusion of NeRF-based approaches. However, an +underestimated prerequisite is the knowledge of camera poses or, more +specifically, the estimation of the extrinsic calibration parameters. Although +excellent general-purpose Structure-from-Motion methods are available as a +pre-processing step, their computational load is high and they require a lot of +frames to guarantee sufficient overlapping among the views. This paper +introduces KRONC, a novel approach aimed at inferring view poses by leveraging +prior knowledge about the object to reconstruct and its representation through +semantic keypoints. With a focus on vehicle scenes, KRONC is able to estimate +the position of the views as a solution to a light optimization problem +targeting the convergence of keypoints' back-projections to a singular point. +To validate the method, a specific dataset of real-world car scenes has been +collected. Experiments confirm KRONC's ability to generate excellent estimates +of camera poses starting from very coarse initialization. Results are +comparable with Structure-from-Motion methods with huge savings in computation. +Code and data will be made publicly available. + +
+
+ comment: Accepted at ECCVW +
+
+
+
+
+ + ☆ A Survey of Multimodal Composite Editing and Retrieval + + +
+ In the real world, where information is abundant and diverse across different modalities, understanding and utilizing various data types to improve retrieval systems is a key focus of research. Multimodal composite retrieval integrates diverse modalities such as text, image and audio to provide more accurate, personalized, and contextually relevant results. To facilitate a deeper understanding of this promising direction, this survey explores multimodal composite editing and retrieval in depth, covering image-text composite editing, image-text composite retrieval, and other multimodal composite retrieval. In this survey, we systematically organize the application scenarios, methods, benchmarks, experiments, and future directions. Multimodal learning is a hot topic in the large-model era, and several surveys on multimodal learning and vision-language models with transformers have been published in the PAMI journal. To the best of our knowledge, this survey is the first comprehensive review of the literature on multimodal composite retrieval, and it is a timely complement to existing reviews on multimodal fusion. To help readers quickly track this field, we have built a project page for this survey, which can be found at https://github.com/fuxianghuang1/Multimodal-Composite-Editing-and-Retrieval. + +&#13;
+
+ comment: 22 pages, 3 figures, and 11 tables +
+
+
+
+
+ + ☆ Sequential Posterior Sampling with Diffusion Models + + +
+ Diffusion models have quickly risen in popularity for their ability to model complex distributions and perform effective posterior sampling. Unfortunately, the iterative nature of these generative models makes them computationally expensive and unsuitable for real-time sequential inverse problems such as ultrasound imaging. Considering the strong temporal structure across sequences of frames, we propose a novel approach that models the transition dynamics to improve the efficiency of sequential diffusion posterior sampling in conditional image synthesis. Through modeling sequence data using a video vision transformer (ViViT) transition model based on previous diffusion outputs, we can initialize the reverse diffusion trajectory at a lower noise scale, greatly reducing the number of iterations required for convergence. We demonstrate the effectiveness of our approach on a real-world dataset of high frame rate cardiac ultrasound images and show that it achieves the same performance as a full diffusion trajectory while accelerating inference 25×, enabling real-time posterior sampling. Furthermore, we show that the addition of a transition model improves the PSNR by up to 8% in cases with severe motion. Our method opens up new possibilities for real-time applications of diffusion models in imaging and other domains requiring real-time inference. + +&#13;
+
+ comment: 5 pages, 4 figures, preprint +
+
+
+
+
+ + ☆ FacialFlowNet: Advancing Facial Optical Flow Estimation with a Diverse + Dataset and a Decomposed Model + + +
+ Facial movements play a crucial role in conveying attitude and intentions, and facial optical flow provides a dynamic and detailed representation of them. However, the scarcity of datasets and of a modern baseline hinders progress in facial optical flow research. This paper proposes FacialFlowNet (FFN), a novel large-scale facial optical flow dataset, and the Decomposed Facial Flow Model (DecFlow), the first method capable of decomposing facial flow. FFN comprises 9,635 identities and 105,970 image pairs, offering unprecedented diversity for detailed facial and head motion analysis. DecFlow features a facial semantic-aware encoder and a decomposed flow decoder, excelling in accurately estimating and decomposing facial flow into head and expression components. Comprehensive experiments demonstrate that FFN significantly enhances the accuracy of facial flow estimation across various optical flow methods, achieving up to an 11% reduction in Endpoint Error (EPE) (from 3.91 to 3.48). Moreover, DecFlow, when coupled with FFN, outperforms existing methods in both synthetic and real-world scenarios, enhancing facial expression analysis. The decomposed expression flow achieves a substantial accuracy improvement of 18% (from 69.1% to 82.1%) in micro-expression recognition. These contributions represent a significant advancement in facial motion analysis and optical flow estimation. Codes and datasets can be found. + +&#13;
+
+ comment: ACMMM2024 +
+
+
+
+
+ + ☆ Shaking Up VLMs: Comparing Transformers and Structured State Space + Models for Vision & Language Modeling + + +
+ This study explores replacing Transformers in Visual Language Models (VLMs)
+ with Mamba, a recent structured state space model (SSM) that demonstrates
+ promising performance in sequence modeling. We test models up to 3B parameters
+ under controlled conditions, showing that Mamba-based VLMs outperform
+ Transformer-based VLMs in captioning, question answering, and reading
+ comprehension. However, we find that Transformers achieve better performance in
+ visual grounding, and that the performance gap widens with scale. We explore two
+ hypotheses to explain this phenomenon: 1) the effect of task-agnostic visual
+ encoding on the updates of the hidden states, and 2) the difficulty of
+ performing visual grounding from the perspective of in-context multimodal
+ retrieval. Our results indicate that a task-aware encoding yields minimal
+ performance gains on grounding; however, Transformers significantly outperform
+ Mamba at in-context multimodal retrieval. Overall, Mamba shows promising
+ performance on tasks where the correct output relies on a summary of the image,
+ but struggles when retrieval of explicit information from the context is
+ required.
+
+
+
+
+
+
+ + ☆ TAVP: Task-Adaptive Visual Prompt for Cross-domain Few-shot Segmentation + + +
+ Against the backdrop of large-scale pre-training, large visual models (LVMs)
+ have demonstrated significant potential in image understanding. The recent
+ emergence of the Segment Anything Model (SAM) has brought a qualitative shift
+ to the field of image segmentation, supporting flexible interactive cues and
+ strong learning capabilities. However, its performance often falls short in
+ cross-domain and few-shot applications. Transferring prior knowledge from
+ foundation models to new applications while preserving learning capabilities is
+ worth exploring. This work proposes a task-adaptive prompt framework based on
+ SAM, a new paradigm for cross-domain few-shot segmentation (CD-FSS). First, a
+ Multi-level Feature Fusion (MFF) module is used for integrated feature
+ extraction. In addition, a Class Domain Task-Adaptive Auto-Prompt (CDTAP) module
+ is combined with the segmentation branch for class- and domain-agnostic feature
+ extraction and high-quality learnable prompt production. This significant
+ advancement uses a unique generative approach to prompts alongside a
+ comprehensive model structure and specialized prototype computation. While
+ ensuring that the prior knowledge of SAM is not discarded, the new branch
+ disentangles category and domain information through prototypes, guiding
+ adaptation to the CD-FSS task. We achieve the best results on three benchmarks
+ compared with recent state-of-the-art (SOTA) methods. Comprehensive experiments
+ show that, with task-specific and weighted guidance, the abundant feature
+ information of SAM can be better learned for CD-FSS.
+
+
+
+
+
+
+ + ☆ A Novel Representation of Periodic Pattern and Its Application to + Untrained Anomaly Detection + + +
+ There are a variety of industrial products that possess periodic textures or +surfaces, such as carbon fiber textiles and display panels. Traditional +image-based quality inspection methods for these products require identifying +the periodic patterns from normal images (without anomaly and noise) and +subsequently detecting anomaly pixels with inconsistent appearances. However, +it remains challenging to accurately extract the periodic pattern from a single +image in the presence of unknown anomalies and measurement noise. To deal with +this challenge, this paper proposes a novel self-representation of the periodic +image defined on a set of continuous parameters. In this way, periodic pattern +learning can be embedded into a joint optimization framework, which is named +periodic-sparse decomposition, with simultaneously modeling the sparse +anomalies and Gaussian noise. Finally, for the real-world industrial images +that may not strictly satisfy the periodic assumption, we propose a novel +pixel-level anomaly scoring strategy to enhance the performance of anomaly +detection. Both simulated and real-world case studies demonstrate the +effectiveness of the proposed methodology for periodic pattern learning and +anomaly detection. + +
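+ As a rough illustration of such a decomposition, the toy alternating scheme
+ below splits an image into a periodic pattern and sparse anomalies; the fixed
+ integer period, the soft-thresholding step, and the parameter lam are
+ simplifying assumptions, whereas the paper learns the periodic pattern over
+ continuous parameters:
+
+ import numpy as np
+
+ def periodic_sparse_decompose(img, period, lam=0.1, iters=20):
+     """Split img (H, W) into a periodic pattern plus sparse anomalies,
+     under an anomaly-plus-Gaussian-noise observation model."""
+     H, W = img.shape
+     ph, pw = period                    # assumed to divide H and W exactly
+     sparse = np.zeros_like(img)
+     for _ in range(iters):
+         resid = img - sparse
+         # Periodic component: average the residual over all period tiles.
+         tile = resid.reshape(H // ph, ph, W // pw, pw).mean(axis=(0, 2))
+         periodic = np.tile(tile, (H // ph, W // pw))
+         # Sparse anomalies: soft-threshold what the periodic model misses.
+         r = img - periodic
+         sparse = np.sign(r) * np.maximum(np.abs(r) - lam, 0.0)
+     return periodic, sparse
+
+ Pixel-level anomaly scores can then be read off from the magnitude of the
+ sparse component.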
+
+
+
+
+ + ☆ Decoupling Contact for Fine-Grained Motion Style Transfer + + +
+ Motion style transfer changes the style of a motion while retaining its +content and is useful in computer animations and games. Contact is an essential +component of motion style transfer that should be controlled explicitly in +order to express the style vividly while enhancing motion naturalness and +quality. However, it is unknown how to decouple and control contact to achieve +fine-grained control in motion style transfer. In this paper, we present a +novel style transfer method for fine-grained control over contacts while +achieving both motion naturalness and spatial-temporal variations of style. +Based on our empirical evidence, we propose controlling contact indirectly +through the hip velocity, which can be further decomposed into the trajectory +and contact timing, respectively. To this end, we propose a new model that +explicitly models the correlations between motions and trajectory/contact +timing/style, allowing us to decouple and control each separately. Our approach +is built around a motion manifold, where hip controls can be easily integrated +into a Transformer-based decoder. It is versatile in that it can generate +motions directly as well as be used as post-processing for existing methods to +improve quality and contact controllability. In addition, we propose a new +metric that measures a correlation pattern of motions based on our empirical +evidence, aligning well with human perception in terms of motion naturalness. +Based on extensive evaluation, our method outperforms existing methods in terms +of style expressivity and motion quality. + +
+
+
+
+
+ + ☆ Look One and More: Distilling Hybrid Order Relational Knowledge for + Cross-Resolution Image Recognition AAAI 2020 + + +
+ Despite the great success achieved by recent deep models on many image
+ recognition tasks, directly applying them to recognize low-resolution images
+ may suffer from low accuracy due to the loss of informative details during
+ resolution degradation. However, these images are still recognizable to
+ subjects who are familiar with the corresponding high-resolution ones. Inspired
+ by that, we propose a teacher-student learning approach to facilitate
+ low-resolution image recognition via hybrid order relational knowledge
+ distillation. The approach comprises three streams: the teacher stream is
+ pretrained to recognize high-resolution images with high accuracy, the student
+ stream learns to identify low-resolution images by mimicking the teacher's
+ behaviors, and an extra assistant stream is introduced as a bridge to help
+ transfer knowledge from the teacher to the student. To extract sufficient
+ knowledge for reducing the loss in accuracy, the learning of the student is
+ supervised with multiple losses, which preserve the similarities in relational
+ structures of various orders. In this way, the capability of recovering missing
+ details of familiar low-resolution images can be effectively enhanced, leading
+ to better knowledge transfer. Extensive experiments on metric learning,
+ low-resolution image classification, and low-resolution face recognition tasks
+ show the effectiveness of our approach, while using smaller models.
+
+
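+ To make "similarities in relational structures of various orders" concrete,
+ the sketch below pairs an individual embedding-matching term with a pairwise
+ (batch-level) similarity term; these specific losses are generic stand-ins,
+ not the paper's exact formulation:
+
+ import torch
+ import torch.nn.functional as F
+
+ def relational_kd_losses(t_feat, s_feat):
+     """t_feat, s_feat: (B, D) embeddings from teacher (HR) and student (LR)."""
+     # Individual term: match each sample's normalized embedding directly.
+     l_ind = F.mse_loss(F.normalize(s_feat, dim=1), F.normalize(t_feat, dim=1))
+
+     # Pairwise term: match cosine-similarity matrices so the student preserves
+     # the relational structure among samples in a batch.
+     t_sim = F.normalize(t_feat, dim=1) @ F.normalize(t_feat, dim=1).T
+     s_sim = F.normalize(s_feat, dim=1) @ F.normalize(s_feat, dim=1).T
+     l_pair = F.mse_loss(s_sim, t_sim)
+     return l_ind, l_pair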
+
+ comment: Accepted by AAAI 2020 +
+
+
+
+
+ + ☆ Deep Learning for Video Anomaly Detection: A Review + + +
+ Video anomaly detection (VAD) aims to discover behaviors or events deviating +from the normality in videos. As a long-standing task in the field of computer +vision, VAD has witnessed much good progress. In the era of deep learning, with +the explosion of architectures of continuously growing capability and capacity, +a great variety of deep learning based methods are constantly emerging for the +VAD task, greatly improving the generalization ability of detection algorithms +and broadening the application scenarios. Therefore, such a multitude of +methods and a large body of literature make a comprehensive survey a pressing +necessity. In this paper, we present an extensive and comprehensive research +review, covering the spectrum of five different categories, namely, +semi-supervised, weakly supervised, fully supervised, unsupervised and open-set +supervised VAD, and we also delve into the latest VAD works based on +pre-trained large models, remedying the limitations of past reviews in terms of +only focusing on semi-supervised VAD and small model based methods. For the VAD +task with different levels of supervision, we construct a well-organized +taxonomy, profoundly discuss the characteristics of different types of methods, +and show their performance comparisons. In addition, this review involves the +public datasets, open-source codes, and evaluation metrics covering all the +aforementioned VAD tasks. Finally, we provide several important research +directions for the VAD community. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Boosting CLIP Adaptation for Image Quality Assessment via Meta-Prompt + Learning and Gradient Regularization + + +
+ Image Quality Assessment (IQA) remains an unresolved challenge in the field +of computer vision, due to complex distortion conditions, diverse image +content, and limited data availability. The existing Blind IQA (BIQA) methods +heavily rely on extensive human annotations to train models, which is both +labor-intensive and costly due to the demanding nature of creating IQA +datasets. To mitigate the dependence on labeled samples, this paper introduces +a novel Gradient-Regulated Meta-Prompt IQA Framework (GRMP-IQA). This framework +aims to fast adapt the powerful visual-language pre-trained model, CLIP, to +downstream IQA tasks, significantly improving accuracy in scenarios with +limited data. Specifically, the GRMP-IQA comprises two key modules: Meta-Prompt +Pre-training Module and Quality-Aware Gradient Regularization. The Meta Prompt +Pre-training Module leverages a meta-learning paradigm to pre-train soft +prompts with shared meta-knowledge across different distortions, enabling rapid +adaptation to various IQA tasks. On the other hand, the Quality-Aware Gradient +Regularization is designed to adjust the update gradients during fine-tuning, +focusing the model's attention on quality-relevant features and preventing +overfitting to semantic information. Extensive experiments on five standard +BIQA datasets demonstrate the superior performance to the state-of-the-art BIQA +methods under limited data setting, i.e., achieving SRCC values of 0.836 (vs. +0.760 on LIVEC) and 0.853 (vs. 0.812 on KonIQ). Notably, utilizing just 20\% of +the training data, our GRMP-IQA outperforms most existing fully supervised BIQA +methods. + +
+
+
+
+
+ + ☆ Prim2Room: Layout-Controllable Room Mesh Generation from Primitives + + +
+ We propose Prim2Room, a novel framework for controllable room mesh generation +leveraging 2D layout conditions and 3D primitive retrieval to facilitate +precise 3D layout specification. Diverging from existing methods that lack +control and precision, our approach allows for detailed customization of +room-scale environments. To overcome the limitations of previous methods, we +introduce an adaptive viewpoint selection algorithm that allows the system to +generate the furniture texture and geometry from more favorable views than +predefined camera trajectories. Additionally, we employ non-rigid depth +registration to ensure alignment between generated objects and their +corresponding primitive while allowing for shape variations to maintain +diversity. Our method not only enhances the accuracy and aesthetic appeal of +generated 3D scenes but also provides a user-friendly platform for detailed +room design. + +
+
+
+
+
+ + ☆ PersonaTalk: Bring Attention to Your Persona in Visual Dubbing SIGGRAPH + + +
+ For audio-driven visual dubbing, it remains a considerable challenge to +uphold and highlight speaker's persona while synthesizing accurate lip +synchronization. Existing methods fall short of capturing speaker's unique +speaking style or preserving facial details. In this paper, we present +PersonaTalk, an attention-based two-stage framework, including geometry +construction and face rendering, for high-fidelity and personalized visual +dubbing. In the first stage, we propose a style-aware audio encoding module +that injects speaking style into audio features through a cross-attention +layer. The stylized audio features are then used to drive speaker's template +geometry to obtain lip-synced geometries. In the second stage, a dual-attention +face renderer is introduced to render textures for the target geometries. It +consists of two parallel cross-attention layers, namely Lip-Attention and +Face-Attention, which respectively sample textures from different reference +frames to render the entire face. With our innovative design, intricate facial +details can be well preserved. Comprehensive experiments and user studies +demonstrate our advantages over other state-of-the-art methods in terms of +visual quality, lip-sync accuracy and persona preservation. Furthermore, as a +person-generic framework, PersonaTalk can achieve competitive performance as +state-of-the-art person-specific methods. Project Page: +https://grisoon.github.io/PersonaTalk/. + +
+
+ comment: Accepted at SIGGRAPH Asia 2024 (Conference Track) +
+
+
+
+
+ + ☆ Memoryless Multimodal Anomaly Detection via Student-Teacher Network and + Signed Distance Learning + + +
+ Unsupervised anomaly detection is a challenging computer vision task, for which
+ 2D-based anomaly detection methods have been extensively studied. However,
+ multimodal anomaly detection based on RGB images and 3D point clouds requires
+ further investigation. Existing methods are mainly inspired by the
+ memory-bank-based methods commonly used in 2D anomaly detection, which may cost
+ extra memory for storing multimodal features. In the present study, a novel
+ memoryless method, MDSS, is proposed for multimodal anomaly detection. It
+ employs a lightweight student-teacher network and a signed distance function to
+ learn from RGB images and 3D point clouds, respectively, and complements the
+ anomaly information from the two modalities. Specifically, a student-teacher
+ network is trained with normal RGB images and masks generated from point clouds
+ by a dynamic loss, and the anomaly score map can be obtained from the
+ discrepancy between the outputs of the student and the teacher. Furthermore,
+ the signed distance function learns from normal point clouds to predict the
+ signed distances between points and the surface, and the obtained signed
+ distances are used to generate an anomaly score map. Subsequently, the anomaly
+ score maps are aligned to generate the final anomaly score map for detection.
+ The experimental results indicate that MDSS is comparable to but more stable
+ than the SOTA memory-bank-based method Shape-guided, and furthermore performs
+ better than other baseline methods.
+
+
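+ The final fusion of the two modality-specific score maps can be pictured as
+ follows; the min-max alignment and simple averaging are assumptions for
+ illustration rather than the exact alignment used by MDSS:
+
+ import numpy as np
+
+ def fuse_anomaly_maps(st_discrepancy, signed_distance, eps=1e-8):
+     """Both inputs are (H, W) score maps; higher means more anomalous."""
+     def minmax(x):
+         return (x - x.min()) / (x.max() - x.min() + eps)
+     rgb_score = minmax(st_discrepancy)            # student-teacher discrepancy
+     geo_score = minmax(np.abs(signed_distance))   # distance to normal surface
+     return 0.5 * (rgb_score + geo_score)          # aligned average score map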
+
+ comment: 14 pages, 4 figures, 2 tables, to be published in PRCV-2024 +
+
+
+
+
+ + ☆ KARGEN: Knowledge-enhanced Automated Radiology Report Generation Using + Large Language Models + + +
+ Harnessing the robust capabilities of Large Language Models (LLMs) for +narrative generation, logical reasoning, and common-sense knowledge +integration, this study delves into utilizing LLMs to enhance automated +radiology report generation (R2Gen). Despite the wealth of knowledge within +LLMs, efficiently triggering relevant knowledge within these large models for +specific tasks like R2Gen poses a critical research challenge. This paper +presents KARGEN, a Knowledge-enhanced Automated radiology Report GENeration +framework based on LLMs. Utilizing a frozen LLM to generate reports, the +framework integrates a knowledge graph to unlock chest disease-related +knowledge within the LLM to enhance the clinical utility of generated reports. +This is achieved by leveraging the knowledge graph to distill disease-related +features in a designed way. Since a radiology report encompasses both normal +and disease-related findings, the extracted graph-enhanced disease-related +features are integrated with regional image features, attending to both +aspects. We explore two fusion methods to automatically prioritize and select +the most relevant features. The fused features are employed by LLM to generate +reports that are more sensitive to diseases and of improved quality. Our +approach demonstrates promising results on the MIMIC-CXR and IU-Xray datasets. + +
+
+
+
+
+ + ☆ FedBrain-Distill: Communication-Efficient Federated Brain Tumor + Classification Using Ensemble Knowledge Distillation on Non-IID Data + + +
+ The brain is one of the most complex organs in the human body. Due to its
+ complexity, the classification of brain tumors still poses a significant
+ challenge, making brain tumors a particularly serious medical issue. Techniques
+ such as Machine Learning (ML) coupled with Magnetic Resonance Imaging (MRI)
+ have paved the way for doctors and medical institutions to classify different
+ types of tumors. However, these techniques suffer from limitations that violate
+ patients' privacy. Federated Learning (FL) has recently been introduced to
+ solve such an issue, but FL itself suffers from limitations such as
+ communication costs and dependencies on model architecture, forcing all models
+ to have identical architectures. In this paper, we propose FedBrain-Distill, an
+ approach that leverages Knowledge Distillation (KD) in an FL setting, maintains
+ user privacy, and ensures the independence of FL clients in terms of model
+ architecture. FedBrain-Distill uses an ensemble of teachers that distill their
+ knowledge to a simple student model. The evaluation of FedBrain-Distill
+ demonstrated high-accuracy results for both Independent and Identically
+ Distributed (IID) and non-IID data with substantially lower communication costs
+ on the real-world Figshare brain tumor dataset. It is worth mentioning that we
+ used a Dirichlet distribution to partition the data into IID and non-IID
+ splits. All the implementation details are accessible through our GitHub
+ repository.
+
+
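+ The ensemble-to-student distillation step can be illustrated with a standard
+ soft-label KD objective; the temperature and the plain averaging of teacher
+ predictions are generic assumptions, not the authors' exact choices:
+
+ import torch
+ import torch.nn.functional as F
+
+ def ensemble_distillation_loss(student_logits, teacher_logits_list, T=3.0):
+     """teacher_logits_list: list of (B, C) logits from the teacher ensemble."""
+     with torch.no_grad():
+         # Average the teachers' softened predictions into one soft target.
+         soft_targets = torch.stack(
+             [F.softmax(t / T, dim=1) for t in teacher_logits_list]).mean(0)
+     log_probs = F.log_softmax(student_logits / T, dim=1)
+     # Standard KD scaling by T^2 keeps gradient magnitudes comparable.
+     return F.kl_div(log_probs, soft_targets, reduction="batchmean") * T * T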
+
+
+
+
+ + ☆ Driving with Prior Maps: Unified Vector Prior Encoding for Autonomous + Vehicle Mapping + + +
+ High-Definition Maps (HD maps) are essential for the precise navigation and
+ decision-making of autonomous vehicles, yet their creation and upkeep present
+ significant cost and timeliness challenges. The online construction of HD maps
+ using on-board sensors has emerged as a promising solution; however, these
+ methods can be impeded by incomplete data due to occlusions and inclement
+ weather. This paper proposes the PriorDrive framework to address these
+ limitations by harnessing the power of prior maps, significantly enhancing the
+ robustness and accuracy of online HD map construction. Our approach integrates
+ a variety of prior maps, such as OpenStreetMap's Standard Definition Maps (SD
+ maps), outdated HD maps from vendors, and locally constructed maps from
+ historical vehicle data. To effectively encode this prior information into
+ online mapping models, we introduce a Hybrid Prior Representation (HPQuery)
+ that standardizes the representation of diverse map elements. At the core of
+ PriorDrive is the Unified Vector Encoder (UVE), which employs a dual encoding
+ mechanism to process vector data. The intra-vector encoder captures
+ fine-grained local features, while the inter-vector encoder integrates global
+ context. Furthermore, we propose a segment-level and point-level pre-training
+ strategy that enables the UVE to learn the prior distribution of vector data,
+ thereby improving the encoder's generalizability and performance. Through
+ extensive testing on the nuScenes dataset, we demonstrate that PriorDrive is
+ highly compatible with various online mapping models and substantially improves
+ map prediction capabilities. The integration of prior maps through the
+ PriorDrive framework offers a robust solution to the challenges of
+ single-perception data, paving the way for more reliable autonomous vehicle
+ navigation.
+
+
+
+
+
+
+ + ☆ Early-exit Convolutional Neural Networks + + +
+ This paper is aimed at developing a method that reduces the computational +cost of convolutional neural networks (CNN) during inference. Conventionally, +the input data pass through a fixed neural network architecture. However, easy +examples can be classified at early stages of processing and conventional +networks do not take this into account. In this paper, we introduce 'Early-exit +CNNs', EENets for short, which adapt their computational cost based on the +input by stopping the inference process at certain exit locations. In EENets, +there are a number of exit blocks each of which consists of a confidence branch +and a softmax branch. The confidence branch computes the confidence score of +exiting (i.e. stopping the inference process) at that location; while the +softmax branch outputs a classification probability vector. Both branches are +learnable and their parameters are separate. During training of EENets, in +addition to the classical classification loss, the computational cost of +inference is taken into account as well. As a result, the network adapts its +many confidence branches to the inputs so that less computation is spent for +easy examples. Inference works as in conventional feed-forward networks, +however, when the output of a confidence branch is larger than a certain +threshold, the inference stops for that specific example. The idea of EENets is +applicable to available CNN architectures such as ResNets. Through +comprehensive experiments on MNIST, SVHN, CIFAR10 and Tiny-ImageNet datasets, +we show that early-exit (EE) ResNets achieve similar accuracy with their non-EE +versions while reducing the computational cost to 20% of the original. Code is +available at https://github.com/eksuas/eenets.pytorch + +
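+ For intuition, the exit rule can be written as a short inference loop; the
+ stage/exit-block names, the batch-of-one assumption, and the 0.5 threshold are
+ placeholders rather than details taken from the released code:
+
+ import torch
+
+ @torch.no_grad()
+ def early_exit_inference(x, stages, exit_blocks, threshold=0.5):
+     """Run a single example (batch size 1) through backbone `stages`,
+     stopping at the first exit block that is confident enough."""
+     logits = None
+     for stage, exit_block in zip(stages, exit_blocks):
+         x = stage(x)
+         confidence, logits = exit_block(x)   # confidence in [0, 1], logits (1, C)
+         if confidence.item() >= threshold:   # easy input: stop computing here
+             return logits
+     return logits                            # hard input: used the full network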
+
+
+
+
+ + ☆ A Multi-Modal Deep Learning Based Approach for House Price Prediction + + +
+ Accurate prediction of house price, a vital aspect of the residential real +estate sector, is of substantial interest for a wide range of stakeholders. +However, predicting house prices is a complex task due to the significant +variability influenced by factors such as house features, location, +neighborhood, and many others. Despite numerous attempts utilizing a wide array +of algorithms, including recent deep learning techniques, to predict house +prices accurately, existing approaches have fallen short of considering a wide +range of factors such as textual and visual features. This paper addresses this +gap by comprehensively incorporating attributes, such as features, textual +descriptions, geo-spatial neighborhood, and house images, typically showcased +in real estate listings in a house price prediction system. Specifically, we +propose a multi-modal deep learning approach that leverages different types of +data to learn more accurate representation of the house. In particular, we +learn a joint embedding of raw house attributes, geo-spatial neighborhood, and +most importantly from textual description and images representing the house; +and finally use a downstream regression model to predict the house price from +this jointly learned embedding vector. Our experimental results with a +real-world dataset show that the text embedding of the house advertisement +description and image embedding of the house pictures in addition to raw +attributes and geo-spatial embedding, can significantly improve the house price +prediction accuracy. The relevant source code and dataset are publicly +accessible at the following URL: https://github.com/4P0N/mhpp + +
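+ A bare-bones version of the described joint embedding followed by a downstream
+ regressor might look like the following; the dimensions, projection heads, and
+ simple concatenation fusion are assumptions for illustration, not the paper's
+ exact architecture:
+
+ import torch
+ import torch.nn as nn
+
+ class MultiModalHousePriceModel(nn.Module):
+     def __init__(self, n_raw_features, text_dim, image_dim, geo_dim, d=128):
+         super().__init__()
+         self.raw_mlp = nn.Sequential(nn.Linear(n_raw_features, d), nn.ReLU())
+         self.text_proj = nn.Linear(text_dim, d)    # description embedding
+         self.image_proj = nn.Linear(image_dim, d)  # house-image embedding
+         self.geo_proj = nn.Linear(geo_dim, d)      # geo-spatial neighborhood
+         self.regressor = nn.Sequential(
+             nn.Linear(4 * d, d), nn.ReLU(), nn.Linear(d, 1))
+
+     def forward(self, raw, text_emb, image_emb, geo_emb):
+         joint = torch.cat([self.raw_mlp(raw), self.text_proj(text_emb),
+                            self.image_proj(image_emb), self.geo_proj(geo_emb)],
+                           dim=1)                   # jointly learned embedding
+         return self.regressor(joint).squeeze(-1)   # predicted price
+
+ Pre-computed text and image embeddings from off-the-shelf encoders would be
+ passed in as text_emb and image_emb.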
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ Lagrangian Hashing for Compressed Neural Field Representations + + +
+ We present Lagrangian Hashing, a representation for neural fields combining +the characteristics of fast training NeRF methods that rely on Eulerian grids +(i.e.~InstantNGP), with those that employ points equipped with features as a +way to represent information (e.g. 3D Gaussian Splatting or PointNeRF). We +achieve this by incorporating a point-based representation into the +high-resolution layers of the hierarchical hash tables of an InstantNGP +representation. As our points are equipped with a field of influence, our +representation can be interpreted as a mixture of Gaussians stored within the +hash table. We propose a loss that encourages the movement of our Gaussians +towards regions that require more representation budget to be sufficiently well +represented. Our main finding is that our representation allows the +reconstruction of signals using a more compact representation without +compromising quality. + +
+
+ comment: Project page: https://theialab.github.io/laghashes/ +
+
+
+
+
+ + ☆ KAN-Based Fusion of Dual-Domain for Audio-Driven Facial Landmarks + Generation + + +
+ Audio-driven talking face generation is a widely researched topic due to its
+ high applicability. Reconstructing a talking face using audio significantly
+ contributes to fields such as education, healthcare, online conversations,
+ virtual assistants, and virtual reality. Early studies often focused solely on
+ changing the mouth movements, which resulted in outcomes with limited practical
+ applications. Recently, researchers have proposed a new approach that
+ constructs the entire face, including the face pose, neck, and shoulders. To
+ achieve this, they first generate facial landmarks as an intermediate
+ representation. However, creating stable landmarks that align well with the
+ audio is a challenge. In this paper, we propose the KFusion of Dual-Domain
+ model, a robust model that generates landmarks from audio. We separate the
+ audio into two distinct domains to learn emotional information and facial
+ context, and then fuse them using a mechanism based on the KAN model. Our model
+ demonstrates high efficiency compared with recent models. This lays the
+ groundwork for the future development of audio-driven talking face generation.
+
+
+
+
+
+
+ + ☆ ICPR 2024 Competition on Safe Segmentation of Drive Scenes in + Unstructured Traffic and Adverse Weather Conditions ICPR + + +
+ The ICPR 2024 Competition on Safe Segmentation of Drive Scenes in +Unstructured Traffic and Adverse Weather Conditions served as a rigorous +platform to evaluate and benchmark state-of-the-art semantic segmentation +models under challenging conditions for autonomous driving. Over several +months, participants were provided with the IDD-AW dataset, consisting of 5000 +high-quality RGB-NIR image pairs, each annotated at the pixel level and +captured under adverse weather conditions such as rain, fog, low light, and +snow. A key aspect of the competition was the use and improvement of the Safe +mean Intersection over Union (Safe mIoU) metric, designed to penalize unsafe +incorrect predictions that could be overlooked by traditional mIoU. This +innovative metric emphasized the importance of safety in developing autonomous +driving systems. The competition showed significant advancements in the field, +with participants demonstrating models that excelled in semantic segmentation +and prioritized safety and robustness in unstructured and adverse conditions. +The results of the competition set new benchmarks in the domain, highlighting +the critical role of safety in deploying autonomous vehicles in real-world +scenarios. The contributions from this competition are expected to drive +further innovation in autonomous driving technology, addressing the critical +challenges of operating in diverse and unpredictable environments. + +
+
+ comment: 15 pages, 7 figures, ICPR Competition Paper +
+
+
+
+
+ + ☆ FIF-UNet: An Efficient UNet Using Feature Interaction and Fusion for + Medical Image Segmentation + + +
+ Nowadays, pre-trained encoders are widely used in medical image segmentation +because of their ability to capture complex feature representations. However, +the existing models fail to effectively utilize the rich features obtained by +the pre-trained encoder, resulting in suboptimal segmentation results. In this +work, a novel U-shaped model, called FIF-UNet, is proposed to address the above +issue, including three plug-and-play modules. A channel spatial interaction +module (CSI) is proposed to obtain informative features by establishing the +interaction between encoder stages and corresponding decoder stages. A cascaded +conv-SE module (CoSE) is designed to enhance the representation of critical +features by adaptively assigning importance weights on different feature +channels. A multi-level fusion module (MLF) is proposed to fuse the multi-scale +features from the decoder stages, ensuring accurate and robust final +segmentation. Comprehensive experiments on the Synapse and ACDC datasets +demonstrate that the proposed FIF-UNet outperforms existing state-of-the-art +methods, which achieves the highest average DICE of 86.05% and 92.58%, +respectively. + +
+
+
+
+
+ + ☆ Open-World Dynamic Prompt and Continual Visual Representation Learning ECCV 2024 + + +
+ The open world is inherently dynamic, characterized by ever-evolving concepts +and distributions. Continual learning (CL) in this dynamic open-world +environment presents a significant challenge in effectively generalizing to +unseen test-time classes. To address this challenge, we introduce a new +practical CL setting tailored for open-world visual representation learning. In +this setting, subsequent data streams systematically introduce novel classes +that are disjoint from those seen in previous training phases, while also +remaining distinct from the unseen test classes. In response, we present +Dynamic Prompt and Representation Learner (DPaRL), a simple yet effective +Prompt-based CL (PCL) method. Our DPaRL learns to generate dynamic prompts for +inference, as opposed to relying on a static prompt pool in previous PCL +methods. In addition, DPaRL jointly learns dynamic prompt generation and +discriminative representation at each training stage whereas prior PCL methods +only refine the prompt learning throughout the process. Our experimental +results demonstrate the superiority of our approach, surpassing +state-of-the-art methods on well-established open-world image retrieval +benchmarks by an average of 4.7\% improvement in Recall@1 performance. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Fitting Skeletal Models via Graph-based Learning + + +
+ Skeletonization is a popular shape analysis technique that models an object's +interior as opposed to just its boundary. Fitting template-based skeletal +models is a time-consuming process requiring much manual parameter tuning. +Recently, machine learning-based methods have shown promise for generating +s-reps from object boundaries. In this work, we propose a new skeletonization +method which leverages graph convolutional networks to produce skeletal +representations (s-reps) from dense segmentation masks. The method is evaluated +on both synthetic data and real hippocampus segmentations, achieving promising +results and fast inference. + +
+
+ comment: This paper was presented at the 2024 IEEE International Symposium on + Biomedical Imaging (ISBI) +
+
+
+
+
+ + ☆ Neural Surface Reconstruction and Rendering for LiDAR-Visual Systems + + +
+ This paper presents a unified surface reconstruction and rendering framework
+ for LiDAR-visual systems, integrating Neural Radiance Fields (NeRF) and Neural
+ Distance Fields (NDF) to recover both appearance and structural information
+ from posed images and point clouds. We address the structural visible gap
+ between NeRF and NDF by utilizing a visible-aware occupancy map to classify
+ space into free, occupied, visible unknown, and background regions. This
+ classification facilitates the recovery of a complete appearance and structure
+ of the scene. We unify the training of the NDF and NeRF using a spatially
+ varying scale SDF-to-density transformation, providing levels of detail for
+ both structure and appearance. The proposed method leverages the learned NDF
+ for structure-aware NeRF training with an adaptive sphere tracing sampling
+ strategy for accurate structure rendering. In return, NeRF further refines the
+ structure by recovering missing or fuzzy structures in the NDF. Extensive
+ experiments demonstrate the superior quality and versatility of the proposed
+ method across various scenarios. To benefit the community, the code will be
+ released at \url{https://github.com/hku-mars/M2Mapping}.
+
+
+
+
+
+
+ + ☆ RAL:Redundancy-Aware Lipreading Model Based on Differential Learning + with Symmetric Views + + +
+ Lip reading involves interpreting a speaker's speech by analyzing sequences +of lip movements. Currently, most models regard the left and right halves of +the lips as a symmetrical whole, lacking a thorough investigation of their +differences. However, the left and right halves of the lips are not always +symmetrical, and the subtle differences between them contain rich semantic +information. In this paper, we propose a differential learning strategy with +symmetric views (DLSV) to address this issue. Additionally, input images often +contain a lot of redundant information unrelated to recognition results, which +can degrade the model's performance. We present a redundancy-aware operation +(RAO) to reduce it. Finally, to leverage the relational information between +symmetric views and within each view, we further design an adaptive cross-view +interaction module (ACVI). Experiments on LRW and LRW-1000 datasets fully +demonstrate the effectiveness of our approach. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ RotCAtt-TransUNet++: Novel Deep Neural Network for Sophisticated Cardiac + Segmentation + + +
+ Cardiovascular disease is a major global health concern, contributing +significantly to global mortality. Accurately segmenting cardiac medical +imaging data is crucial for reducing fatality rates associated with these +conditions. However, current state-of-the-art (SOTA) neural networks, including +CNN-based and Transformer-based approaches, face challenges in capturing both +inter-slice connections and intra-slice details, especially in datasets +featuring intricate, long-range details along the z-axis like coronary +arteries. Existing methods also struggle with differentiating non-cardiac +components from the myocardium, resulting in segmentation inaccuracies and the +"spraying" phenomenon. To address these issues, we introduce +RotCAtt-TransUNet++, a novel architecture designed for robust segmentation of +intricate cardiac structures. Our approach enhances global context modeling +through multiscale feature aggregation and nested skip connections in the +encoder. Transformer layers facilitate capturing intra-slice interactions, +while a rotatory attention mechanism handles inter-slice connectivity. A +channel-wise cross-attention gate integrates multiscale information and decoder +features, effectively bridging semantic gaps. Experimental results across +multiple datasets demonstrate superior performance over current methods, +achieving near-perfect annotation of coronary arteries and myocardium. Ablation +studies confirm that our rotatory attention mechanism significantly improves +segmentation accuracy by transforming embedded vectorized patches in semantic +dimensional space. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ BrainDecoder: Style-Based Visual Decoding of EEG Signals + + +
+ Decoding neural representations of visual stimuli from electroencephalography
+ (EEG) offers valuable insights into brain activity and cognition. Recent
+ advancements in deep learning have significantly enhanced the field of visual
+ decoding of EEG, primarily focusing on reconstructing the semantic content of
+ visual stimuli. In this paper, we present a novel visual decoding pipeline
+ that, in addition to recovering the content, emphasizes the reconstruction of
+ the style, such as color and texture, of images viewed by the subject. Unlike
+ previous methods, this ``style-based'' approach learns in the CLIP spaces of
+ image and text separately, facilitating a more nuanced extraction of
+ information from EEG signals. We also use simpler captions for text alignment
+ than those previously employed, which we find work better. Both quantitative
+ and qualitative evaluations show that our method better preserves the style of
+ visual stimuli and extracts more fine-grained semantic information from neural
+ signals. Notably, it achieves significant improvements in quantitative results
+ and sets a new state of the art on the popular Brain2Image dataset.
+
+
+
+ comment: 5 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Disentangled Representations for Short-Term and Long-Term Person + Re-Identification + + +
+ We address the problem of person re-identification (reID), that is, +retrieving person images from a large dataset, given a query image of the +person of interest. A key challenge is to learn person representations robust +to intra-class variations, as different persons could have the same attribute, +and persons' appearances look different, e.g., with viewpoint changes. Recent +reID methods focus on learning person features discriminative only for a +particular factor of variations (e.g., human pose), which also requires +corresponding supervisory signals (e.g., pose annotations). To tackle this +problem, we propose to factorize person images into identity-related and +unrelated features. Identity-related features contain information useful for +specifying a particular person (e.g., clothing), while identity-unrelated ones +hold other factors (e.g., human pose). To this end, we propose a new generative +adversarial network, dubbed identity shuffle GAN (IS-GAN). It disentangles +identity-related and unrelated features from person images through an +identity-shuffling technique that exploits identification labels alone without +any auxiliary supervisory signals. We restrict the distribution of +identity-unrelated features or encourage the identity-related and unrelated +features to be uncorrelated, facilitating the disentanglement process. +Experimental results validate the effectiveness of IS-GAN, showing +state-of-the-art performance on standard reID benchmarks, including +Market-1501, CUHK03, and DukeMTMC-reID. We further demonstrate the advantages +of disentangling person representations on a long-term reID task, setting a new +state of the art on a Celeb-reID dataset. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:1910.12003 +
+
+
+
+
+ + ☆ Rethinking the Atmospheric Scattering-driven Attention via Channel and + Gamma Correction Priors for Low-Light Image Enhancement + + +
+ Low-light image enhancement remains a critical challenge in computer vision, as
+ does designing lightweight models for edge devices that must cope with the
+ computational burden of deep learning models. In this article, we introduce an
+ extended version of the Channel-Prior and Gamma-Estimation Network (CPGA-Net),
+ termed CPGA-Net+, which incorporates an attention mechanism driven by a
+ reformulated Atmospheric Scattering Model and effectively addresses both global
+ and local image processing through Plug-in Attention with gamma correction.
+ These innovations enable CPGA-Net+ to achieve superior performance on image
+ enhancement tasks, surpassing lightweight state-of-the-art methods with high
+ efficiency. Our results demonstrate the model's effectiveness and show its
+ potential for applications in resource-constrained environments.
+
+
+
+
+
+
+ + ☆ Scalable Frame Sampling for Video Classification: A Semi-Optimal Policy + Approach with Reduced Search Space + + +
+ Given a video with $T$ frames, frame sampling is the task of selecting $N \ll
+ T$ frames so as to maximize the performance of a fixed video classifier. Not
+ only brute-force search but also most existing methods suffer from the vast
+ search space of $\binom{T}{N}$, especially when $N$ gets large. To address this
+ challenge, we introduce a novel perspective of reducing the search space from
+ $O(T^N)$ to $O(T)$. Instead of exploring the entire $O(T^N)$ space, our
+ proposed semi-optimal policy selects the top $N$ frames based on the
+ independently estimated value of each frame, using per-frame confidence, which
+ significantly reduces the computational complexity. We verify that our
+ semi-optimal policy can efficiently approximate the optimal policy,
+ particularly under practical settings. Additionally, through extensive
+ experiments on various datasets and model architectures, we demonstrate that
+ learning our semi-optimal policy ensures stable and high performance regardless
+ of the sizes of $N$ and $T$.
+
+
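+ The semi-optimal policy amounts to scoring each frame once and keeping the top
+ $N$, roughly as sketched below; per_frame_scorer is an assumed stand-in for the
+ learned confidence model, not the paper's implementation:
+
+ import torch
+
+ @torch.no_grad()
+ def select_top_n_frames(frames, per_frame_scorer, n):
+     """frames: (T, C, H, W); per_frame_scorer returns a confidence per frame.
+     Complexity is O(T) scorer calls instead of searching all C(T, N) subsets."""
+     scores = torch.stack(
+         [per_frame_scorer(f.unsqueeze(0)).squeeze() for f in frames])
+     top_idx = scores.topk(n).indices.sort().values   # keep temporal order
+     return frames[top_idx], top_idx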
+
+
+
+
+ + ☆ Towards Automated Machine Learning Research + + +
+ This paper explores a top-down approach to automating incremental advances in +machine learning research through component-level innovation, facilitated by +Large Language Models (LLMs). Our framework systematically generates novel +components, validates their feasibility, and evaluates their performance +against existing baselines. A key distinction of this approach lies in how +these novel components are generated. Unlike traditional AutoML and NAS +methods, which often rely on a bottom-up combinatorial search over predefined, +hardcoded base components, our method leverages the cross-domain knowledge +embedded in LLMs to propose new components that may not be confined to any +hard-coded predefined set. By incorporating a reward model to prioritize +promising hypotheses, we aim to improve the efficiency of the hypothesis +generation and evaluation process. We hope this approach offers a new avenue +for exploration and contributes to the ongoing dialogue in the field. + +
+
+
+
+
+ + ☆ Label-free evaluation of lung and heart transplant biopsies using + virtual staining + + +
+ Organ transplantation serves as the primary therapeutic strategy for +end-stage organ failures. However, allograft rejection is a common complication +of organ transplantation. Histological assessment is essential for the timely +detection and diagnosis of transplant rejection and remains the gold standard. +Nevertheless, the traditional histochemical staining process is time-consuming, +costly, and labor-intensive. Here, we present a panel of virtual staining +neural networks for lung and heart transplant biopsies, which digitally convert +autofluorescence microscopic images of label-free tissue sections into their +brightfield histologically stained counterparts, bypassing the traditional +histochemical staining process. Specifically, we virtually generated +Hematoxylin and Eosin (H&E), Masson's Trichrome (MT), and Elastic Verhoeff-Van +Gieson (EVG) stains for label-free transplant lung tissue, along with H&E and +MT stains for label-free transplant heart tissue. Subsequent blind evaluations +conducted by three board-certified pathologists have confirmed that the virtual +staining networks consistently produce high-quality histology images with high +color uniformity, closely resembling their well-stained histochemical +counterparts across various tissue features. The use of virtually stained +images for the evaluation of transplant biopsies achieved comparable diagnostic +outcomes to those obtained via traditional histochemical staining, with a +concordance rate of 82.4% for lung samples and 91.7% for heart samples. +Moreover, virtual staining models create multiple stains from the same +autofluorescence input, eliminating structural mismatches observed between +adjacent sections stained in the traditional workflow, while also saving +tissue, expert time, and staining costs. + +
+
+ comment: 21 Pages, 5 Figures +
+
+
+
+
+ + ☆ MRStyle: A Unified Framework for Color Style Transfer with + Multi-Modality Reference + + +
+ In this paper, we introduce MRStyle, a comprehensive framework that enables +color style transfer using multi-modality reference, including image and text. +To achieve a unified style feature space for both modalities, we first develop +a neural network called IRStyle, which generates stylized 3D lookup tables for +image reference. This is accomplished by integrating an interaction +dual-mapping network with a combined supervised learning pipeline, resulting in +three key benefits: elimination of visual artifacts, efficient handling of +high-resolution images with low memory usage, and maintenance of style +consistency even in situations with significant color style variations. For +text reference, we align the text feature of stable diffusion priors with the +style feature of our IRStyle to perform text-guided color style transfer +(TRStyle). Our TRStyle method is highly efficient in both training and +inference, producing notable open-set text-guided transfer results. Extensive +experiments in both image and text settings demonstrate that our proposed +method outperforms the state-of-the-art in both qualitative and quantitative +evaluations. + +
+
+
+
+
+ + ♻ ☆ Pre-processing and Compression: Understanding Hidden Representation + Refinement Across Imaging Domains via Intrinsic Dimension + + +
+ In recent years, there has been interest in how geometric properties such as +intrinsic dimension (ID) of a neural network's hidden representations change +through its layers, and how such properties are predictive of important model +behavior such as generalization ability. However, evidence has begun to emerge +that such behavior can change significantly depending on the domain of the +network's training data, such as natural versus medical images. Here, we +further this inquiry by exploring how the ID of a network's learned +representations changes through its layers, in essence, characterizing how the +network successively refines the information content of input data to be used +for predictions. Analyzing eleven natural and medical image datasets across six +network architectures, we find that how ID changes through the network differs +noticeably between natural and medical image models. Specifically, medical +image models peak in representation ID earlier in the network, implying a +difference in the image features and their abstractness that are typically used +for downstream tasks in these domains. Additionally, we discover a strong +correlation of this peak representation ID with the ID of the data in its input +space, implying that the intrinsic information content of a model's learned +representations is guided by that of the data it was trained on. Overall, our +findings emphasize notable discrepancies in network behavior between natural +and non-natural imaging domains regarding hidden representation information +content, and provide further insights into how a network's learned features are +shaped by its training data. + +
+
+
+
+
+ + ♻ ☆ Guide-and-Rescale: Self-Guidance Mechanism for Effective Tuning-Free + Real Image Editing ECCV 2024 + + +
+ Despite recent advances in large-scale text-to-image generative models,
+ manipulating real images with these models remains a challenging problem. The
+ main limitations of existing editing methods are that they either fail to
+ perform with consistent quality on a wide range of image edits, or require
+ time-consuming hyperparameter tuning or fine-tuning of the diffusion model to
+ preserve the image-specific appearance of the input image. We propose a novel
+ approach that is built upon a modified diffusion sampling process via the
+ guidance mechanism. In this work, we explore the self-guidance technique to
+ preserve the overall structure of the input image and the appearance of its
+ local regions that should not be edited. In particular, we explicitly introduce
+ layout-preserving energy functions that aim to preserve the local and global
+ structures of the source image. Additionally, we propose a noise rescaling
+ mechanism that preserves the noise distribution by balancing the norms of
+ classifier-free guidance and our proposed guiders during generation. Such a
+ guiding approach does not require fine-tuning the diffusion model or an exact
+ inversion process. As a result, the proposed method provides a fast and
+ high-quality editing mechanism. In our experiments, we show through human
+ evaluation and quantitative analysis that the proposed method produces the
+ desired edits, which are preferred by humans, and achieves a better trade-off
+ between editing quality and preservation of the original image. Our code is
+ available at https://github.com/FusionBrainLab/Guide-and-Rescale.
+
+
+
+ comment: Accepted to ECCV 2024. The project page is available at + https://fusionbrainlab.github.io/Guide-and-Rescale +
+
+
+
+
+ + ♻ ☆ A Lost Opportunity for Vision-Language Models: A Comparative Study of + Online Test-Time Adaptation for Vision-Language Models ECCV 2024 + + +
+ In deep learning, maintaining model robustness against distribution shifts is +critical. This work explores a broad range of possibilities to adapt +vision-language foundation models at test-time, with a particular emphasis on +CLIP and its variants. The study systematically examines prompt-based +techniques and existing test-time adaptation methods, aiming to improve the +robustness under distribution shift in diverse real-world scenarios. +Specifically, the investigation covers various prompt engineering strategies, +including handcrafted prompts, prompt ensembles, and prompt learning +techniques. Additionally, we introduce a vision-text-space ensemble that +substantially enhances average performance compared to text-space-only +ensembles. Since online test-time adaptation has shown to be effective to +mitigate performance drops under distribution shift, the study extends its +scope to evaluate the effectiveness of existing test-time adaptation methods +that were originally designed for vision-only classification models. Through +extensive experimental evaluations conducted across multiple datasets and +diverse model architectures, the research demonstrates the effectiveness of +these adaptation strategies. Code is available at: +https://github.com/mariodoebler/test-time-adaptation + +
+
+ comment: Accepted at ECCV 2024 OOD-CV Workshop +
+
+
+
+
+ + ♻ ☆ MeshBrush: Painting the Anatomical Mesh with Neural Stylization for + Endoscopy + + +
+ Style transfer is a promising approach to close the sim-to-real gap in +medical endoscopy. Rendering synthetic endoscopic videos by traversing +pre-operative scans (such as MRI or CT) can generate structurally accurate +simulations as well as ground truth camera poses and depth maps. Although +image-to-image (I2I) translation models such as CycleGAN can imitate realistic +endoscopic images from these simulations, they are unsuitable for +video-to-video synthesis due to the lack of temporal consistency, resulting in +artifacts between frames. We propose MeshBrush, a neural mesh stylization +method to synthesize temporally consistent videos with differentiable +rendering. MeshBrush uses the underlying geometry of patient imaging data while +leveraging existing I2I methods. With learned per-vertex textures, the stylized +mesh guarantees consistency while producing high-fidelity outputs. We +demonstrate that mesh stylization is a promising approach for creating +realistic simulations for downstream tasks such as training networks and +preoperative planning. Although our method is tested and designed for +ureteroscopy, its components are transferable to general endoscopic and +laparoscopic procedures. The code will be made public on GitHub. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Evaluation of Histopathology Foundation Models for + Ovarian Cancer Subtype Classification + + +
+ Large pretrained transformers are increasingly being developed as generalised +foundation models which can underpin powerful task-specific artificial +intelligence models. Histopathology foundation models show great promise across +many tasks, but analyses have typically been limited by arbitrary +hyperparameters that were not tuned to the specific task. We report the most +rigorous single-task validation of histopathology foundation models to date, +specifically in ovarian cancer morphological subtyping. Attention-based +multiple instance learning classifiers were compared using three +ImageNet-pretrained feature extractors and fourteen histopathology foundation +models. The training set consisted of 1864 whole slide images from 434 ovarian +carcinoma cases at Leeds Teaching Hospitals NHS Trust. Five-class +classification performance was evaluated through five-fold cross-validation, +and these cross-validation models were ensembled for hold-out testing and +external validation on the Transcanadian Study and OCEAN Challenge datasets. +The best-performing model used the H-optimus-0 foundation model, with +five-class balanced accuracies of 89%, 97%, and 74% in the test sets. +Normalisations and augmentations aided the performance of the +ImageNet-pretrained ResNets, but these were still outperformed by 13 of the 14 +foundation models. Hyperparameter tuning the downstream classifiers improved +performance by a median 1.9% balanced accuracy, with many improvements being +statistically significant. Histopathology foundation models offer a clear +benefit to ovarian cancer subtyping, improving classification performance to a +degree where clinical utility is tangible, albeit with an increased +computational burden. Such models could provide a second opinion to +histopathologists diagnosing challenging cases and may improve the accuracy, +objectivity, and efficiency of pathological diagnoses overall. + +
+
+
+
+
+ + ♻ ☆ Espresso: Robust Concept Filtering in Text-to-Image Models + + +
+ Diffusion-based text-to-image models are trained on large datasets scraped
+ from the Internet, potentially containing unacceptable concepts (e.g.,
+ copyright-infringing or unsafe content). We need concept removal techniques
+ (CRTs) that are effective in preventing the generation of images with
+ unacceptable concepts, utility-preserving on acceptable concepts, and robust
+ against evasion with adversarial prompts. None of the prior CRTs satisfy all
+ these requirements simultaneously. We introduce Espresso, the first robust
+ concept filter based on Contrastive Language-Image Pre-Training (CLIP). We
+ configure CLIP to identify unacceptable concepts in generated images using the
+ distance of their embeddings to the text embeddings of both unacceptable and
+ acceptable concepts. This lets us fine-tune for robustness by separating the
+ text embeddings of unacceptable and acceptable concepts while preserving their
+ pairing with image embeddings for utility. We present a pipeline to evaluate
+ various CRTs and attacks against them, and show that Espresso is more effective
+ and robust than prior CRTs, while retaining utility.
+
+
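+ The embedding-distance filtering rule can be pictured with a small CLIP-style
+ check; the max-similarity comparison and the prompt sets are illustrative
+ assumptions, not Espresso's tuned configuration:
+
+ import torch
+ import torch.nn.functional as F
+
+ @torch.no_grad()
+ def is_unacceptable(image_emb, unacceptable_text_embs, acceptable_text_embs):
+     """All inputs are CLIP embeddings; rows of the text tensors are concepts."""
+     img = F.normalize(image_emb, dim=-1)
+     sim_bad = (img @ F.normalize(unacceptable_text_embs, dim=-1).T).max()
+     sim_ok = (img @ F.normalize(acceptable_text_embs, dim=-1).T).max()
+     return bool(sim_bad > sim_ok)   # flag: closer to an unacceptable concept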
+
+
+
+
+ + ♻ ☆ Stepping Stones: A Progressive Training Strategy for Audio-Visual + Semantic Segmentation ECCV2024 + + +
+ Audio-Visual Segmentation (AVS) aims to achieve pixel-level localization of +sound sources in videos, while Audio-Visual Semantic Segmentation (AVSS), as an +extension of AVS, further pursues semantic understanding of audio-visual +scenes. However, since the AVSS task requires the establishment of audio-visual +correspondence and semantic understanding simultaneously, we observe that +previous methods have struggled to handle this mashup of objectives in +end-to-end training, resulting in insufficient learning and sub-optimization. +Therefore, we propose a two-stage training strategy called \textit{Stepping +Stones}, which decomposes the AVSS task into two simple subtasks from +localization to semantic understanding, which are fully optimized in each stage +to achieve step-by-step global optimization. This training strategy has also +proved its generalization and effectiveness on existing methods. To further +improve the performance of AVS tasks, we propose a novel framework Adaptive +Audio Visual Segmentation, in which we incorporate an adaptive audio query +generator and integrate masked attention into the transformer decoder, +facilitating the adaptive fusion of visual and audio features. Extensive +experiments demonstrate that our methods achieve state-of-the-art results on +all three AVS benchmarks. The project homepage can be accessed at +https://gewu-lab.github.io/stepping_stones/. + +
+
+ comment: ECCV2024 poster. Project url: + https://gewu-lab.github.io/stepping_stones +
+
+
+
+
+ + ♻ ☆ PromptCCD: Learning Gaussian Mixture Prompt Pool for Continual Category + Discovery ECCV 2024 + + +
+ We tackle the problem of Continual Category Discovery (CCD), which aims to +automatically discover novel categories in a continuous stream of unlabeled +data while mitigating the challenge of catastrophic forgetting -- an open +problem that persists even in conventional, fully supervised continual +learning. To address this challenge, we propose PromptCCD, a simple yet +effective framework that utilizes a Gaussian Mixture Model (GMM) as a prompting +method for CCD. At the core of PromptCCD lies the Gaussian Mixture Prompting +(GMP) module, which acts as a dynamic pool that updates over time to facilitate +representation learning and prevent forgetting during category discovery. +Moreover, GMP enables on-the-fly estimation of category numbers, allowing +PromptCCD to discover categories in unlabeled data without prior knowledge of +the category numbers. We extend the standard evaluation metric for Generalized +Category Discovery (GCD) to CCD and benchmark state-of-the-art methods on +diverse public datasets. PromptCCD significantly outperforms existing methods, +demonstrating its effectiveness. Project page: +https://visual-ai.github.io/promptccd . + +
+
+ comment: ECCV 2024, Project page: https://visual-ai.github.io/promptccd +
+
+
+
+
+ + ♻ ☆ DiffCSG: Differentiable CSG via Rasterization + + +
+ Differentiable rendering is a key ingredient for inverse rendering and +machine learning, as it allows to optimize scene parameters (shape, materials, +lighting) to best fit target images. Differentiable rendering requires that +each scene parameter relates to pixel values through differentiable operations. +While 3D mesh rendering algorithms have been implemented in a differentiable +way, these algorithms do not directly extend to Constructive-Solid-Geometry +(CSG), a popular parametric representation of shapes, because the underlying +boolean operations are typically performed with complex black-box +mesh-processing libraries. We present an algorithm, DiffCSG, to render CSG +models in a differentiable manner. Our algorithm builds upon CSG rasterization, +which displays the result of boolean operations between primitives without +explicitly computing the resulting mesh and, as such, bypasses black-box mesh +processing. We describe how to implement CSG rasterization within a +differentiable rendering pipeline, taking special care to apply antialiasing +along primitive intersections to obtain gradients in such critical areas. Our +algorithm is simple and fast, can be easily incorporated into modern machine +learning setups, and enables a range of applications for computer-aided design, +including direct and image-based editing of CSG primitives. Code and data: +https://yyyyyhc.github.io/DiffCSG/. + +
+
+
+
+
+ + ♻ ☆ Prediction-Feedback DETR for Temporal Action Detection + + +
+ Temporal Action Detection (TAD) is fundamental yet challenging for real-world +video applications. Leveraging the unique benefits of transformers, various +DETR-based approaches have been adopted in TAD. However, it has recently been +identified that the attention collapse in self-attention causes the performance +degradation of DETR for TAD. Building upon previous research, this paper newly +addresses the attention collapse problem in cross-attention within DETR-based +TAD methods. Moreover, our findings reveal that cross-attention exhibits +patterns distinct from predictions, indicating a short-cut phenomenon. To +resolve this, we propose a new framework, Prediction-Feedback DETR (Pred-DETR), +which utilizes predictions to restore the collapse and align the cross- and +self-attention with predictions. Specifically, we devise novel +prediction-feedback objectives using guidance from the relations of the +predictions. As a result, Pred-DETR significantly alleviates the collapse and +achieves state-of-the-art performance among DETR-based methods on various +challenging benchmarks including THUMOS14, ActivityNet-v1.3, HACS, and +FineAction. + +
+
+
+
+
+ + ♻ ☆ 3D Lymphoma Segmentation on PET/CT Images via Multi-Scale Information + Fusion with Cross-Attention + + +
+ Background: Accurate segmentation of diffuse large B-cell lymphoma (DLBCL) +lesions is challenging due to their complex patterns in medical imaging. + Objective: This study aims to develop a precise segmentation method for DLBCL +using 18F-Fluorodeoxyglucose (FDG) positron emission tomography (PET) and +computed tomography (CT) images. + Methods: We propose a 3D dual-branch encoder segmentation method using +shifted window transformers and a Multi-Scale Information Fusion (MSIF) module. +To enhance feature integration, the MSIF module performs multi-scale feature +fusion using cross-attention mechanisms with a shifted window framework. A +gated neural network within the MSIF module dynamically balances the +contributions from each modality. The model was optimized using the Dice +Similarity Coefficient (DSC) loss function. Additionally, we computed the total +metabolic tumor volume (TMTV) and performed statistical analyses. + Results: The model was trained and validated on a dataset of 165 DLBCL +patients using 5-fold cross-validation, achieving a DSC of 0.7512. Statistical +analysis showed a significant improvement over comparative methods (p < 0.05). +Additionally, a Pearson correlation coefficient of 0.91 and an R^2 of 0.89 were +observed when comparing manual annotations to segmentation results for TMTV +measurement. + Conclusion: This study presents an effective automatic segmentation method +for DLBCL that leverages the complementary strengths of PET and CT imaging. Our +method has the potential to improve diagnostic interpretations and assist in +treatment planning for DLBCL patients. + +
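The Dice Similarity Coefficient loss used for optimization above is a standard formulation; a minimal PyTorch sketch (illustrative only, with an assumed smoothing term, not the authors' exact code) is shown below.

```python
# Minimal soft-Dice loss sketch (standard formulation, not the paper's exact code).
import torch

def dice_loss(pred: torch.Tensor, target: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """pred: predicted probabilities in [0, 1]; target: binary mask of the same shape."""
    pred, target = pred.flatten(1), target.flatten(1)
    intersection = (pred * target).sum(dim=1)
    dsc = (2 * intersection + eps) / (pred.sum(dim=1) + target.sum(dim=1) + eps)
    return 1 - dsc.mean()  # minimize 1 - DSC to maximize overlap
```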
+
+ comment: 19 pages, 7 figures; reference added +
+
+
+
+
+ + ♻ ☆ Long-term Pre-training for Temporal Action Detection with Transformers + + +
+ Temporal action detection (TAD) is challenging, yet fundamental for +real-world video applications. Recently, DETR-based models for TAD have been +prevailing thanks to their unique benefits. However, transformers demand a huge +dataset, and unfortunately data scarcity in TAD causes a severe degeneration. +In this paper, we identify two crucial problems from data scarcity: attention +collapse and imbalanced performance. To this end, we propose a new pre-training +strategy, Long-Term Pre-training (LTP), tailored for transformers. LTP has two +main components: 1) class-wise synthesis, 2) long-term pretext tasks. Firstly, +we synthesize long-form video features by merging video snippets of a target +class and non-target classes. They are analogous to untrimmed data used in TAD, +despite being created from trimmed data. In addition, we devise two types of +long-term pretext tasks to learn long-term dependency. They impose long-term +conditions such as finding second-to-fourth or short-duration actions. Our +extensive experiments show state-of-the-art performances in DETR-based methods +on ActivityNet-v1.3 and THUMOS14 by a large margin. Moreover, we demonstrate +that LTP significantly relieves the data scarcity issues in TAD. + +
+
+
+
+
+ + ♻ ☆ TivNe-SLAM: Dynamic Mapping and Tracking via Time-Varying Neural + Radiance Fields + + +
+ Previous attempts to integrate Neural Radiance Fields (NeRF) into the +Simultaneous Localization and Mapping (SLAM) framework either rely on the +assumption of static scenes or require the ground truth camera poses, which +impedes their application in real-world scenarios. This paper proposes a +time-varying representation to track and reconstruct the dynamic scenes. +Firstly, two processes, a tracking process and a mapping process, are +maintained simultaneously in our framework. In the tracking process, all input +images are uniformly sampled and then progressively trained in a +self-supervised paradigm. In the mapping process, we leverage motion masks to +distinguish dynamic objects from the static background, and sample more pixels +from dynamic areas. Secondly, the parameter optimization for both processes is +comprised of two stages: the first stage associates time with 3D positions to +convert the deformation field to the canonical field. The second stage +associates time with the embeddings of the canonical field to obtain colors and +a Signed Distance Function (SDF). Lastly, we propose a novel keyframe selection +strategy based on the overlapping rate. Our approach is evaluated on two +synthetic datasets and one real-world dataset, and the experiments validate +that our method achieves competitive results in both tracking and mapping when +compared to existing state-of-the-art NeRF-based dynamic SLAM systems. + +
+
+
+
+
+ + ♻ ☆ X-InstructBLIP: A Framework for aligning X-Modal instruction-aware + representations to LLMs and Emergent Cross-modal Reasoning + + +
+ Recent research has achieved significant advancements in visual reasoning +tasks through learning image-to-language projections and leveraging the +impressive reasoning abilities of Large Language Models (LLMs). This paper +introduces an efficient and effective framework that integrates multiple +modalities (images, 3D, audio and video) to a frozen LLM and demonstrates an +emergent ability for cross-modal reasoning (2+ modality inputs). Our approach +explores two distinct projection mechanisms: Q-Formers and Linear Projections +(LPs). Through extensive experimentation across all four modalities on 16 +benchmarks, we explore both methods and assess their adaptability in integrated +and separate cross-modal reasoning. The Q-Former projection demonstrates +superior performance in single modality scenarios and adaptability in joint +versus discriminative reasoning involving two or more modalities. However, it +exhibits lower generalization capabilities than linear projection in contexts +where task-modality data are limited. To enable this framework, we devise a +scalable pipeline that automatically generates high-quality, instruction-tuning +datasets from readily available captioning data across different modalities, +and contribute 24K QA data for audio and 250K QA data for 3D. To facilitate +further research in cross-modal reasoning, we introduce the DisCRn +(Discriminative Cross-modal Reasoning) benchmark comprising 9K audio-video QA +samples and 28K image-3D QA samples that require the model to reason +discriminatively across disparate input modalities. + +
+
+
+
+
+ + ♻ ☆ The Influence of Faulty Labels in Data Sets on Human Pose Estimation + + +
+ In this study we provide empirical evidence demonstrating that the quality of +training data impacts model performance in Human Pose Estimation (HPE). +Inaccurate labels in widely used data sets, ranging from minor errors to severe +mislabeling, can negatively influence learning and distort performance metrics. +We perform an in-depth analysis of popular HPE data sets to show the extent and +nature of label inaccuracies. Our findings suggest that accounting for the +impact of faulty labels will facilitate the development of more robust and +accurate HPE models for a variety of real-world applications. We show improved +performance with cleansed data. + +
+
+ comment: 15 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale + Space Using Wearable IMUs and LiDAR + + +
+ We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture
+method, aimed at accurately and efficiently creating a dynamic digital world,
+containing large-scale indoor-outdoor scenes, diverse human motions, rich
+human-human interactions, and human-environment interactions. By utilizing
+body-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human
+motions in unconstrained space without the need for external devices and
+pre-built maps. This affords great flexibility and accessibility for
+human-centered interaction and 4D scene capturing in various environments.
+Taking into account that IMUs can capture spatially unrestricted human poses
+but are prone to drift over long periods of use, while LiDAR is stable for
+global localization but rough for local positions and orientations, HiSC4D
+employs a joint optimization method, harmonizing all sensors and utilizing
+environment cues, yielding promising results for long-term capture in large
+scenes. To promote research of egocentric human interaction in large scenes and
+facilitate downstream tasks, we also present a dataset, containing 8 sequences
+in 4 large scenes (200 to 5,000 $m^2$), providing 36k frames of accurate 4D
+human motions with SMPL annotations and dynamic scenes, 31k frames of cropped
+human point clouds, and scene mesh of the environment. A variety of scenarios,
+such as the basketball gym and commercial street, alongside challenging human
+motions, such as daily greeting, one-on-one basketball playing, and tour
+guiding, demonstrate the effectiveness and the generalization ability of
+HiSC4D. The dataset and code will be published at
+www.lidarhumanmotion.net/hisc4d for research purposes.
+
+
+ comment: 17 pages, 10 figures, Journal
+
+
+
+
+
+ + ♻ ☆ The Principle of Uncertain Maximum Entropy + + +
+ The principle of maximum entropy is a well-established technique for choosing
+a distribution that matches available information while minimizing bias. It
+finds broad use across scientific disciplines and in machine learning. However,
+the principle as classically defined is susceptible to noise and error in
+observations. This forces real-world practitioners to use relaxed versions of
+the principle in an ad hoc way, negatively impacting interpretation. To address
+this situation, we present a new principle we call uncertain maximum entropy
+that generalizes the classic principle and provides interpretable solutions
+irrespective of the observational methods in use. We introduce a convex
+approximation and expectation-maximization based algorithm for finding
+solutions to our new principle. Finally, we contrast this new technique with
+two simpler, generally applicable solutions and show, both theoretically and
+experimentally, that our technique provides superior accuracy.
+
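For reference, the classical maximum-entropy principle that this work generalizes chooses the distribution maximizing entropy subject to feature-expectation constraints; the formulation below is the standard baseline, not the paper's uncertain variant.

```latex
% Classical principle of maximum entropy (the baseline the paper generalizes).
\max_{P} \; -\sum_{x} P(x)\,\log P(x)
\quad \text{s.t.} \quad
\sum_{x} P(x)\,\phi_k(x) = \hat{\phi}_k \;\; \forall k,
\qquad \sum_{x} P(x) = 1
```

Here the $\phi_k$ are feature functions and the $\hat{\phi}_k$ their empirically observed expectations; the uncertain variant generalizes these exact empirical constraints to account for noise in how the $\hat{\phi}_k$ are observed.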
+
+
+
+
+ + ♻ ☆ Auto-ACD: A Large-scale Dataset for Audio-Language Representation + Learning ACM MM 2024 + + +
+ Recently, the AI community has made significant strides in developing
+powerful foundation models, driven by large-scale multimodal datasets. However,
+for audio representation learning, existing datasets suffer from limitations in
+the following aspects: insufficient volume, simplistic content, and arduous
+collection procedures. To establish an audio dataset with high-quality
+captions, we propose an innovative, automatic approach leveraging multimodal
+inputs, such as video frames and audio streams. Specifically, we construct a
+large-scale, high-quality, audio-language dataset, named Auto-ACD,
+comprising over 1.5M audio-text pairs. We exploit a series of pre-trained
+models and APIs to determine audio-visual synchronisation, generate image
+captions, detect objects, and produce audio tags for specific videos.
+Subsequently, we employ an LLM to paraphrase a congruent caption for each audio
+clip, guided by the extracted multi-modality clues. To demonstrate the
+effectiveness of the proposed dataset, we train widely used models on our
+dataset and show performance improvements on various downstream tasks, for
+example, audio-language retrieval, audio captioning, and zero-shot
+classification. In addition, we establish a novel test set with environmental
+information and provide a benchmark for audio-text tasks.
+
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Separation of Body and Background in Radiological Images. A Practical + Python Code + + +
+ Radiological images, such as magnetic resonance imaging (MRI) and computed +tomography (CT) images, typically consist of a body part and a dark background. +For many analyses, it is necessary to separate the body part from the +background. In this article, we present a Python code designed to separate body +and background regions in 2D and 3D radiological images. We tested the +algorithm on various MRI and CT images of different body parts, including the +brain, neck, and abdominal regions. Additionally, we introduced a method for +intensity normalization and outlier restriction, adjusted for data conversion +into 8-bit unsigned integer (UINT8) format, and examined its effects on +body-background separation. Our Python code is available for use with proper +citation. + +
+
+ comment: 14 pages, 8 figures. typos corrected +
+
+
+
+
+ + ♻ ☆ TextGaze: Gaze-Controllable Face Generation with Natural Language ACM MM2024 + + +
+ Generating face images with specific gaze information has attracted
+considerable attention. Existing approaches typically input gaze values
+directly for face generation, which is unnatural and requires annotated gaze
+datasets for training, thereby limiting their application. In this paper, we
+present a novel gaze-controllable face generation task. Our approach inputs
+textual descriptions that describe human gaze and head behavior and generates
+corresponding face images. Our work first introduces a text-of-gaze dataset
+containing over 90k text descriptions spanning a dense distribution of gaze and
+head poses. We further propose a gaze-controllable text-to-face method. Our
+method contains a sketch-conditioned face diffusion module and a model-based
+sketch diffusion module. We define a face sketch based on facial landmarks and
+an eye segmentation map. The face diffusion module generates face images from
+the face sketch, and the sketch diffusion module employs a 3D face model to
+generate a face sketch from the text description. Experiments on the FFHQ
+dataset show the effectiveness of our method. We will release our dataset and
+code for future research.
+
+
+ comment: ACM MM2024 +
+
+
+
+
+ + ♻ ☆ FashionLOGO: Prompting Multimodal Large Language Models for Fashion Logo + Embeddings + + +
+ Logo embedding models convert the product logos in images into vectors,
+enabling their utilization for logo recognition and detection within e-commerce
+platforms. This facilitates the enforcement of intellectual property rights and
+enhances product search capabilities. However, current methods treat logo
+embedding as a purely visual problem. A noteworthy issue is that visual models
+capture many features beyond the logo itself. Instead, we view this as a
+multimodal task, using text as auxiliary information to facilitate the visual
+model's understanding of the logo. The emerging Multimodal Large Language
+Models (MLLMs) have demonstrated remarkable capabilities in both visual and
+textual understanding. Inspired by this, we propose an approach,
+\textbf{FashionLOGO}, to explore how to prompt MLLMs to generate appropriate
+text for product images, which can help visual models achieve better logo
+embeddings. We adopt a cross-attention transformer block that enables the
+visual embedding to automatically learn supplementary knowledge from the
+textual embedding. Our extensive experiments on real-world datasets prove that
+FashionLOGO is capable of generating generic and robust logo embeddings,
+achieving state-of-the-art performance in all benchmarks.
+
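The cross-attention fusion of visual and textual embeddings can be sketched as follows; this is an illustrative layer with assumed dimensions, not the FashionLOGO architecture.

```python
# Illustrative cross-attention block: visual tokens attend to MLLM text tokens.
# Dimensions and layer sizes are assumptions for the sketch.
import torch
import torch.nn as nn

class TextGuidedVisualFusion(nn.Module):
    def __init__(self, dim: int = 768, num_heads: int = 8):
        super().__init__()
        self.cross_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, visual: torch.Tensor, text: torch.Tensor) -> torch.Tensor:
        # visual: (B, Nv, dim) image tokens; text: (B, Nt, dim) text tokens.
        attended, _ = self.cross_attn(query=visual, key=text, value=text)
        return self.norm(visual + attended)  # residual fusion of textual knowledge
```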
+
+
+
+
+ + ♻ ☆ CLFT: Camera-LiDAR Fusion Transformer for Semantic Segmentation in + Autonomous Driving + + +
+ Research on camera- and LiDAR-based semantic object segmentation for
+autonomous driving has benefited significantly from recent developments in deep
+learning. Specifically, the vision transformer is the novel ground-breaker
+that successfully brought the multi-head-attention mechanism to computer vision
+applications. Therefore, we propose a vision-transformer-based network to carry
+out camera-LiDAR fusion for semantic segmentation applied to autonomous
+driving. Our proposal uses the novel progressive-assemble strategy of vision
+transformers on a double-direction network and then integrates the results in a
+cross-fusion strategy over the transformer decoder layers. Unlike other works
+in the literature, our camera-LiDAR fusion transformers have been evaluated in
+challenging conditions like rain and low illumination, showing robust
+performance. The paper reports the segmentation results over the vehicle and
+human classes in different modalities: camera-only, LiDAR-only, and
+camera-LiDAR fusion. We perform coherent controlled benchmark experiments of
+CLFT against other networks that are also designed for semantic segmentation.
+The experiments aim to evaluate the performance of CLFT independently from two
+perspectives: multimodal sensor fusion and backbone architectures. The
+quantitative assessments show our CLFT networks yield an improvement of up to
+10% for challenging dark-wet conditions when compared with a
+Fully-Convolutional-Neural-Network-based (FCN) camera-LiDAR fusion network.
+Compared with a network that has a transformer backbone but uses
+single-modality input, the all-around improvement is 5-10%.
+
+
+ comment: Accepted to IEEE Transactions on Intelligent Vehicles +
+
+
+
+
+ + ♻ ☆ OD-VAE: An Omni-dimensional Video Compressor for Improving Latent Video + Diffusion Model + + +
+ The Variational Autoencoder (VAE), which compresses videos into latent
+representations, is a crucial preceding component of Latent Video Diffusion
+Models (LVDMs). At the same reconstruction quality, the more thoroughly the
+VAE compresses videos, the more efficient the LVDMs are. However,
+most LVDMs utilize a 2D image VAE, which compresses videos only in the
+spatial dimension and ignores the temporal dimension. How to conduct
+temporal compression for videos in a VAE to obtain more concise latent
+representations while preserving accurate reconstruction is seldom explored. To
+fill this gap, we propose an omni-dimension compression VAE, named OD-VAE,
+which can temporally and spatially compress videos. Although OD-VAE's stronger
+compression makes video reconstruction more challenging, it can
+still achieve high reconstruction accuracy through our careful design. To
+obtain a better trade-off between video reconstruction quality and compression
+speed, four variants of OD-VAE are introduced and analyzed. In addition, a
+novel tail initialization is designed to train OD-VAE more efficiently, and a
+novel inference strategy is proposed to enable OD-VAE to handle videos of
+arbitrary length with limited GPU memory. Comprehensive experiments on video
+reconstruction and LVDM-based video generation demonstrate the effectiveness
+and efficiency of our proposed methods.
+
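The omni-dimensional compression idea (downsampling both time and space) can be illustrated with a toy 3D-convolutional encoder; strides and channel sizes below are assumptions for the sketch, not the OD-VAE architecture.

```python
# Toy encoder that compresses a video in time and space simultaneously.
# Strides and channel counts are illustrative; OD-VAE's actual design differs.
import torch
import torch.nn as nn

encoder = nn.Sequential(
    nn.Conv3d(3, 64, kernel_size=3, stride=(1, 2, 2), padding=1),    # spatial /2
    nn.SiLU(),
    nn.Conv3d(64, 128, kernel_size=3, stride=(2, 2, 2), padding=1),  # time /2, spatial /2
    nn.SiLU(),
    nn.Conv3d(128, 8, kernel_size=3, stride=(2, 2, 2), padding=1),   # time /2, spatial /2
)

video = torch.randn(1, 3, 16, 256, 256)   # (batch, channels, frames, height, width)
latent = encoder(video)
print(latent.shape)                       # -> torch.Size([1, 8, 4, 32, 32])
```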
+
+ comment: https://github.com/PKU-YuanGroup/Open-Sora-Plan +
+
+
+
+
+ + ♻ ☆ Question-Answering Dense Video Events AAAI 2025 + + +
+ Multimodal Large Language Models (MLLMs) have shown excellent performance in +question-answering of single-event videos. In this paper, we present +question-answering dense video events, a novel task that requires answering and +grounding the dense-event questions in long videos, thus challenging MLLMs to +faithfully comprehend and reason about multiple events occurring over extended +time periods. To facilitate the study, we construct DeVE-QA - a dataset +featuring 78K questions about 26K events on 10.6K long videos. We then +benchmark and show that existing MLLMs excelling at single-event QA struggle to +perform well in DeVE-QA. For improvement, we propose DeVi, a novel +training-free MLLM approach that highlights a hierarchical captioning module, a +temporal event memory module, and a self-consistency checking module to +respectively detect, contextualize and memorize, and ground dense-events in +long videos for question answering. Extensive experiments show that DeVi is +superior at answering dense-event questions and grounding relevant video +moments. Compared with existing MLLMs, it achieves a remarkable increase of 4.1 +percent and 3.7 percent for G(round)QA accuracy on DeVE-QA and NExT-GQA +respectively. + +
+
+ comment: Submitted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding + Benchmark + + +
+ As the capabilities of large multimodal models (LMMs) continue to advance, +evaluating the performance of LMMs emerges as an increasing need. Additionally, +there is an even larger gap in evaluating the advanced knowledge and reasoning +abilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU, +a new Chinese Massive Multi-discipline Multimodal Understanding benchmark +designed to evaluate LMMs on tasks demanding college-level subject knowledge +and deliberate reasoning in a Chinese context. CMMMU is inspired by and +strictly follows the annotation and analysis pattern of MMMU. CMMMU includes +12k manually collected multimodal questions from college exams, quizzes, and +textbooks, covering six core disciplines: Art & Design, Business, Science, +Health & Medicine, Humanities & Social Science, and Tech & Engineering, like +its companion, MMMU. These questions span 30 subjects and comprise 39 highly +heterogeneous image types, such as charts, diagrams, maps, tables, music +sheets, and chemical structures. CMMMU focuses on complex perception and +reasoning with domain-specific knowledge in the Chinese context. We evaluate 11 +open-source LLMs and one proprietary GPT-4V(ision). Even GPT-4V only achieves +accuracies of 42%, indicating a large space for improvement. CMMMU will boost +the community to build the next-generation LMMs towards expert artificial +intelligence and promote the democratization of LMMs by providing diverse +language contexts. + +
+
+
+
+
+ + ♻ ☆ Concept Conductor: Orchestrating Multiple Personalized Concepts in + Text-to-Image Synthesis + + +
+ The customization of text-to-image models has seen significant advancements, +yet generating multiple personalized concepts remains a challenging task. +Current methods struggle with attribute leakage and layout confusion when +handling multiple concepts, leading to reduced concept fidelity and semantic +consistency. In this work, we introduce a novel training-free framework, +Concept Conductor, designed to ensure visual fidelity and correct layout in +multi-concept customization. Concept Conductor isolates the sampling processes +of multiple custom models to prevent attribute leakage between different +concepts and corrects erroneous layouts through self-attention-based spatial +guidance. Additionally, we present a concept injection technique that employs +shape-aware masks to specify the generation area for each concept. This +technique injects the structure and appearance of personalized concepts through +feature fusion in the attention layers, ensuring harmony in the final image. +Extensive qualitative and quantitative experiments demonstrate that Concept +Conductor can consistently generate composite images with accurate layouts +while preserving the visual details of each concept. Compared to existing +baselines, Concept Conductor shows significant performance improvements. Our +method supports the combination of any number of concepts and maintains high +fidelity even when dealing with visually similar concepts. The code and models +are available at https://github.com/Nihukat/Concept-Conductor. + +
+
+ comment: Github Page: https://github.com/Nihukat/Concept-Conductor +
+
+
+
+
+ + ♻ ☆ No Captions, No Problem: Captionless 3D-CLIP Alignment with Hard + Negatives via CLIP Knowledge and LLMs BMVC 2024 + + +
+ In this study, we explore an alternative approach to enhance contrastive +text-image-3D alignment in the absence of textual descriptions for 3D objects. +We introduce two unsupervised methods, $I2I$ and $(I2L)^2$, which leverage CLIP +knowledge about textual and 2D data to compute the neural perceived similarity +between two 3D samples. We employ the proposed methods to mine 3D hard +negatives, establishing a multimodal contrastive pipeline with hard negative +weighting via a custom loss function. We train on different configurations of +the proposed hard negative mining approach, and we evaluate the accuracy of our +models in 3D classification and on the cross-modal retrieval benchmark, testing +image-to-shape and shape-to-image retrieval. Results demonstrate that our +approach, even without explicit text alignment, achieves comparable or superior +performance on zero-shot and standard 3D classification, while significantly +improving both image-to-shape and shape-to-image retrieval compared to previous +methods. + +
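A hard-negative-weighted contrastive objective of the kind described can be sketched as an InfoNCE variant where mined negatives receive per-sample weights; the weighting scheme and loss details here are illustrative, not the paper's $I2I$/$(I2L)^2$ formulation.

```python
# Sketch of an InfoNCE-style loss where mined hard negatives receive larger weights.
# The weighting scheme is illustrative, not the paper's exact custom loss.
import torch
import torch.nn.functional as F

def weighted_contrastive_loss(anchor, positive, negatives, neg_weights, tau: float = 0.07):
    """anchor, positive: (B, D); negatives: (B, K, D); neg_weights: (B, K)."""
    anchor = F.normalize(anchor, dim=-1)
    positive = F.normalize(positive, dim=-1)
    negatives = F.normalize(negatives, dim=-1)
    pos_logit = (anchor * positive).sum(-1, keepdim=True) / tau          # (B, 1)
    neg_logits = torch.einsum("bd,bkd->bk", anchor, negatives) / tau     # (B, K)
    # Up-weight hard negatives by scaling their exponentiated logits (no numerical
    # stabilization here, for brevity).
    denom = pos_logit.exp() + (neg_weights * neg_logits.exp()).sum(-1, keepdim=True)
    return -(pos_logit.exp() / denom).log().mean()
```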
+
+ comment: to be published in BMVC 2024 Proceedings +
+
+
+
+
+ + ♻ ☆ SUM: Saliency Unification through Mamba for Visual Attention Modeling WACV 2025 + + +
+ Visual attention modeling, important for interpreting and prioritizing visual +stimuli, plays a significant role in applications such as marketing, +multimedia, and robotics. Traditional saliency prediction models, especially +those based on Convolutional Neural Networks (CNNs) or Transformers, achieve +notable success by leveraging large-scale annotated datasets. However, the +current state-of-the-art (SOTA) models that use Transformers are +computationally expensive. Additionally, separate models are often required for +each image type, lacking a unified approach. In this paper, we propose Saliency +Unification through Mamba (SUM), a novel approach that integrates the efficient +long-range dependency modeling of Mamba with U-Net to provide a unified model +for diverse image types. Using a novel Conditional Visual State Space (C-VSS) +block, SUM dynamically adapts to various image types, including natural scenes, +web pages, and commercial imagery, ensuring universal applicability across +different data types. Our comprehensive evaluations across five benchmarks +demonstrate that SUM seamlessly adapts to different visual characteristics and +consistently outperforms existing models. These results position SUM as a +versatile and powerful tool for advancing visual attention modeling, offering a +robust solution universally applicable across different types of visual +content. + +
+
+ comment: Accepted at IEEE/CVF WACV 2025 +
+
+
+
+
+ + ♻ ☆ Student Classroom Behavior Detection based on YOLOv7-BRA and Multi-Model + Fusion + + +
+ Accurately detecting student behavior in classroom videos can aid in
+analyzing their classroom performance and improving teaching effectiveness.
+However, the current accuracy rate in behavior detection is low. To address
+this challenge, we propose the Student Classroom Behavior Detection system
+based on YOLOv7-BRA (YOLOv7 with Bi-level Routing Attention). We
+identified eight different behavior patterns, including standing, sitting,
+speaking, listening, walking, raising hands, reading, and writing. We
+constructed a dataset, which contained 11,248 labels and 4,001 images, with an
+emphasis on the common behavior of raising hands in a classroom setting
+(Student Classroom Behavior dataset, SCB-Dataset). To improve detection
+accuracy, we added the biformer attention module to the YOLOv7 network.
+Finally, we fused the results from YOLOv7 CrowdHuman, SlowFast, and DeepSort
+models to obtain student classroom behavior data. We conducted experiments on
+the SCB-Dataset, and YOLOv7-BRA achieved an mAP@0.5 of 87.1%, resulting in a
+2.2% improvement over previous results. Our SCB-dataset can be downloaded from:
+https://github.com/Whiffe/SCB-dataset
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2304.02488 +
+
+
+
+
+ + ♻ ☆ Student Classroom Behavior Detection based on Improved YOLOv7 + + +
+ Accurately detecting student behavior in classroom videos can aid in +analyzing their classroom performance and improving teaching effectiveness. +However, the current accuracy rate in behavior detection is low. To address +this challenge, we propose the Student Classroom Behavior Detection method, +based on improved YOLOv7. First, we created the Student Classroom Behavior +dataset (SCB-Dataset), which includes 18.4k labels and 4.2k images, covering +three behaviors: hand raising, reading, and writing. To improve detection +accuracy in crowded scenes, we integrated the biformer attention module and +Wise-IoU into the YOLOv7 network. Finally, experiments were conducted on the +SCB-Dataset, and the model achieved an mAP@0.5 of 79%, resulting in a 1.8% +improvement over previous results. The SCB-Dataset and code are available for +download at: https://github.com/Whiffe/SCB-dataset. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2305.07825 +
+
+
+
+
+ + ♻ ☆ SCB-Dataset3: A Benchmark for Detecting Student Classroom Behavior + + +
+ The use of deep learning methods to automatically detect students' classroom +behavior is a promising approach for analyzing their class performance and +improving teaching effectiveness. However, the lack of publicly available +datasets on student behavior poses a challenge for researchers in this field. +To address this issue, we propose the Student Classroom Behavior dataset +(SCB-dataset3), which represents real-life scenarios. Our dataset comprises +5686 images with 45578 labels, focusing on six behaviors: hand-raising, +reading, writing, using a phone, bowing the head, and leaning over the table. +We evaluated the dataset using the YOLOv5, YOLOv7, and YOLOv8 algorithms, +achieving a mean average precision (map) of up to 80.3$\%$. We believe that our +dataset can serve as a robust foundation for future research in student +behavior detection and contribute to advancements in this field. Our +SCB-dataset3 is available for download at: +https://github.com/Whiffe/SCB-dataset + +
+
+ comment: arXiv admin note: text overlap with arXiv:2304.02488, + arXiv:2306.03318 +
+
+
+
+
+ + ♻ ☆ A Spatio-Temporal Attention-Based Method for Detecting Student Classroom + Behaviors + + +
+ Accurately detecting student behavior from classroom videos is beneficial for
+analyzing their classroom status and improving teaching efficiency. However,
+low accuracy in student classroom behavior detection is a prevalent issue. To
+address this issue, we propose a Spatio-Temporal Attention-Based Method for
+Detecting Student Classroom Behaviors (BDSTA). Firstly, the SlowFast network is
+used to generate motion and environmental information feature maps from the
+video. Then, the spatio-temporal attention module is applied to the feature
+maps, including information aggregation, compression and stimulation processes.
+Subsequently, attention maps in the time, channel and space dimensions are
+obtained, and multi-label behavior classification is performed based on these
+attention maps. To solve the long-tail data problem that exists in student
+classroom behavior datasets, we use an improved focal loss function to assign
+more weight to the tail class data during training. Experiments are conducted
+on a self-made student classroom behavior dataset named STSCB.
+Compared with the SlowFast model, the average accuracy of student behavior
+classification detection improves by 8.94\% using BDSTA.
+
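The focal-loss reweighting used to counter the long-tail distribution builds on the standard focal loss; the sketch below shows the common multi-label baseline with per-class alpha weights, not the paper's exact improved variant.

```python
# Standard focal loss sketch with per-class alpha weights for long-tail data.
# The paper's "improved" variant is not specified here; this is the common baseline.
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, alpha, gamma: float = 2.0):
    """logits, targets: (B, C) multi-label; alpha: (C,) class weights (larger for tail classes)."""
    p = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)   # probability of the true label
    loss = alpha * (1 - p_t) ** gamma * ce        # down-weight easy examples
    return loss.mean()
```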
+
+
+
+
+ + ♻ ☆ Student Classroom Behavior Detection based on Spatio-Temporal Network + and Multi-Model Fusion + + +
+ Using deep learning methods to detect students' classroom behavior
+automatically is a promising approach for analyzing their class performance and
+improving teaching effectiveness. However, the lack of publicly available
+spatio-temporal datasets on student behavior, as well as the high cost of
+manually labeling such datasets, pose significant challenges for researchers in
+this field. To address this issue, we proposed a method for extending the
+spatio-temporal behavior dataset in Student Classroom Scenarios
+(SCB-ST-Dataset4) through an image dataset. Our SCB-ST-Dataset4 comprises
+757265 images with 25810 labels, focusing on 3 behaviors: hand-raising,
+reading, and writing. Our proposed method can rapidly generate spatio-temporal
+behavior datasets without requiring extra manual labeling. Furthermore, we
+proposed a Behavior Similarity Index (BSI) to explore the similarity of
+behaviors. We evaluated the dataset using the YOLOv5, YOLOv7, YOLOv8, and
+SlowFast algorithms, achieving a mean average precision (map) of up to 82.3%.
+Lastly, we fused multiple models to generate student behavior-related data from
+various perspectives. The experiments further demonstrate the effectiveness of
+our method, and SCB-ST-Dataset4 provides a robust foundation for future
+research in student behavior detection, potentially contributing to
+advancements in this field. The SCB-ST-Dataset4 is available for download at:
+https://github.com/Whiffe/SCB-dataset.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2310.02522; + text overlap with arXiv:2306.03318 +
+
+
+
+
+ + ♻ ☆ SOVC: Subject-Oriented Video Captioning + + +
+ Describing video content according to users' needs is a long-held goal. +Although existing video captioning methods have made significant progress, the +generated captions may not focus on the entity that users are particularly +interested in. To address this problem, we propose a new video captioning task, +Subject-Oriented Video Captioning (SOVC), which aims to allow users to specify +the describing target via a bounding box. To support this task, we construct +two subject-oriented video captioning datasets based on two widely used video +captioning datasets: MSVD and MSRVTT, by annotating subjects in each video for +each caption. These datasets pave the way for describing users' interested +targets. To tackle this task, we introduce a method tailored to this task, +named SOVCNet. It consists of two key components: a subject-oriented sampling +module that samples frames related to the subject to minimize irrelevant +information; and a subject-oriented encoding module that utilizes the subject +areas as hard prompts and integrates learnable soft prompts, enhancing the +model's focus on the subject's activities and facilitating adaptation to the +downstream generation task. Extensive experimental results demonstrate the +effectiveness of our method on this new task. + +
+
+
+
+
+ + ♻ ☆ Boundary Constraint-free Biomechanical Model-Based Surface Matching for + Intraoperative Liver Deformation Correction + + +
+ In image-guided liver surgery, 3D-3D non-rigid registration methods play a +crucial role in estimating the mapping between the preoperative model and the +intraoperative surface represented as point clouds, addressing the challenge of +tissue deformation. Typically, these methods incorporate a biomechanical model, +represented as a finite element model (FEM), used to regularize a surface +matching term. This paper introduces a novel 3D-3D non-rigid registration +method. In contrast to the preceding techniques, our method uniquely +incorporates the FEM within the surface matching term itself, ensuring that the +estimated deformation maintains geometric consistency throughout the +registration process. Additionally, we eliminate the need to determine +zero-boundary conditions and applied force locations in the FEM. We achieve +this by integrating soft springs into the stiffness matrix and allowing forces +to be distributed across the entire liver surface. To further improve +robustness, we introduce a regularization technique focused on the gradient of +the force magnitudes. This regularization imposes spatial smoothness and helps +prevent the overfitting of irregular noise in intraoperative data. Optimization +is achieved through an accelerated proximal gradient algorithm, further +enhanced by our proposed method for determining the optimal step size. Our +method is evaluated and compared to both a learning-based method and a +traditional method that features FEM regularization using data collected on our +custom-developed phantom, as well as two publicly available datasets. Our +method consistently outperforms or is comparable to the baseline techniques. +Both the code and dataset will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ HiPrompt: Tuning-free Higher-Resolution Generation with Hierarchical + MLLM Prompts + + +
+ The potential for higher-resolution image generation using pretrained
+diffusion models is immense, yet these models often struggle with issues of
+object repetition and structural artifacts, especially when scaling to 4K
+resolution and higher. We find that the problem arises because a single prompt
+cannot effectively guide generation at multiple scales. In response, we propose
+HiPrompt, a new tuning-free solution that tackles the above problems by
+introducing hierarchical prompts. The hierarchical prompts offer both global
+and local guidance. Specifically, the global guidance comes from the user input
+that describes the overall content, while the local guidance utilizes
+patch-wise descriptions from MLLMs to elaborately guide the regional structure
+and texture generation. Furthermore, during the inverse denoising process, the
+generated noise is decomposed into low- and high-frequency spatial components.
+These components are conditioned on multiple prompt levels, including detailed
+patch-wise descriptions and broader image-level prompts, facilitating
+prompt-guided denoising under hierarchical semantic guidance. It further allows
+the generation to focus more on local spatial regions and ensures the generated
+images maintain coherent local and global semantics, structures, and textures
+with high definition. Extensive experiments demonstrate that HiPrompt
+outperforms state-of-the-art works in higher-resolution image generation,
+significantly reducing object repetition and enhancing structural quality.
+
+
+ comment: https://liuxinyv.github.io/HiPrompt/ +
+
+
+
+
+ + ♻ ☆ PlaneRecTR++: Unified Query Learning for Joint 3D Planar Reconstruction + and Pose Estimation ICCV 2023 + + +
+ 3D plane reconstruction from images can usually be divided into several
+per-frame sub-tasks (plane detection, segmentation, parameter regression, and
+possibly depth prediction), along with plane correspondence and relative
+camera pose estimation between frames. Previous works tend to divide and
+conquer these sub-tasks with distinct network modules, overall formulated as a
+two-stage paradigm. With an initial camera pose and per-frame plane predictions
+provided from the first stage, exclusively designed modules, potentially
+relying on extra plane correspondence labelling, are applied to merge
+multi-view plane entities and produce a 6DoF camera pose. As none of the
+existing works manage to integrate these closely related sub-tasks into a
+unified framework, instead treating them separately and sequentially, we
+suspect this to be a main source of performance limitation for existing
+approaches. Motivated by this finding and the success of query-based learning
+in enriching reasoning among semantic entities, in this paper, we propose
+PlaneRecTR++, a Transformer-based architecture, which for the first time
+unifies all sub-tasks related to multi-view reconstruction and pose estimation
+within a compact single-stage model, dispensing with initial pose estimation
+and plane correspondence supervision. Extensive quantitative and qualitative
+experiments demonstrate that our proposed unified learning achieves mutual
+benefits across sub-tasks, obtaining a new state-of-the-art performance on
+public ScanNetv1, ScanNetv2, NYUv2-Plane, and MatterPort3D datasets.
+
+
+ comment: Journal extension of our ICCV 2023 paper "PlaneRecTR", which expands + from single view reconstruction to simultaneous multi-view reconstruction and + camera pose estimation. Note that the ICCV23 PlaneRecTR paper could be found + in the previous arxiv version [v2](arXiv:2307.13756v2) +
+
+
+
+
+ + ♻ ☆ Pan-denoising: Guided Hyperspectral Image Denoising via Weighted + Represent Coefficient Total Variation + + +
+ This paper introduces a novel paradigm for hyperspectral image (HSI) +denoising, which is termed \textit{pan-denoising}. In a given scene, +panchromatic (PAN) images capture similar structures and textures to HSIs but +with less noise. This enables the utilization of PAN images to guide the HSI +denoising process. Consequently, pan-denoising, which incorporates an +additional prior, has the potential to uncover underlying structures and +details beyond the internal information modeling of traditional HSI denoising +methods. However, the proper modeling of this additional prior poses a +significant challenge. To alleviate this issue, the paper proposes a novel +regularization term, Panchromatic Weighted Representation Coefficient Total +Variation (PWRCTV). It employs the gradient maps of PAN images to automatically +assign different weights of TV regularization for each pixel, resulting in +larger weights for smooth areas and smaller weights for edges. This +regularization forms the basis of a pan-denoising model, which is solved using +the Alternating Direction Method of Multipliers. Extensive experiments on +synthetic and real-world datasets demonstrate that PWRCTV outperforms several +state-of-the-art methods in terms of metrics and visual quality. Furthermore, +an HSI classification experiment confirms that PWRCTV, as a preprocessing +method, can enhance the performance of downstream classification tasks. The +code and data are available at https://github.com/shuangxu96/PWRCTV. + +
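The core idea of using PAN gradients to set per-pixel TV weights can be sketched as below; the mapping from gradients to weights is an assumption for illustration and not the paper's PWRCTV formula or solver.

```python
# Sketch: per-pixel TV weights derived from a PAN image's gradient magnitude.
# The gradient-to-weight mapping is illustrative, not the paper's formulation.
import numpy as np

def pan_weighted_tv(hsi: np.ndarray, pan: np.ndarray, sigma: float = 0.1) -> float:
    """hsi: (H, W, B) hyperspectral cube; pan: (H, W) panchromatic image."""
    gy, gx = np.gradient(pan)
    grad_mag = np.sqrt(gx**2 + gy**2)
    weights = np.exp(-grad_mag / sigma)   # small weights on edges, large on flat areas
    dy = np.abs(np.diff(hsi, axis=0, append=hsi[-1:, :, :]))
    dx = np.abs(np.diff(hsi, axis=1, append=hsi[:, -1:, :]))
    return float((weights[..., None] * (dx + dy)).sum())
```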
+
+
+
+
+ + ♻ ☆ OAFuser: Towards Omni-Aperture Fusion for Light Field Semantic + Segmentation + + +
+ Light field cameras are capable of capturing intricate angular and spatial
+details. This allows for acquiring complex light patterns and details from
+multiple angles, significantly enhancing the precision of image semantic
+segmentation. However, two significant issues arise: (1) The extensive angular
+information of light field cameras contains a large amount of redundant data,
+which is overwhelming for the limited hardware resources of intelligent agents.
+(2) A relative displacement difference exists in the data collected by
+different micro-lenses. To address these issues, we propose an Omni-Aperture
+Fusion model (OAFuser) that leverages dense context from the central view and
+extracts the angular information from sub-aperture images to generate
+semantically consistent results. To simultaneously streamline the redundant
+information from the light field cameras and avoid feature loss during network
+propagation, we present a simple yet very effective Sub-Aperture Fusion Module
+(SAFM). This module efficiently embeds sub-aperture images in angular features,
+allowing the network to process each sub-aperture image with a minimal
+computational demand of only around 1 GFLOPs. Furthermore, to address the
+mismatched spatial information across viewpoints, we present a Center Angular
+Rectification Module (CARM) to realize feature resorting and prevent feature
+occlusion caused by misalignment. The proposed OAFuser achieves
+state-of-the-art performance on four UrbanLF datasets in terms of all
+evaluation metrics and sets a new record of 84.93% in mIoU on the UrbanLF-Real
+Extended dataset, with a gain of +3.69%. The source code for OAFuser is
+available at https://github.com/FeiBryantkit/OAFuser.
+
+
+ comment: Accepted to IEEE Transactions on Artificial Intelligence (TAI). The + source code is available at https://github.com/FeiBryantkit/OAFuser +
+
+
+
+
+ + ♻ ☆ GLAD: Towards Better Reconstruction with Global and Local Adaptive + Diffusion Models for Unsupervised Anomaly Detection ECCV 2024 + + +
+ Diffusion models have shown superior performance on unsupervised anomaly +detection tasks. Since trained with normal data only, diffusion models tend to +reconstruct normal counterparts of test images with certain noises added. +However, these methods treat all potential anomalies equally, which may cause +two main problems. From the global perspective, the difficulty of +reconstructing images with different anomalies is uneven. Therefore, instead of +utilizing the same setting for all samples, we propose to predict a particular +denoising step for each sample by evaluating the difference between image +contents and the priors extracted from diffusion models. From the local +perspective, reconstructing abnormal regions differs from normal areas even in +the same image. Theoretically, the diffusion model predicts a noise for each +step, typically following a standard Gaussian distribution. However, due to the +difference between the anomaly and its potential normal counterpart, the +predicted noise in abnormal regions will inevitably deviate from the standard +Gaussian distribution. To this end, we propose introducing synthetic abnormal +samples in training to encourage the diffusion models to break through the +limitation of standard Gaussian distribution, and a spatial-adaptive feature +fusion scheme is utilized during inference. With the above modifications, we +propose a global and local adaptive diffusion model (abbreviated to GLAD) for +unsupervised anomaly detection, which introduces appealing flexibility and +achieves anomaly-free reconstruction while retaining as much normal information +as possible. Extensive experiments are conducted on three commonly used anomaly +detection datasets (MVTec-AD, MPDD, and VisA) and a printed circuit board +dataset (PCB-Bank) we integrated, showing the effectiveness of the proposed +method. + +
+
+ comment: Accepted by ECCV 2024, code and models: + https://github.com/hyao1/GLAD. Due to the limitation "The abstract field + cannot be longer than 1,920 characters", the abstract here is shorter than + that in the PDF file +
+
+
+
+
+ + ♻ ☆ W-HMR: Monocular Human Mesh Recovery in World Space with Weak-Supervised + Calibration + + +
+ Previous methods for 3D human motion recovery from monocular images often +fall short due to reliance on camera coordinates, leading to inaccuracies in +real-world applications. The limited availability and diversity of focal length +labels further exacerbate misalignment issues in reconstructed 3D human bodies. +To address these challenges, we introduce W-HMR, a weak-supervised calibration +method that predicts "reasonable" focal lengths based on body distortion +information, eliminating the need for precise focal length labels. Our approach +enhances 2D supervision precision and recovery accuracy. Additionally, we +present the OrientCorrect module, which corrects body orientation for plausible +reconstructions in world space, avoiding the error accumulation associated with +inaccurate camera rotation predictions. Our contributions include a novel +weak-supervised camera calibration technique, an effective orientation +correction module, and a decoupling strategy that significantly improves the +generalizability and accuracy of human motion recovery in both camera and world +coordinates. The robustness of W-HMR is validated through extensive experiments +on various datasets, showcasing its superiority over existing methods. Codes +and demos have been made available on the project page +https://yw0208.github.io/w-hmr/. + +
+
+
+
+
+ + ♻ ☆ iBA: Backdoor Attack on 3D Point Cloud via Reconstructing Itself + + +
+ The widespread deployment of Deep Neural Networks (DNNs) for 3D point cloud
+processing starkly contrasts with their susceptibility to security breaches,
+notably backdoor attacks. These attacks hijack DNNs during training, embedding
+triggers in the data that, once activated, cause the network to make
+predetermined errors while maintaining normal performance on unaltered data.
+This vulnerability poses significant risks, especially given the insufficient
+research on robust defense mechanisms for 3D point cloud networks against such
+sophisticated threats. Existing attacks either struggle to resist basic point
+cloud pre-processing methods, or rely on delicate manual design. Exploring
+simple, effective, imperceptible, and difficult-to-defend triggers in 3D point
+clouds is still challenging. To address these challenges, we introduce
+MirrorAttack, a novel effective 3D backdoor attack method, which implants the
+trigger by simply reconstructing a clean point cloud with an auto-encoder. The
+data-driven nature of MirrorAttack obviates the need for complex manual
+design. Minimizing the reconstruction loss automatically improves
+imperceptibility. Simultaneously, the reconstruction network endows the trigger
+with pronounced nonlinearity and sample specificity, rendering traditional
+preprocessing techniques ineffective in eliminating it. A trigger smoothing
+module based on spherical harmonic transformation is also attached to regulate
+the intensity of the attack. Both quantitative and qualitative results verify
+the effectiveness of our method. We achieve state-of-the-art ASR on different
+types of victim models, even with the intervention of defensive techniques.
+Moreover, the minimal perturbation introduced by our trigger, as assessed by
+various metrics, attests to the method's stealth, ensuring its
+imperceptibility.
+
+
+ comment: 16 pages. in IEEE Transactions on Information Forensics and Security + (2024) +
+
+
+
+
+ + ♻ ☆ Diffusion-Occ: 3D Point Cloud Completion via Occupancy Diffusion + + +
+ Point clouds are crucial for capturing three-dimensional data but often +suffer from incompleteness due to limitations such as resolution and occlusion. +Traditional methods typically rely on point-based approaches within +discriminative frameworks for point cloud completion. In this paper, we +introduce \textbf{Diffusion-Occ}, a novel framework for Diffusion Point Cloud +Completion. Diffusion-Occ utilizes a two-stage coarse-to-fine approach. In the +first stage, the Coarse Density Voxel Prediction Network (CDNet) processes +partial points to predict coarse density voxels, streamlining global feature +extraction through voxel classification, as opposed to previous +regression-based methods. In the second stage, we introduce the Occupancy +Generation Network (OccGen), a conditional occupancy diffusion model based on a +transformer architecture and enhanced by our Point-Voxel Fuse (PVF) block. This +block integrates coarse density voxels with partial points to leverage both +global and local features for comprehensive completion. By thresholding the +occupancy field, we convert it into a complete point cloud. Additionally, our +method employs diverse training mixtures and efficient diffusion +parameterization to enable effective one-step sampling during both training and +inference. Experimental results demonstrate that Diffusion-Occ outperforms +existing discriminative and generative methods. + +
+
+ comment: After a closer examination of our work, we've determined that our + experiments are not thorough and robust enough, possibly impacting the + accuracy of our conclusions. Hence, we've decided to withdraw our article + and, after refining our experiments, intend to resubmit the paper once + significant improvements have been made +
+
+
+
+
+ + ♻ ☆ Animate3D: Animating Any 3D Model with Multi-view Video Diffusion + + +
+ Recent advances in 4D generation mainly focus on generating 4D content by +distilling pre-trained text or single-view image-conditioned models. It is +inconvenient for them to take advantage of various off-the-shelf 3D assets with +multi-view attributes, and their results suffer from spatiotemporal +inconsistency owing to the inherent ambiguity in the supervision signals. In +this work, we present Animate3D, a novel framework for animating any static 3D +model. The core idea is two-fold: 1) We propose a novel multi-view video +diffusion model (MV-VDM) conditioned on multi-view renderings of the static 3D +object, which is trained on our presented large-scale multi-view video dataset +(MV-Video). 2) Based on MV-VDM, we introduce a framework combining +reconstruction and 4D Score Distillation Sampling (4D-SDS) to leverage the +multi-view video diffusion priors for animating 3D objects. Specifically, for +MV-VDM, we design a new spatiotemporal attention module to enhance spatial and +temporal consistency by integrating 3D and video diffusion models. +Additionally, we leverage the static 3D model's multi-view renderings as +conditions to preserve its identity. For animating 3D models, an effective +two-stage pipeline is proposed: we first reconstruct motions directly from +generated multi-view videos, followed by the introduced 4D-SDS to refine both +appearance and motion. Benefiting from accurate motion learning, we could +achieve straightforward mesh animation. Qualitative and quantitative +experiments demonstrate that Animate3D significantly outperforms previous +approaches. Data, code, and models will be open-released. + +
+
+ comment: Project Page: https://animate3d.github.io/ +
+
+
+
+
+ + ♻ ☆ Invariant Causal Knowledge Distillation in Neural Networks + + +
+ Knowledge distillation (KD) involves transferring the knowledge from one +neural network to another, often from a larger, well-trained model (teacher) to +a smaller, more efficient model (student). Traditional KD methods minimize the +Kullback-Leibler (KL) divergence between the probabilistic outputs of the +teacher and student networks. However, this approach often overlooks crucial +structural knowledge embedded within the teacher's network. In this paper, we +introduce Invariant Consistency Distillation (ICD), a novel methodology +designed to enhance KD by ensuring that the student model's representations are +both discriminative and invariant with respect to the teacher's outputs. Our +approach is based on causal inference principles and combines contrastive +learning with an explicit invariance penalty, capturing significantly more +information from the teacher's representation. ICD uses an efficient, +parameter-free approach for flexible teacher-student alignment. We provide a +theoretical foundation for ICD and demonstrate its effectiveness through +extensive experiments. Our results on CIFAR-100 and ImageNet ILSVRC-2012 show +that ICD outperforms traditional KD techniques and surpasses state-of-the-art +methods. In some cases, the student model even exceeds the teacher model in +terms of accuracy. Furthermore, we successfully apply our method to other +datasets, such as Tiny ImageNet and STL-10, demonstrating superior +cross-dataset generalization. Code is available at +https://github.com/giakoumoglou/distillers. + +
+
+ comment: 8 pages, 2 figures, 4 tables. The paper's title has been changed to + better emphasize its theoretical foundation +
+
+
+
+
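+ The classical KD objective that the ICD entry above builds on is the
+ temperature-scaled KL divergence between teacher and student outputs. A minimal
+ sketch of that baseline term is shown below; the contrastive invariance penalty
+ that defines ICD itself is not reproduced here, and all tensor names are
+ illustrative rather than taken from the authors' repository.
+
+ import torch
+ import torch.nn.functional as F
+
+ def kd_kl_loss(student_logits, teacher_logits, temperature=4.0):
+     # Soften both distributions with the same temperature.
+     log_p_student = F.log_softmax(student_logits / temperature, dim=1)
+     p_teacher = F.softmax(teacher_logits / temperature, dim=1)
+     # Batch-mean KL, rescaled by T^2 so gradients keep a comparable magnitude.
+     return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * temperature ** 2
+
+ # Illustrative usage with random logits for a 100-class problem.
+ student_logits = torch.randn(8, 100)
+ teacher_logits = torch.randn(8, 100)
+ print(kd_kl_loss(student_logits, teacher_logits).item())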
+ + ♻ ☆ Relational Representation Distillation + + +
+ Knowledge distillation (KD) is an effective method for transferring knowledge +from a large, well-trained teacher model to a smaller, more efficient student +model. Despite its success, one of the main challenges in KD is ensuring the +efficient transfer of complex knowledge while maintaining the student's +computational efficiency. Unlike previous works that applied contrastive +objectives relying on explicit negative instances while paying little attention to the +relationships between them, we introduce Relational Representation Distillation +(RRD). Our approach leverages pairwise similarities to explore and reinforce +the relationships between the teacher and student models. Inspired by +self-supervised learning principles, it uses a relaxed contrastive loss that +focuses on similarity rather than exact replication. This method aligns the +output distributions of teacher samples in a large memory buffer, improving the +robustness and performance of the student model without the need for strict +negative instance differentiation. Our approach demonstrates superior +performance on CIFAR-100 and ImageNet ILSVRC-2012, outperforming traditional KD +and sometimes even the teacher network when combined with KD. It +also transfers successfully to other datasets like Tiny ImageNet and STL-10. +Code is available at https://github.com/giakoumoglou/distillers. + +
+
+ comment: 8 pages, 4 figures, 4 tables. Updated experiments on ImageNet + ILSVRC-2012. arXiv admin note: text overlap with arXiv:2407.11802 +
+
+
+
+
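+ One way to read the relaxed contrastive loss in the entry above is as matching
+ the student's similarity distribution over a buffer of teacher features to the
+ teacher's own distribution, with no explicit negative separation. The sketch
+ below is an illustrative interpretation under that assumption, not the authors'
+ released implementation (see their repository for the exact loss).
+
+ import torch
+ import torch.nn.functional as F
+
+ def relational_alignment_loss(student_feat, teacher_feat, teacher_buffer, tau=0.1):
+     # Cosine similarities of each sample to a large memory buffer of teacher features.
+     s = F.normalize(student_feat, dim=1)        # (B, D)
+     t = F.normalize(teacher_feat, dim=1)        # (B, D)
+     buf = F.normalize(teacher_buffer, dim=1)    # (M, D)
+     log_p_student = F.log_softmax(s @ buf.t() / tau, dim=1)
+     p_teacher = F.softmax(t @ buf.t() / tau, dim=1)
+     # Cross-entropy between the two similarity distributions: agreement in
+     # relative similarity is rewarded instead of exact feature replication.
+     return -(p_teacher * log_p_student).sum(dim=1).mean()
+
+ student_feat, teacher_feat = torch.randn(16, 128), torch.randn(16, 128)
+ teacher_buffer = torch.randn(1024, 128)         # memory buffer of teacher samples
+ print(relational_alignment_loss(student_feat, teacher_feat, teacher_buffer).item())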
+ + ♻ ☆ A Greedy Hierarchical Approach to Whole-Network Filter-Pruning in CNNs + + +
+ Deep convolutional neural networks (CNNs) have achieved impressive +performance in many computer vision tasks. However, their large model sizes +require heavy computational resources, making pruning redundant filters from +existing pre-trained CNNs an essential task in developing efficient models for +resource-constrained devices. Whole-network filter pruning algorithms prune +varying fractions of filters from each layer, hence providing greater +flexibility. Current whole-network pruning methods are either computationally +expensive due to the need to calculate the loss for each pruned filter using a +training dataset, or use various heuristic / learned criteria for determining +the pruning fractions for each layer. This paper proposes a two-level +hierarchical approach for whole-network filter pruning which is efficient and +uses the classification loss as the final criterion. The lower-level algorithm +(called filter-pruning) uses a sparse-approximation formulation based on linear +approximation of filter weights. We explore two algorithms: orthogonal matching +pursuit-based greedy selection and a greedy backward pruning approach. The +backward pruning algorithm uses a novel closed-form error criterion for +efficiently selecting the optimal filter at each stage, thus making the whole +algorithm much faster. The higher-level algorithm (called layer-selection) +greedily selects the best-pruned layer (pruning using the filter-selection +algorithm) using a global pruning criterion. We propose algorithms for two +different global-pruning criteria: (1) layer-wise relative error (HBGS), and +(2) final classification error (HBGTS). Our suite of algorithms outperforms +state-of-the-art pruning methods on ResNet18, ResNet32, ResNet56, VGG16, and +ResNext101. Our method reduces the RAM requirement for ResNext101 from 7.6 GB +to 1.5 GB and achieves a 94% reduction in FLOPS without losing accuracy on +CIFAR-10. + +
+
+ comment: Accepted in TMLR 2024 +
+
+
+
+
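+ The lower-level filter-pruning step above is described as sparse approximation
+ over linearized filter weights, with an orthogonal matching pursuit (OMP) style
+ greedy selection as one option. The routine below is a textbook simultaneous-OMP
+ sketch over flattened filters, kept only to illustrate the idea; it is not the
+ paper's code, and the closed-form backward-pruning criterion is omitted.
+
+ import numpy as np
+
+ def omp_select_filters(filters, k):
+     # Greedily pick k filters whose span best reconstructs the whole layer
+     # (all filters flattened to vectors) in a least-squares sense.
+     W = filters.reshape(filters.shape[0], -1).astype(np.float64)   # (n_filters, d)
+     residual = W.copy()
+     selected = []
+     for _ in range(k):
+         # Score every filter by its total correlation with the current residual.
+         scores = np.abs(residual @ W.T).sum(axis=0)
+         scores[selected] = -np.inf
+         selected.append(int(np.argmax(scores)))
+         # Re-fit all filters on the selected subset and update the residual.
+         basis = W[selected].T                                       # (d, |S|)
+         coeffs, *_ = np.linalg.lstsq(basis, W.T, rcond=None)
+         residual = W - (basis @ coeffs).T
+     return selected
+
+ weights = np.random.randn(64, 3, 3, 3)   # e.g. 64 conv filters of shape 3x3x3
+ print(omp_select_filters(weights, k=16))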
+ + ♻ ☆ mPLUG-DocOwl2: High-resolution Compressing for OCR-free Multi-page + Document Understanding + + +
+ Multimodal Large Language Models (MLLMs) have achieved promising OCR-free +Document Understanding performance by increasing the supported resolution of +document images. However, this comes at the cost of generating thousands of +visual tokens for a single document image, leading to excessive GPU memory and +slower inference times, particularly in multi-page document comprehension. In +this work, to address these challenges, we propose a High-resolution +DocCompressor module to compress each high-resolution document image into 324 +tokens, guided by low-resolution global visual features. With this compression +module, to strengthen multi-page document comprehension ability and balance +both token efficiency and question-answering performance, we develop +DocOwl2 under a three-stage training framework: Single-image Pretraining, +Multi-image Continue-pretraining, and Multi-task Finetuning. DocOwl2 sets a new +state-of-the-art across multi-page document understanding benchmarks and +reduces first token latency by more than 50%, demonstrating advanced +capabilities in multi-page question answering, explanation with evidence +pages, and cross-page structure understanding. Additionally, compared to +single-image MLLMs trained on similar data, our DocOwl2 achieves comparable +single-page understanding performance with less than 20% of the visual tokens. +Our codes, models, and data are publicly available at +https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/DocOwl2. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ SiamMo: Siamese Motion-Centric 3D Object Tracking + + +
+ Current 3D single object tracking methods primarily rely on the Siamese +matching-based paradigm, which struggles with textureless and incomplete LiDAR +point clouds. Conversely, the motion-centric paradigm avoids appearance +matching, thus overcoming these issues. However, its complex multi-stage +pipeline and the limited temporal modeling capability of a single-stream +architecture constrain its potential. In this paper, we introduce SiamMo, a +novel and simple Siamese motion-centric tracking approach. Unlike the +traditional single-stream architecture, we employ Siamese feature extraction +for motion-centric tracking. This decouples feature extraction from temporal +fusion, significantly enhancing tracking performance. Additionally, we design a +Spatio-Temporal Feature Aggregation module to integrate Siamese features at +multiple scales, capturing motion information effectively. We also introduce a +Box-aware Feature Encoding module to encode object size priors into motion +estimation. SiamMo is a purely motion-centric tracker that eliminates the need +for additional processes like segmentation and box refinement. Without bells +and whistles, SiamMo not only surpasses state-of-the-art methods across multiple +benchmarks but also demonstrates exceptional robustness in challenging +scenarios. SiamMo sets a new record on the KITTI tracking benchmark with 90.1\% +precision while maintaining a high inference speed of 108 FPS. The code will be +released at https://github.com/HDU-VRLab/SiamMo. + +
+
+
+
+
+ + ♻ ☆ TPA3D: Triplane Attention for Fast Text-to-3D Generation ECCV2024 + + +
+ Due to the lack of large-scale text-3D correspondence data, recent text-to-3D +generation works mainly rely on utilizing 2D diffusion models for synthesizing +3D data. Since diffusion-based methods typically require significant +optimization time for both training and inference, the use of GAN-based models +would still be desirable for fast 3D generation. In this work, we propose +Triplane Attention for text-guided 3D generation (TPA3D), an end-to-end +trainable GAN-based deep learning model for fast text-to-3D generation. With +only 3D shape data and their rendered 2D images observed during training, our +TPA3D is designed to retrieve detailed visual descriptions for synthesizing the +corresponding 3D mesh data. This is achieved by the proposed attention +mechanisms on the extracted sentence and word-level text features. In our +experiments, we show that TPA3D generates high-quality 3D textured shapes +aligned with fine-grained descriptions, while impressive computation efficiency +can be observed. + +
+
+ comment: ECCV2024, Project Page: https://redxouls.github.io/TPA3D/ +
+
+
+
+
+ + ♻ ☆ EventZoom: A Progressive Approach to Event-Based Data Augmentation for + Enhanced Neuromorphic Vision + + +
+ Dynamic Vision Sensors (DVS) capture event data with high temporal resolution +and low power consumption, presenting a more efficient solution for visual +processing in dynamic and real-time scenarios compared to conventional video +capture methods. Event data augmentation serves as an essential method for +overcoming the limitations of scale and diversity in event datasets. Our +comparative experiments demonstrate that the two factors, spatial integrity and +temporal continuity, can significantly affect the capacity of event data +augmentation, as they guarantee the preservation of the sparsity and high dynamic +range characteristics unique to event data. However, existing augmentation +methods often neglect the preservation of spatial integrity and temporal +continuity. To address this, we developed a novel event data augmentation +strategy, EventZoom, which employs a temporal progressive strategy, embedding +transformed samples into the original samples through progressive scaling and +shifting. The scaling process avoids the spatial information loss associated +with cropping, while the progressive strategy prevents interruptions or abrupt +changes in temporal information. We validated EventZoom across various +supervised learning frameworks. The experimental results show that EventZoom +consistently outperforms existing event data augmentation methods with SOTA +performance. For the first time, we have concurrently employed semi-supervised +and unsupervised learning to verify the feasibility of event augmentation +algorithms, demonstrating the applicability and effectiveness of EventZoom as a +powerful event-based data augmentation tool for handling real-world scenes with +highly dynamic and variable environments. + +
+
+
+
+
+ + ♻ ☆ UV-Mamba: A DCN-Enhanced State Space Model for Urban Village Boundary + Identification in High-Resolution Remote Sensing Images + + +
+ Due to the diverse geographical environments, intricate landscapes, and +high-density settlements, the automatic identification of urban village +boundaries using remote sensing images remains a highly challenging task. This +paper proposes a novel and efficient neural network model called UV-Mamba for +accurate boundary detection in high-resolution remote sensing images. UV-Mamba +mitigates the memory loss problem in lengthy sequence modeling, which arises in +state space models with increasing image size, by incorporating deformable +convolutions. Its architecture utilizes an encoder-decoder framework and +includes an encoder with four deformable state space augmentation blocks for +efficient multi-level semantic extraction and a decoder to integrate the +extracted semantic information. We conducted experiments on two large datasets +showing that UV-Mamba achieves state-of-the-art performance. Specifically, our +model achieves 73.3% and 78.1% IoU on the Beijing and Xi'an datasets, +respectively, representing improvements of 1.2% and 3.4% IoU over the previous +best model while also being 6x faster in inference speed and 40x smaller in +parameter count. Source code and pre-trained models are available at +https://github.com/Devin-Egber/UV-Mamba. + +
+
+ comment: 5 pages, 4 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Fine-Grained Building Function Recognition from Street-View Images via + Geometry-Aware Semi-Supervised Learning + + +
+ In this work, we propose a geometry-aware semi-supervised framework for +fine-grained building function recognition, utilizing geometric relationships +among multi-source data to enhance pseudo-label accuracy in semi-supervised +learning, broadening its applicability to various building function +categorization systems. Firstly, we design an online semi-supervised +pre-training stage, which facilitates the precise acquisition of building +facade location information in street-view images. In the second stage, we +propose a geometry-aware coarse annotation generation module. This module +effectively combines GIS data and street-view data based on the geometric +relationships, improving the accuracy of pseudo annotations. In the third +stage, we combine the newly generated coarse annotations with the existing +labeled dataset to achieve fine-grained functional recognition of buildings +across multiple cities at a large scale. Extensive experiments demonstrate that +our proposed framework exhibits superior performance in fine-grained functional +recognition of buildings. Within the same categorization system, it achieves +improvements of 7.6\% and 4.8\% compared to fully-supervised methods and +state-of-the-art semi-supervised methods, respectively. Additionally, our +method also performs well in cross-city scenarios, i.e., extending the model +trained on OmniCity (New York) to new cities (i.e., Los Angeles and Boston) +with different building function categorization systems. This study offers a +new solution for large-scale multi-city applications with minimal annotation +requirements, facilitating more efficient data updates and resource allocation +in urban management. + +
+
+ comment: This paper is currently under review +
+
+
+
+
+ + ♻ ☆ INK: Inheritable Natural Backdoor Attack Against Model Distillation + + +
+ Deep learning models are vulnerable to backdoor attacks, where attackers +inject malicious behavior through data poisoning and later exploit triggers to +manipulate deployed models. To improve the stealth and effectiveness of +backdoors, prior studies have introduced various imperceptible attack methods +targeting both defense mechanisms and manual inspection. However, all +poisoning-based attacks still rely on privileged access to the training +dataset. Consequently, model distillation using a trusted dataset has emerged +as an effective defense against these attacks. To bridge this gap, we introduce +INK, an inheritable natural backdoor attack that targets model distillation. +The key insight behind INK is the use of naturally occurring statistical +features in all datasets, allowing attackers to leverage them as backdoor +triggers without direct access to the training data. Specifically, INK employs +image variance as a backdoor trigger and enables both clean-image and +clean-label attacks by manipulating the labels and image variance in an +unauthenticated dataset. Once the backdoor is embedded, it transfers from the +teacher model to the student model, even when defenders use a trusted dataset +for distillation. Theoretical analysis and experimental results demonstrate the +robustness of INK against transformation-based, search-based, and +distillation-based defenses. For instance, INK maintains an attack success rate +of over 98\% post-distillation, compared to an average success rate of 1.4\% +for existing methods. + +
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Data standardization for robust lip sync + + +
+ Lip sync is a fundamental audio-visual task. However, existing lip sync +methods fall short of being robust in the wild. One important cause could be +distracting factors on the visual input side, making extracting lip motion +information difficult. To address these issues, this paper proposes a data +standardization pipeline to standardize the visual input for lip sync. Based on +recent advances in 3D face reconstruction, we first create a model that can +consistently disentangle lip motion information from the raw images. Then, +standardized images are synthesized with disentangled lip motion information, +with all other attributes related to distracting factors set to predefined +values independent of the input, to reduce their effects. Using synthesized +images, existing lip sync methods improve their data efficiency and robustness, +and they achieve competitive performance for the active speaker detection task. + +
+
+
+
+
+ + ♻ ☆ 360VFI: A Dataset and Benchmark for Omnidirectional Video Frame + Interpolation + + +
+ Head-mounted 360{\deg} displays and portable 360{\deg} cameras have +significantly progressed, providing viewers a realistic and immersive +experience. However, many omnidirectional videos have low frame rates that can +lead to visual fatigue, and the prevailing plane frame interpolation +methodologies are unsuitable for omnidirectional video interpolation because +they are designed solely for traditional videos. This paper introduces the +benchmark dataset, 360VFI, for Omnidirectional Video Frame Interpolation. We +present a practical implementation that introduces a distortion prior from +omnidirectional video into the network to modulate distortions. Specifically, +we propose a pyramid distortion-sensitive feature extractor that uses the +unique characteristics of equirectangular projection (ERP) format as prior +information. Moreover, we devise a decoder that uses an affine transformation +to further facilitate the synthesis of intermediate frames. 360VFI is the first +dataset and benchmark that explores the challenge of Omnidirectional Video +Frame Interpolation. Through our benchmark analysis, we present four different +distortion condition scenes in the proposed 360VFI dataset to evaluate the +challenges triggered by distortion during interpolation. Besides, experimental +results demonstrate that Omnidirectional Video Interpolation can be effectively +improved by modeling for omnidirectional distortion. + +
+
+ comment: This is a preprint version +
+
+
+
+
+ + ♻ ☆ Diffusion Cocktail: Mixing Domain-Specific Diffusion Models for + Diversified Image Generations + + +
+ Diffusion models, capable of high-quality image generation, receive +unparalleled popularity for their ease of extension. Active users have created +a massive collection of domain-specific diffusion models by fine-tuning base +models on self-collected datasets. Recent work has focused on improving a +single diffusion model by uncovering semantic and visual information encoded in +various architecture components. However, those methods overlook the vastly +available set of fine-tuned diffusion models and, therefore, miss the +opportunity to utilize their combined capacity for novel generation. In this +work, we propose Diffusion Cocktail (Ditail), a training-free method that +transfers style and content information between multiple diffusion models. This +allows us to perform diversified generations using a set of diffusion models, +resulting in novel images unobtainable by a single model. Ditail also offers +fine-grained control of the generation process, which enables flexible +manipulations of styles and contents. With these properties, Ditail excels in +numerous applications, including style transfer guided by diffusion models, +novel-style image generation, and image manipulation via prompts or collage +inputs. + +
+
+ comment: Project Page: https://maps-research.github.io/Ditail/ +
+
+
+
+
+ + ♻ ☆ Diagram Formalization Enhanced Multi-Modal Geometry Problem Solver + + +
+ Mathematical reasoning remains an ongoing challenge for AI models, especially +for geometry problems that require both linguistic and visual signals. As the +vision encoders of most MLLMs are trained on natural scenes, they often +struggle to understand geometric diagrams, performing no better in geometry +problem solving than LLMs that only process text. This limitation is amplified +by the lack of effective methods for representing geometric relationships. To +address these issues, we introduce the Diagram Formalization Enhanced Geometry +Problem Solver (DFE-GPS), a new framework that integrates visual features, +geometric formal language, and natural language representations. We propose a +novel synthetic data approach and create a large-scale geometric dataset, +SynthGeo228K, annotated with both formal and natural language captions, +designed to enhance the vision encoder for a better understanding of geometric +structures. Our framework improves MLLMs' ability to process geometric diagrams +and extends their application to open-ended tasks on the formalgeo7k dataset. + +
+
+
+
+
+ + ♻ ☆ Scaling Diffusion Transformers to 16 Billion Parameters + + +
+ In this paper, we present DiT-MoE, a sparse version of the diffusion +Transformer that is scalable and competitive with dense networks while +exhibiting highly optimized inference. The DiT-MoE includes two simple designs: +shared expert routing and expert-level balance loss, thereby capturing common +knowledge and reducing redundancy among the different routed experts. When +applied to conditional image generation, a deep analysis of expert +specialization yields some interesting observations: (i) Expert selection shows a +preference for spatial position and denoising time step, while being insensitive +to different class-conditional information; (ii) As the MoE layers go deeper, +the selection of experts gradually shifts from specific spatial positions to +dispersion and balance; (iii) Expert specialization tends to be more +concentrated at early time steps and then becomes gradually uniform after the halfway point. We +attribute this to the diffusion process, which first models the low-frequency +spatial information and then the high-frequency complex information. Based on the +above guidance, the DiT-MoE series experimentally achieves performance on par +with dense networks yet requires much less computational load during inference. +More encouragingly, we demonstrate the potential of DiT-MoE with synthesized +image data, scaling the diffusion model to 16.5B parameters and attaining a new +SoTA FID-50K score of 1.80 in the 512$\times$512 resolution setting. The project +page: https://github.com/feizc/DiT-MoE. + +
+
+
+
+
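+ The two ingredients named above for DiT-MoE, a shared expert alongside routed
+ experts and an expert-level balance loss, follow a common mixture-of-experts
+ pattern. The layer below is a generic sketch of that pattern with a standard
+ Switch-style load-balancing term; it is an assumption about the general recipe,
+ not the released DiT-MoE code, and all sizes are illustrative.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class MoEFFN(nn.Module):
+     def __init__(self, dim=64, hidden=256, n_experts=8, top_k=2):
+         super().__init__()
+         self.top_k = top_k
+         self.router = nn.Linear(dim, n_experts)
+         self.experts = nn.ModuleList(
+             nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
+             for _ in range(n_experts))
+         # A shared expert processes every token, capturing common knowledge.
+         self.shared_expert = nn.Sequential(
+             nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
+
+     def forward(self, x):                                   # x: (tokens, dim)
+         probs = F.softmax(self.router(x), dim=-1)           # (T, E)
+         top_p, top_i = probs.topk(self.top_k, dim=-1)       # (T, k)
+         out = self.shared_expert(x)
+         for slot in range(self.top_k):
+             for e, expert in enumerate(self.experts):
+                 idx = (top_i[:, slot] == e).nonzero(as_tuple=True)[0]
+                 if idx.numel():
+                     out = out.index_add(0, idx, top_p[idx, slot].unsqueeze(1) * expert(x[idx]))
+         # Expert-level balance loss: penalize routers that overload a few experts.
+         load = F.one_hot(top_i[:, 0], len(self.experts)).float().mean(0)   # token fraction
+         importance = probs.mean(0)                                         # mean router prob
+         balance_loss = (load * importance).sum() * len(self.experts)
+         return out, balance_loss
+
+ layer = MoEFFN()
+ tokens = torch.randn(32, 64)
+ y, aux = layer(tokens)
+ print(y.shape, aux.item())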
+ + ♻ ☆ CLAMP-ViT: Contrastive Data-Free Learning for Adaptive Post-Training + Quantization of ViTs ECCV 2024 + + +
+ We present CLAMP-ViT, a data-free post-training quantization method for +vision transformers (ViTs). We identify the limitations of recent techniques, +notably their inability to leverage meaningful inter-patch relationships, +leading to the generation of simplistic and semantically vague data, impacting +quantization accuracy. CLAMP-ViT employs a two-stage approach, cyclically +adapting between data generation and model quantization. Specifically, we +incorporate a patch-level contrastive learning scheme to generate richer, +semantically meaningful data. Furthermore, we leverage contrastive learning in +layer-wise evolutionary search for fixed- and mixed-precision quantization to +identify optimal quantization parameters while mitigating the effects of a +non-smooth loss landscape. Extensive evaluations across various vision tasks +demonstrate the superiority of CLAMP-ViT, with performance improvements of up +to 3% in top-1 accuracy for classification, 0.6 mAP for object detection, and +1.5 mIoU for segmentation at similar or better compression ratio over existing +alternatives. Code is available at +https://github.com/georgia-tech-synergy-lab/CLAMP-ViT.git + +
+
+ comment: ECCV 2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ Benchmarking Chinese Knowledge Rectification in Large Language Models + + +
+ While Large Language Models (LLMs) exhibit remarkable generative +capabilities, they are not without flaws, particularly in the form of +hallucinations. This issue is even more pronounced when LLMs are applied to +specific languages and domains. For example, LLMs may generate nonsense +information when handling ancient Chinese poetry, proverbs, or idioms, owing to +the lack of specific knowledge. To this end, this paper introduces a benchmark +for rectifying Chinese knowledge in LLMs via knowledge editing. Specifically, +we introduce a new Chinese dataset, CKnowEdit, by collecting seven types of +knowledge from various sources, including classical texts, idioms, and content +from Baidu Tieba Ruozhiba, thereby accounting for the unique polyphony, +antithesis, and logical constructs inherent in the Chinese language. Through +the analysis of this dataset, we uncover the challenges faced by current LLMs +in mastering Chinese. Furthermore, our evaluation of state-of-the-art knowledge +editing techniques on this dataset unveils the substantial scope for advancement +in the rectification of Chinese knowledge. Code and dataset are available at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: Ongoing work; code and dataset are available at + https://github.com/zjunlp/EasyEdit +
+
+
+
+
+ + ☆ Extracting the U.S. building types from OpenStreetMap data + + +
+ Building type information is crucial for population estimation, traffic +planning, urban planning, and emergency response applications. Although +essential, such data is often not readily available. To alleviate this problem, +this work creates a comprehensive dataset by providing +residential/non-residential building classification covering the entire United +States. We propose and utilize an unsupervised machine learning method to +classify building types based on building footprints and available +OpenStreetMap information. The classification result is validated using +authoritative ground truth data for select counties in the U.S. The validation +shows a high precision for non-residential building classification and a high +recall for residential buildings. We identified various approaches to improving +the quality of the classification, such as removing sheds and garages from the +dataset. Furthermore, analyzing the misclassifications revealed that they are +mainly due to missing and scarce metadata in OSM. A major result of this work +is the resulting dataset, which classifies 67,705,475 buildings. We hope that this +data is of value to the scientific community, including urban and +transportation planners. + +
+
+
+
+
+ + ☆ RegNLP in Action: Facilitating Compliance Through Automated Information + Retrieval and Answer Generation + + +
+ Regulatory documents, issued by governmental regulatory bodies, establish +rules, guidelines, and standards that organizations must adhere to for legal +compliance. These documents, characterized by their length, complexity and +frequent updates, are challenging to interpret, requiring significant +allocation of time and expertise on the part of organizations to ensure ongoing +compliance. Regulatory Natural Language Processing (RegNLP) is a +multidisciplinary subfield aimed at simplifying access to and interpretation of +regulatory rules and obligations. We define an Automated Question-Passage +Generation task for RegNLP, create the ObliQA dataset containing 27,869 +questions derived from the Abu Dhabi Global Markets (ADGM) financial regulation +document collection, design a baseline Regulatory Information Retrieval and +Answer Generation system, and evaluate it with RePASs, a novel evaluation +metric that tests whether generated answers accurately capture all relevant +obligations and avoid contradictions. + +
+
+
+
+
+ + ☆ Enhancing Graph Contrastive Learning with Reliable and Informative + Augmentation for Recommendation + + +
+ Graph neural network (GNN) has been a powerful approach in collaborative +filtering (CF) due to its ability to model high-order user-item relationships. +Recently, to alleviate the data sparsity and enhance representation learning, +many efforts have been conducted to integrate contrastive learning (CL) with +GNNs. Despite the promising improvements, the contrastive view generation based +on structure and representation perturbations in existing methods potentially +disrupts the collaborative information in contrastive views, resulting in +limited effectiveness of positive alignment. To overcome this issue, we propose +CoGCL, a novel framework that aims to enhance graph contrastive learning by +constructing contrastive views with stronger collaborative information via +discrete codes. The core idea is to map users and items into discrete codes +rich in collaborative information for reliable and informative contrastive view +generation. To this end, we initially introduce a multi-level vector quantizer +in an end-to-end manner to quantize user and item representations into discrete +codes. Based on these discrete codes, we enhance the collaborative information +of contrastive views by considering neighborhood structure and semantic +relevance respectively. For neighborhood structure, we propose virtual neighbor +augmentation by treating discrete codes as virtual neighbors, which expands an +observed user-item interaction into multiple edges involving discrete codes. +Regarding semantic relevance, we identify similar users/items based on shared +discrete codes and interaction targets to generate the semantically relevant +view. Through these strategies, we construct contrastive views with stronger +collaborative information and develop a triple-view graph contrastive learning +approach. Extensive experiments on four public datasets demonstrate the +effectiveness of our proposed approach. + +
+
+
+
+
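+ The multi-level vector quantizer described above maps user and item
+ representations to discrete codes; residual quantization is one common way to
+ realize such multi-level codes. The snippet below is a hedged illustration of
+ that assignment step with random codebooks, whereas CoGCL learns the quantizer
+ end to end together with the recommender.
+
+ import torch
+
+ def residual_quantize(embeddings, codebooks):
+     # Assign one code per level by nearest codeword on the running residual.
+     residual = embeddings
+     codes = []
+     for codebook in codebooks:                    # each codebook: (n_codes, dim)
+         idx = torch.cdist(residual, codebook).argmin(dim=1)
+         codes.append(idx)
+         residual = residual - codebook[idx]       # the next level quantizes what is left
+     return torch.stack(codes, dim=1)              # (n_items, n_levels) discrete codes
+
+ item_emb = torch.randn(1000, 64)
+ codebooks = [torch.randn(256, 64) for _ in range(3)]    # 3 levels, 256 codes each
+ print(residual_quantize(item_emb, codebooks)[:5])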
+ + ☆ Rs4rs: Semantically Find Recent Publications from Top Recommendation + System-Related Venues + + +
+ Rs4rs is a web application designed to perform semantic search on recent +papers from top conferences and journals related to Recommender Systems. +Current scholarly search engine tools like Google Scholar, Semantic Scholar, +and ResearchGate often yield broad results that fail to target the most +relevant high-quality publications. Moreover, manually visiting individual +conference and journal websites is a time-consuming process that primarily +supports only syntactic searches. Rs4rs addresses these issues by providing a +user-friendly platform where researchers can input their topic of interest and +receive a list of recent, relevant papers from top Recommender Systems venues. +Utilizing semantic search techniques, Rs4rs ensures that the search results are +not only precise and relevant but also comprehensive, capturing papers +regardless of variations in wording. This tool significantly enhances research +efficiency and accuracy, thereby benefitting the research community and public +by facilitating access to high-quality, pertinent academic resources in the +field of Recommender Systems. Rs4rs is available at https://rs4rs.com. + +
+
+
+
+
+ + ☆ End-to-End Learnable Item Tokenization for Generative Recommendation + + +
+ Recently, generative recommendation has emerged as a promising new paradigm +that directly generates item identifiers for recommendation. However, a key +challenge lies in how to effectively construct item identifiers that are +suitable for recommender systems. Existing methods typically decouple item +tokenization from subsequent generative recommendation training, likely +resulting in suboptimal performance. To address this limitation, we propose +ETEGRec, a novel End-To-End Generative Recommender by seamlessly integrating +item tokenization and generative recommendation. Our framework is developed +based on the dual encoder-decoder architecture, which consists of an item +tokenizer and a generative recommender. In order to achieve mutual enhancement +between the two components, we propose a recommendation-oriented alignment +approach by devising two specific optimization objectives: sequence-item +alignment and preference-semantic alignment. These two alignment objectives can +effectively couple the learning of item tokenizer and generative recommender, +thereby fostering the mutual enhancement between the two components. Finally, +we further devise an alternating optimization method, to facilitate stable and +effective end-to-end learning of the entire framework. Extensive experiments +demonstrate the effectiveness of our proposed framework compared to a series of +traditional sequential recommendation models and generative recommendation +baselines. + +
+
+
+
+
+ + ☆ RBoard: A Unified Platform for Reproducible and Reusable Recommender + System Benchmarks + + +
+ Recommender systems research lacks standardized benchmarks for +reproducibility and algorithm comparisons. We introduce RBoard, a novel +framework addressing these challenges by providing a comprehensive platform for +benchmarking diverse recommendation tasks, including CTR prediction, Top-N +recommendation, and others. RBoard's primary objective is to enable fully +reproducible and reusable experiments across these scenarios. The framework +evaluates algorithms across multiple datasets within each task, aggregating +results for a holistic performance assessment. It implements standardized +evaluation protocols, ensuring consistency and comparability. To facilitate +reproducibility, all user-provided code can be easily downloaded and executed, +allowing researchers to reliably replicate studies and build upon previous +work. By offering a unified platform for rigorous, reproducible evaluation +across various recommendation scenarios, RBoard aims to accelerate progress in +the field and establish a new standard for recommender systems benchmarking in +both academia and industry. The platform is available at https://rboard.org and +the demo video can be found at https://bit.ly/rboard-demo. + +
+
+
+
+
+ + ☆ DatAasee -- A Metadata-Lake as Metadata Catalog for a Virtual Data-Lake + + +
+ Metadata management for distributed data sources is a long-standing but +ever-growing problem. To counter this challenge in a research-data and +library-oriented setting, this work constructs a data architecture, derived +from the data-lake: the metadata-lake. A proof-of-concept implementation of +this proposed metadata system is presented and evaluated as well. + +
+
+
+
+
+ + ☆ Federated Transfer Learning Based Cooperative Wideband Spectrum Sensing + with Model Pruning + + +
+ For ultra-wideband and high-rate wireless communication systems, wideband +spectrum sensing (WSS) is critical, since it empowers secondary users (SUs) to +capture the spectrum holes for opportunistic transmission. However, WSS +encounters challenges such as excessive costs of hardware and computation due +to the high sampling rate, as well as robustness issues arising from scenario +mismatch. In this paper, a WSS neural network (WSSNet) is proposed by +exploiting multicoset preprocessing to enable sub-Nyquist sampling, with +a two-dimensional convolution design specifically tailored to work with the +preprocessed samples. A federated transfer learning (FTL) based framework +mobilizing multiple SUs is further developed to achieve a robust model +adaptable to various scenarios, which is enabled by selective weight pruning +for fast model adaptation and inference. Simulation results demonstrate +that the proposed FTL-WSSNet achieves fairly good performance in different +target scenarios even without local adaptation samples. + +
+
+
+
+
+ + ☆ Recommender Systems Algorithm Selection for Ranking Prediction on + Implicit Feedback Datasets + + +
+ The recommender systems algorithm selection problem for ranking prediction on +implicit feedback datasets is under-explored. Traditional approaches in +recommender systems algorithm selection focus predominantly on rating +prediction on explicit feedback datasets, leaving a research gap for ranking +prediction on implicit feedback datasets. Algorithm selection is a critical +challenge for nearly every practitioner in recommender systems. In this work, +we take the first steps toward addressing this research gap. We evaluate the +NDCG@10 of 24 recommender systems algorithms, each with two hyperparameter +configurations, on 72 recommender systems datasets. We train four optimized +machine-learning meta-models and one automated machine-learning meta-model with +three different settings on the resulting meta-dataset. Our results show that +the predictions of all tested meta-models exhibit a median Spearman correlation +ranging from 0.857 to 0.918 with the ground truth. We show that the median +Spearman correlation between meta-model predictions and the ground truth +increases by an average of 0.124 when the meta-model is optimized to predict +the ranking of algorithms instead of their performance. Furthermore, in terms +of predicting the best algorithm for an unknown dataset, we demonstrate that +the best optimized traditional meta-model, e.g., XGBoost, achieves a recall of +48.6%, outperforming the best tested automated machine learning meta-model, +e.g., AutoGluon, which achieves a recall of 47.2%. + +
+
+ comment: Accepted for presentation at the 18th ACM Conference on Recommender + Systems in the Late-Breaking Results Track +
+
+
+
+
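+ The evaluation protocol described above, per-dataset Spearman correlation
+ between a meta-model's predicted algorithm quality and the measured NDCG@10,
+ plus recall of picking the single best algorithm, is easy to reproduce in
+ outline. The data below is random and purely illustrative of the bookkeeping,
+ not of the reported numbers.
+
+ import numpy as np
+ from scipy.stats import spearmanr
+
+ rng = np.random.default_rng(0)
+ n_datasets, n_algorithms = 72, 48        # 24 algorithms x 2 hyperparameter configs
+ true_ndcg = rng.random((n_datasets, n_algorithms))                   # measured NDCG@10
+ predicted = true_ndcg + 0.1 * rng.standard_normal(true_ndcg.shape)   # meta-model output
+
+ # Per-dataset rank agreement between predicted and measured algorithm quality.
+ correlations = []
+ for i in range(n_datasets):
+     rho, _ = spearmanr(true_ndcg[i], predicted[i])
+     correlations.append(rho)
+ print("median Spearman:", np.median(correlations))
+
+ # Recall of identifying the best algorithm for each (held-out) dataset.
+ recall_best = np.mean(true_ndcg.argmax(axis=1) == predicted.argmax(axis=1))
+ print("best-algorithm recall:", recall_best)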
+ + ☆ Replicability Measures for Longitudinal Information Retrieval Evaluation + + +
+ Information Retrieval (IR) systems are exposed to constant changes in most +components. Documents are created, updated, or deleted, the information needs +are changing, and even relevance might not be static. While it is generally +expected that the IR systems retain a consistent utility for the users, test +collection evaluations rely on a fixed experimental setup. Based on the +LongEval shared task and test collection, this work explores how the +effectiveness measured in evolving experiments can be assessed. Specifically, +the persistency of effectiveness is investigated as a replicability task. It is +observed how the effectiveness progressively deteriorates over time compared to +the initial measurement. Employing adapted replicability measures provides +further insight into the persistence of effectiveness. The ranking of systems +varies across retrieval measures and time. In conclusion, it was found that the +most effective systems are not necessarily the ones with the most persistent +performance. + +
+
+ comment: Experimental IR Meets Multilinguality, Multimodality, and Interaction + - 15th International Conference of the CLEF Association, CLEF 2024, Grenoble, + France, September 9-12, 2024, Proceedings. arXiv admin note: text overlap + with arXiv:2308.10549 +
+
+
+
+
+ + ☆ A Survey of Multimodal Composite Editing and Retrieval + + +
+ In the real world, where information is abundant and diverse across different +modalities, understanding and utilizing various data types to improve retrieval +systems is a key focus of research. Multimodal composite retrieval integrates +diverse modalities such as text, images, and audio to provide more +accurate, personalized, and contextually relevant results. To facilitate a +deeper understanding of this promising direction, this survey explores +multimodal composite editing and retrieval in depth, covering image-text +composite editing, image-text composite retrieval, and other multimodal +composite retrieval. In this survey, we systematically organize the application +scenarios, methods, benchmarks, experiments, and future directions. Multimodal +learning is a hot topic in the large model era, and the field has also seen +surveys on multimodal learning and vision-language models with transformers +published in the PAMI journal. To the best of our knowledge, this survey is the +first comprehensive review of the literature on multimodal composite retrieval, +and it serves as a timely complement to existing reviews on multimodal fusion. To help +readers quickly track this field, we build the project page for this survey, +which can be found at +https://github.com/fuxianghuang1/Multimodal-Composite-Editing-and-Retrieval. + +
+
+ comment: 22 pages, 3 figures, and 11 tables +
+
+
+
+
+ + ☆ NLLB-E5: A Scalable Multilingual Retrieval Model + + +
+ Despite significant progress in multilingual information retrieval, the lack +of models capable of effectively supporting multiple languages, particularly +low-resource ones such as Indic languages, remains a critical challenge. This paper +presents NLLB-E5: A Scalable Multilingual Retrieval Model. NLLB-E5 leverages +the built-in multilingual capabilities of the NLLB encoder for translation +tasks. It adopts a distillation approach from the multilingual retriever E5 to +provide a zero-shot retrieval approach handling multiple languages, including +all major Indic languages, without requiring multilingual training data. We +evaluate the model on a comprehensive suite of existing benchmarks, including +Hindi-BEIR, highlighting its robust performance across diverse languages and +tasks. Our findings uncover task and domain-specific challenges, providing +valuable insights into the retrieval performance, especially for low-resource +languages. NLLB-E5 addresses the urgent need for an inclusive, scalable, and +language-agnostic text retrieval model, advancing the field of multilingual +information access and promoting digital inclusivity for millions of users +globally. + +
+
+
+
+
+ + ♻ ☆ WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild + + +
+ The increasing availability of real-world conversation data offers exciting +opportunities for researchers to study user-chatbot interactions. However, the +sheer volume of this data makes manually examining individual conversations +impractical. To overcome this challenge, we introduce WildVis, an interactive +tool that enables fast, versatile, and large-scale conversation analysis. +WildVis provides search and visualization capabilities in the text and +embedding spaces based on a list of criteria. To manage million-scale datasets, +we implemented optimizations including search index construction, embedding +precomputation and compression, and caching to ensure responsive user +interactions within seconds. We demonstrate WildVis' utility through three case +studies: facilitating chatbot misuse research, visualizing and comparing topic +distributions across datasets, and characterizing user-specific conversation +patterns. WildVis is open-source and designed to be extendable, supporting +additional datasets and customized search and visualization functionalities. + +
+
+
+
+
+ + ♻ ☆ Understanding Fairness in Recommender Systems: A Healthcare Perspective + + +
+ Fairness in AI-driven decision-making systems has become a critical concern, +especially when these systems directly affect human lives. This paper explores +the public's comprehension of fairness in healthcare recommendations. We +conducted a survey where participants selected from four fairness metrics -- +Demographic Parity, Equal Accuracy, Equalized Odds, and Positive Predictive +Value -- across different healthcare scenarios to assess their understanding of +these concepts. Our findings reveal that fairness is a complex and often +misunderstood concept, with a generally low level of public understanding +regarding fairness metrics in recommender systems. This study highlights the +need for enhanced information and education on algorithmic fairness to support +informed decision-making in using these systems. Furthermore, the results +suggest that a one-size-fits-all approach to fairness may be insufficient, +pointing to the importance of context-sensitive designs in developing equitable +AI systems. + +
+
+ comment: Accepted to the 18th ACM Conference on Recommender Systems +
+
+
+
+
+ + ♻ ☆ ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG + Capabilities + + +
+ In this work, we introduce ChatQA 2, a Llama 3.0-based model with a 128K +context window, designed to bridge the gap between open-source LLMs and leading +proprietary models (e.g., GPT-4-Turbo) in long-context understanding and +retrieval-augmented generation (RAG) capabilities. These two capabilities are +essential for LLMs to process large volumes of information that cannot fit into +a single prompt and are complementary to each other, depending on the +downstream tasks and computational budgets. We present a detailed continued +training recipe to extend the context window of Llama3-70B-base from 8K to 128K +tokens, along with a three-stage instruction tuning process to enhance the +model's instruction-following, RAG performance, and long-context understanding +capabilities. Our results demonstrate that the Llama3-ChatQA-2-70B model +outperforms most existing state-of-the-art models, including +GPT-4-Turbo-2024-04-09, Qwen2-72B-Instruct, and Llama3.1-70B-Instruct, on +ultra-long tasks beyond 100K tokens, as well as on the RAG benchmark using only +a 4K context window, showing strong long-context capability across varying +sequence lengths. We further provide extensive comparisons between direct +long-context and RAG solutions using the same state-of-the-art long-context +LLMs. Interestingly, we find that the performance of strong long-context LLMs +using RAG improves when retrieving a larger number of chunks. With a large set +of top-k chunks, RAG consistently outperforms direct long-context solutions +using the same state-of-the-art long-context models (e.g., Llama3-ChatQA-2-70B +and Qwen2-72B-Instruct) on both 32K benchmarks and real-world 128K tasks. To +advance research in this field, we open-sourced the model weights, training +data, and the evaluation setup for the community: +https://chatqa2-project.github.io/ + +
+
+ comment: v2: major update with significantly improved results +
+
+
+
+
+
+
+
+ + Machine Learning 129 + +
+
+
+ + ☆ A Framework for Evaluating PM2.5 Forecasts from the Perspective of + Individual Decision Making + + +
+ Wildfire frequency is increasing as the climate changes, and the resulting +air pollution poses health risks. Just as people routinely use weather +forecasts to plan their activities around precipitation, reliable air quality +forecasts could help individuals reduce their exposure to air pollution. In the +present work, we evaluate several existing forecasts of fine particulate matter +(PM2.5) within the continental United States in the context of individual +decision-making. Our comparison suggests there is meaningful room for +improvement in air pollution forecasting, which might be realized by +incorporating more data sources and using machine learning tools. To facilitate +future machine learning development and benchmarking, we set up a framework to +evaluate and compare air pollution forecasts for individual decision making. We +introduce a new loss to capture decisions about when to use mitigation +measures. We highlight the importance of visualizations when comparing +forecasts. Finally, we provide code to download and compare archived forecast +predictions. + +
+
+ comment: 22 pages, 3 figures +
+
+
+
+
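+ The entry above evaluates forecasts by the decisions they drive rather than by
+ raw error alone. The scoring rule below is only a generic illustration of that
+ idea, an asymmetric cost for missed unhealthy days versus needless precautions
+ around an example 35 ug/m3 threshold, and is not the specific loss introduced
+ in the paper.
+
+ import numpy as np
+
+ def decision_cost(forecast, observed, threshold=35.0, miss_cost=5.0, alarm_cost=1.0):
+     # A user takes mitigation measures whenever the forecast exceeds the threshold.
+     act = forecast >= threshold
+     unhealthy = observed >= threshold
+     misses = (~act) & unhealthy          # unhealthy day, no precaution taken
+     false_alarms = act & (~unhealthy)    # precaution taken on a clean day
+     return miss_cost * misses.mean() + alarm_cost * false_alarms.mean()
+
+ rng = np.random.default_rng(0)
+ observed = rng.gamma(shape=2.0, scale=15.0, size=365)     # synthetic daily PM2.5
+ forecast = observed + rng.normal(0.0, 8.0, size=365)      # imperfect forecast
+ print(f"decision cost: {decision_cost(forecast, observed):.3f}")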
+ + ☆ Robot Utility Models: General Policies for Zero-Shot Deployment in New + Environments + + +
+ Robot models, particularly those trained with large amounts of data, have +recently shown a plethora of real-world manipulation and navigation +capabilities. Several independent efforts have shown that given sufficient +training data in an environment, robot policies can generalize to demonstrated +variations in that environment. However, needing to finetune robot models to +every new environment stands in stark contrast to models in language or vision +that can be deployed zero-shot for open-world problems. In this work, we +present Robot Utility Models (RUMs), a framework for training and deploying +zero-shot robot policies that can directly generalize to new environments +without any finetuning. To create RUMs efficiently, we develop new tools to +quickly collect data for mobile manipulation tasks, integrate such data into a +policy with multi-modal imitation learning, and deploy policies on-device on +Hello Robot Stretch, a cheap commodity robot, with an external mLLM verifier +for retrying. We train five such utility models for opening cabinet doors, +opening drawers, picking up napkins, picking up paper bags, and reorienting +fallen objects. Our system, on average, achieves 90% success rate in unseen, +novel environments interacting with unseen objects. Moreover, the utility +models can also succeed in different robot and camera set-ups with no further +data, training, or fine-tuning. Primary among our lessons are the importance of +training data over training algorithm and policy class, guidance about data +scaling, necessity for diverse yet high-quality demonstrations, and a recipe +for robot introspection and retrying to improve performance on individual +environments. Our code, data, models, hardware designs, as well as our +experiment and deployment videos are open sourced and can be found on our +project website: https://robotutilitymodels.com + +
+
+ comment: Project website https://robotutilitymodels.com +
+
+
+
+
+ + ☆ Neural MP: A Generalist Neural Motion Planner + + +
+ The current paradigm for motion planning generates solutions from scratch for +every new problem, which consumes significant amounts of time and computational +resources. For complex, cluttered scenes, motion planning approaches can often +take minutes to produce a solution, while humans are able to accurately and +safely reach any goal in seconds by leveraging their prior experience. We seek +to do the same by applying data-driven learning at scale to the problem of +motion planning. Our approach builds a large number of complex scenes in +simulation, collects expert data from a motion planner, then distills it into a +reactive generalist policy. We then combine this with lightweight optimization +to obtain a safe path for real world deployment. We perform a thorough +evaluation of our method on 64 motion planning tasks across four diverse +environments with randomized poses, scenes and obstacles, in the real world, +demonstrating an improvement of 23%, 17% and 79% motion planning success rate +over state of the art sampling, optimization and learning based planning +methods. Video results available at mihdalal.github.io/neuralmotionplanner + +
+
+ comment: Website at mihdalal.github.io/neuralmotionplanner. Main paper: 7 + pages, 4 figures, 2 tables. Appendix: 9 pages, 5 figures, 6 tables +
+
+
+
+
+ + ☆ An Introduction to Quantum Reinforcement Learning (QRL) + + +
+ Recent advancements in quantum computing (QC) and machine learning (ML) have +sparked considerable interest in the integration of these two cutting-edge +fields. Among the various ML techniques, reinforcement learning (RL) stands out +for its ability to address complex sequential decision-making problems. RL has +already demonstrated substantial success in the classical ML community. Now, +the emerging field of Quantum Reinforcement Learning (QRL) seeks to enhance RL +algorithms by incorporating principles from quantum computing. This paper +offers an introduction to this exciting area for the broader AI and ML +community. + +
+
+ comment: Accepted by The 15th International Conference on ICT Convergence - + ICTC 2024 +
+
+
+
+
+ + ☆ Improving Pretraining Data Using Perplexity Correlations + + +
+ Quality pretraining data is often seen as the key to high-performance +language models. However, progress in understanding pretraining data has been +slow due to the costly pretraining runs required for data selection +experiments. We present a framework that avoids these costs and selects +high-quality pretraining data without any LLM training of our own. Our work is +based on a simple observation: LLM losses on many pretraining texts are +correlated with downstream benchmark performance, and selecting +high-correlation documents is an effective pretraining data selection method. +We build a new statistical framework for data selection centered around +estimates of perplexity-benchmark correlations and perform data selection using +a sample of 90 LLMs taken from the Open LLM Leaderboard on texts from tens of +thousands of web domains. In controlled pretraining experiments at the 160M +parameter scale on 8 benchmarks, our approach outperforms DSIR on every +benchmark, while matching the best data selector found in DataComp-LM, a +hand-engineered bigram classifier. + +
+
+
+
+
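+ The selection rule sketched above, rank pretraining domains by how well
+ per-domain LLM losses track downstream benchmark scores across many public
+ models, can be outlined in a few lines. Random arrays stand in for the 90
+ leaderboard models and their per-domain losses, and the paper's actual
+ estimator is more involved than this plain rank correlation.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ n_models, n_domains = 90, 2000
+ domain_loss = rng.random((n_models, n_domains))   # per-domain LM loss of each model
+ benchmark = rng.random(n_models)                  # aggregate benchmark score per model
+
+ def rank(x):
+     return np.argsort(np.argsort(x))
+
+ # Spearman-style correlation between (negative) loss and benchmark score per
+ # domain: domains where lower loss predicts better benchmarks score highest.
+ bench_rank = rank(benchmark)
+ corrs = np.array([np.corrcoef(rank(-domain_loss[:, d]), bench_rank)[0, 1]
+                   for d in range(n_domains)])
+ selected = np.argsort(corrs)[::-1][:200]          # keep the most predictive domains
+ print(selected[:10], corrs[selected[:10]])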
+ + ☆ Benchmarking Chinese Knowledge Rectification in Large Language Models + + +
+ While Large Language Models (LLMs) exhibit remarkable generative +capabilities, they are not without flaws, particularly in the form of +hallucinations. This issue is even more pronounced when LLMs are applied to +specific languages and domains. For example, LLMs may generate nonsense +information when handling ancient Chinese poetry, proverbs, or idioms, owing to +the lack of specific knowledge. To this end, this paper introduces a benchmark +for rectifying Chinese knowledge in LLMs via knowledge editing. Specifically, +we introduce a new Chinese dataset, CKnowEdit, by collecting seven types of +knowledge from various sources, including classical texts, idioms, and content +from Baidu Tieba Ruozhiba, thereby accounting for the unique polyphony, +antithesis, and logical constructs inherent in the Chinese language. Through +the analysis of this dataset, we uncover the challenges faced by current LLMs +in mastering Chinese. Furthermore, our evaluation of state-of-the-art knowledge +editing techniques on this dataset unveils the substantial scope for advancement +in the rectification of Chinese knowledge. Code and dataset are available at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: Ongoing work; code and dataset are available at + https://github.com/zjunlp/EasyEdit +
+
+
+
+
+ + ☆ Celcomen: spatial causal disentanglement for single-cell and tissue + perturbation modeling + + +
+ Celcomen leverages a mathematical causality framework to disentangle intra- +and inter- cellular gene regulation programs in spatial transcriptomics and +single-cell data through a generative graph neural network. It can learn +gene-gene interactions, as well as generate post-perturbation counterfactual +spatial transcriptomics, thereby offering access to experimentally inaccessible +samples. We validated its disentanglement, identifiability, and counterfactual +prediction capabilities through simulations and in clinically relevant human +glioblastoma, human fetal spleen, and mouse lung cancer samples. Celcomen +provides the means to model disease and therapy induced changes allowing for +new insights into single-cell spatially resolved tissue responses relevant to +human health. + +
+
+
+
+
+ + ☆ Input Space Mode Connectivity in Deep Neural Networks + + +
+ We extend the concept of loss landscape mode connectivity to the input space +of deep neural networks. Mode connectivity was originally studied within +parameter space, where it describes the existence of low-loss paths between +different solutions (loss minimizers) obtained through gradient descent. We +present theoretical and empirical evidence of its presence in the input space +of deep networks, thereby highlighting the broader nature of the phenomenon. We +observe that different input images with similar predictions are generally +connected, and for trained models, the path tends to be simple, with only a +small deviation from being a linear path. Our methodology utilizes real, +interpolated, and synthetic inputs created using the input optimization +technique for feature visualization. We conjecture that input space mode +connectivity in high-dimensional spaces is a geometric effect that takes place +even in untrained models and can be explained through percolation theory. We +exploit mode connectivity to obtain new insights about adversarial examples and +demonstrate its potential for adversarial detection. Additionally, we discuss +applications for the interpretability of deep networks. + +
+
+
+
+
+ + ☆ Enhancing Preference-based Linear Bandits via Human Response Time + + +
+ Binary human choice feedback is widely used in interactive preference +learning for its simplicity, but it provides limited information about +preference strength. To overcome this limitation, we leverage human response +times, which inversely correlate with preference strength, as complementary +information. Our work integrates the EZ-diffusion model, which jointly models +human choices and response times, into preference-based linear bandits. We +introduce a computationally efficient utility estimator that reformulates the +utility estimation problem using both choices and response times as a linear +regression problem. Theoretical and empirical comparisons with traditional +choice-only estimators reveal that for queries with strong preferences ("easy" +queries), choices alone provide limited information, while response times offer +valuable complementary information about preference strength. As a result, +incorporating response times makes easy queries more useful. We demonstrate +this advantage in the fixed-budget best-arm identification problem, with +simulations based on three real-world datasets, consistently showing +accelerated learning when response times are incorporated. + +
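The reformulation of utility estimation as linear regression can be illustrated with a toy estimator: under a diffusion-style choice model, the signed choice divided by the response time carries information about the drift, and hence the utility gap, so a least-squares fit of that quantity against query features gives a simple choice-plus-RT estimator. The snippet below is a simplified stand-in for the paper's estimator; the data-generating process and all constants are invented.

```python
import numpy as np

rng = np.random.default_rng(0)
d, n = 5, 2000
theta_true = rng.normal(size=d)          # unknown utility vector

# Each query compares two items; Z holds the feature differences.
Z = rng.normal(size=(n, d))
util_gap = Z @ theta_true                # preference strength per query

# Crude diffusion-like surrogate: strong preferences give more deterministic
# choices and faster responses.
p_choose = 1.0 / (1.0 + np.exp(-2.0 * util_gap))
choice = rng.binomial(1, p_choose)                       # 1 = first item chosen
rt = 0.3 + 1.0 / (np.abs(util_gap) + 0.5) + 0.05 * rng.standard_normal(n)

# Choice-only estimator: regress the signed choice on features.
y_choice = 2 * choice - 1
theta_choice = np.linalg.lstsq(Z, y_choice, rcond=None)[0]

# Choice + response-time estimator: signed choice scaled by 1/RT.
y_rt = (2 * choice - 1) / rt
theta_rt = np.linalg.lstsq(Z, y_rt, rcond=None)[0]

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

print("alignment, choices only :", cosine(theta_choice, theta_true))
print("alignment, choices + RT :", cosine(theta_rt, theta_true))
```

On "easy" queries the 1/RT scaling injects preference-strength information that the binary choice alone cannot provide, which is the intuition the abstract describes.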
+
+
+
+
+ + ☆ Predicting Critical Heat Flux with Uncertainty Quantification and Domain + Generalization Using Conditional Variational Autoencoders and Deep Neural + Networks + + +
+ Deep generative models (DGMs) have proven to be powerful in generating +realistic data samples. Their capability to learn the underlying distribution +of a dataset enables them to generate synthetic data samples that closely +resemble the original training dataset, thus addressing the challenge of data +scarcity. In this work, we investigated the capabilities of DGMs by developing +a conditional variational autoencoder (CVAE) model to augment the critical heat +flux (CHF) measurement data that was used to generate the 2006 Groeneveld +lookup table. To determine how this approach compared to traditional methods, a +fine-tuned deep neural network (DNN) regression model was created and evaluated +with the same dataset. Both the CVAE and DNN models achieved small mean +absolute relative errors, with the CVAE model maintaining more favorable +results. To quantify the uncertainty in the model's predictions, uncertainty +quantification (UQ) was performed with repeated sampling of the CVAE model and +ensembling of the DNN model. Following UQ, the DNN ensemble notably improved +performance when compared to the baseline DNN model, while the CVAE model +achieved results similar to its non-UQ counterpart. The CVAE model was shown to +have significantly less variability and a higher confidence after assessment of +the prediction-wise relative standard deviations. Evaluating domain +generalization, both models achieved small mean error values when predicting +both inside and outside the training domain, with predictions outside the +training domain showing slightly larger errors. Overall, the CVAE model was +comparable to the DNN regression model in predicting CHF values but with better +uncertainty behavior. + 
+
+
+
+
+ + ☆ Leveraging Object Priors for Point Tracking ECCV 2024 + + +
+ Point tracking is a fundamental problem in computer vision with numerous +applications in AR and robotics. A common failure mode in long-term point +tracking occurs when the predicted point leaves the object it belongs to and +lands on the background or another object. We identify this as the failure to +correctly capture objectness properties in learning to track. To address this +limitation of prior work, we propose a novel objectness regularization approach +that guides points to be aware of object priors by forcing them to stay inside +the boundaries of object instances. By capturing objectness cues at +training time, we avoid the need to compute object masks during testing. In +addition, we leverage contextual attention to enhance the feature +representation for capturing objectness at the feature level more effectively. +As a result, our approach achieves state-of-the-art performance on three point +tracking benchmarks, and we further validate the effectiveness of our +components via ablation studies. The source code is available at: +https://github.com/RehgLab/tracking_objectness + 
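One way to read the objectness regularization idea is as an auxiliary training-time loss that penalizes predicted track points falling outside the instance mask of the object they started on, so no masks are needed at test time. The snippet below sketches such a penalty; it is an interpretation for illustration, not the released implementation, and in a real training loss the mask would be sampled differentiably (e.g. with grid_sample) rather than with a hard lookup.

```python
import torch

def objectness_penalty(pred_points, instance_masks):
    """Penalize predicted points that land outside their object's mask.

    pred_points   : (N, 2) predicted (x, y) locations, in pixel coordinates
    instance_masks: (N, H, W) binary masks of the object each point belongs to
    Returns a scalar loss that is 0 when every point stays on its object.
    Note: the hard indexing used here is not differentiable w.r.t. the point
    coordinates; a soft sampling of the mask would be used in training.
    """
    n, h, w = instance_masks.shape
    x = pred_points[:, 0].clamp(0, w - 1).round().long()
    y = pred_points[:, 1].clamp(0, h - 1).round().long()
    inside = instance_masks[torch.arange(n), y, x]   # 1 if the point is on the object
    return (1.0 - inside.float()).mean()

# Toy example: two points, one inside its mask and one outside.
masks = torch.zeros(2, 8, 8)
masks[:, 2:6, 2:6] = 1.0
points = torch.tensor([[3.0, 3.0], [7.0, 7.0]])
print(objectness_penalty(points, masks))   # 0.5: half the points left their object
```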
+
+ comment: ECCV 2024 ILR Workshop +
+
+
+
+
+ + ☆ Unified Neural Network Scaling Laws and Scale-time Equivalence + + +
+ As neural networks continue to grow in size but datasets might not, it is +vital to understand how much performance improvement can be expected: is it +more important to scale network size or data volume? Thus, neural network +scaling laws, which characterize how test error varies with network size and +data volume, have become increasingly important. However, existing scaling laws +are often applicable only in limited regimes and often do not incorporate or +predict well-known phenomena such as double descent. Here, we present a novel +theoretical characterization of how three factors -- model size, training time, +and data volume -- interact to determine the performance of deep neural +networks. We first establish a theoretical and empirical equivalence between +scaling the size of a neural network and increasing its training time +proportionally. Scale-time equivalence challenges the current practice, wherein +large models are trained for small durations, and suggests that smaller models +trained over extended periods could match their efficacy. It also leads to a +novel method for predicting the performance of large-scale networks from +small-scale networks trained for extended epochs, and vice versa. We next +combine scale-time equivalence with a linear model analysis of double descent +to obtain a unified theoretical scaling law, which we confirm with experiments +across vision benchmarks and network architectures. These laws explain several +previously unexplained phenomena: reduced data requirements for generalization +in larger models, heightened sensitivity to label noise in overparameterized +models, and instances where increasing model scale does not necessarily enhance +performance. Our findings hold significant implications for the practical +deployment of neural networks, offering a more accessible and efficient path to +training and fine-tuning large models. + +
+
+
+
+
+ + ☆ Breaking Neural Network Scaling Laws with Modularity + + +
+ Modular neural networks outperform nonmodular neural networks on tasks +ranging from visual question answering to robotics. These performance +improvements are thought to be due to modular networks' superior ability to +model the compositional and combinatorial structure of real-world problems. +However, a theoretical explanation of how modularity improves generalizability, +and how to leverage task modularity while training networks remains elusive. +Using recent theoretical progress in explaining neural network generalization, +we investigate how the amount of training data required to generalize on a task +varies with the intrinsic dimensionality of a task's input. We show +theoretically that when applied to modularly structured tasks, while nonmodular +networks require an exponential number of samples with task dimensionality, +modular networks' sample complexity is independent of task dimensionality: +modular networks can generalize in high dimensions. We then develop a novel +learning rule for modular networks to exploit this advantage and empirically +show the improved generalization of the rule, both in- and out-of-distribution, +on high-dimensional, modular tasks. + +
+
+
+
+
+ + ☆ Advanced LSTM Neural Networks for Predicting Directional Changes in + Sector-Specific ETFs Using Machine Learning Techniques + + +
+ Trading and investing in stocks for some is their full-time career, while for +others, it's simply a supplementary income stream. Universal among all +investors is the desire to turn a profit. The key to achieving this goal is +diversification. Spreading investments across sectors is critical to +profitability and maximizing returns. This study aims to gauge the viability of +machine learning methods in practicing the principle of diversification to +maximize portfolio returns. To test this, the study evaluates the Long-Short +Term Memory (LSTM) model across nine different sectors and over 2,200 stocks +using Vanguard's sector-based ETFs. The R-squared value across all sectors +showed promising results, with an average of 0.8651 and a high of 0.942 for the +VNQ ETF. These findings suggest that the LSTM model is a capable and viable +model for accurately predicting directional changes across various industry +sectors, helping investors diversify and grow their portfolios. + +
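A minimal version of the pipeline described here is a small LSTM that consumes a window of recent ETF returns and predicts the next-day direction. The sketch below uses synthetic prices in place of the Vanguard sector ETF data and is only meant to show the shape of such a model; the architecture and hyperparameters are illustrative assumptions.

```python
import numpy as np
import torch
import torch.nn as nn

# Synthetic daily closes standing in for a sector ETF price series.
rng = np.random.default_rng(1)
prices = 100 * np.exp(np.cumsum(0.0005 + 0.01 * rng.standard_normal(1500)))
returns = np.diff(np.log(prices))

window = 20
X = np.stack([returns[i:i + window] for i in range(len(returns) - window)])
y = (returns[window:] > 0).astype(np.float32)            # 1 = next-day up move

X = torch.tensor(X, dtype=torch.float32).unsqueeze(-1)   # (N, window, 1)
y = torch.tensor(y)

class DirectionLSTM(nn.Module):
    def __init__(self, hidden=32):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden, batch_first=True)
        self.head = nn.Linear(hidden, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        return self.head(out[:, -1]).squeeze(-1)          # logit for "up"

model = DirectionLSTM()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

split = int(0.8 * len(X))                                 # chronological train/test split
for epoch in range(5):
    model.train()
    opt.zero_grad()
    loss = loss_fn(model(X[:split]), y[:split])
    loss.backward()
    opt.step()

model.eval()
with torch.no_grad():
    acc = ((model(X[split:]) > 0).float() == y[split:]).float().mean()
print(f"directional accuracy on held-out days: {acc:.3f}")
```

In practice one model per sector ETF (and proper walk-forward validation) would be needed before drawing any conclusions about diversification.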
+
+
+
+
+ + ☆ Consensus-based Distributed Quantum Kernel Learning for Speech + Recognition + + +
+ This paper presents a Consensus-based Distributed Quantum Kernel Learning +(CDQKL) framework aimed at improving speech recognition through distributed +quantum computing. CDQKL addresses the challenges of scalability and data +privacy in centralized quantum kernel learning. It does this by distributing +computational tasks across quantum terminals, which are connected through +classical channels. This approach enables the exchange of model parameters +without sharing local training data, thereby maintaining data privacy and +enhancing computational efficiency. Experimental evaluations on benchmark +speech emotion recognition datasets demonstrate that CDQKL achieves competitive +classification accuracy and scalability compared to centralized and local +quantum kernel learning models. The distributed nature of CDQKL offers +advantages in privacy preservation and computational efficiency, making it +suitable for data-sensitive fields such as telecommunications, automotive, and +finance. The findings suggest that CDQKL can effectively leverage distributed +quantum computing for large-scale machine-learning tasks. + 
+
+
+
+
+ + ☆ Are Heterophily-Specific GNNs and Homophily Metrics Really Effective? + Evaluation Pitfalls and New Benchmarks + + +
+ Over the past decade, Graph Neural Networks (GNNs) have achieved great +success on machine learning tasks with relational data. However, recent studies +have found that heterophily can cause significant performance degradation of +GNNs, especially on node-level tasks. Numerous heterophilic benchmark datasets +have been put forward to validate the efficacy of heterophily-specific GNNs and +various homophily metrics have been designed to help people recognize these +malignant datasets. Nevertheless, there still exist multiple pitfalls that +severely hinder the proper evaluation of new models and metrics. In this paper, +we point out the three most serious pitfalls: 1) a lack of hyperparameter tuning; +2) insufficient model evaluation on the truly challenging heterophilic datasets; +3) a missing quantitative evaluation benchmark for homophily metrics on synthetic +graphs. To overcome these challenges, we first train and fine-tune baseline +models on the $27$ most widely used benchmark datasets, categorize them into three +distinct groups: malignant, benign and ambiguous heterophilic datasets, and +identify the truly challenging subsets of tasks. To the best of our knowledge, we are +the first to propose such a taxonomy. Then, we re-evaluate $10$ +heterophily-specific state-of-the-art (SOTA) GNNs with fine-tuned +hyperparameters on different groups of heterophilic datasets. Based on the +model performance, we reassess their effectiveness in addressing the heterophily +challenge. Finally, we evaluate $11$ popular homophily metrics on synthetic +graphs with three different generation approaches. To compare the metrics +strictly, we propose the first quantitative evaluation method based on +Fr\'echet distance. + 
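The quantitative comparison of homophily metrics hinges on a Fréchet distance between curves, for example a metric's value versus a generator's homophily parameter compared against the corresponding GNN accuracy curve. The discrete Fréchet distance has a simple dynamic-programming form, sketched below; the curves here are synthetic placeholders, and the paper's exact construction of the compared curves may differ.

```python
import numpy as np

def discrete_frechet(P, Q):
    """Discrete Fréchet distance between two polygonal curves P and Q,
    each given as an (n, d) array of points (memoized recursion)."""
    n, m = len(P), len(Q)
    ca = np.full((n, m), -1.0)

    def c(i, j):
        if ca[i, j] >= 0:
            return ca[i, j]
        d = np.linalg.norm(P[i] - Q[j])
        if i == 0 and j == 0:
            ca[i, j] = d
        elif i == 0:
            ca[i, j] = max(c(0, j - 1), d)
        elif j == 0:
            ca[i, j] = max(c(i - 1, 0), d)
        else:
            ca[i, j] = max(min(c(i - 1, j), c(i - 1, j - 1), c(i, j - 1)), d)
        return ca[i, j]

    return c(n - 1, m - 1)

# Toy curves: a homophily metric and model accuracy, both swept over a
# synthetic-graph generator parameter in [0, 1].
h = np.linspace(0, 1, 50)
metric_curve = np.stack([h, 0.9 * h + 0.05 * np.sin(6 * h)], axis=1)
accuracy_curve = np.stack([h, 0.85 * h + 0.05], axis=1)
print("Fréchet distance:", discrete_frechet(metric_curve, accuracy_curve))
```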
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2407.09618 +
+
+
+
+
+ + ☆ LLMs Will Always Hallucinate, and We Need to Live With This + + +
+ As Large Language Models become more ubiquitous across domains, it becomes +important to examine their inherent limitations critically. This work argues +that hallucinations in language models are not just occasional errors but an +inevitable feature of these systems. We demonstrate that hallucinations stem +from the fundamental mathematical and logical structure of LLMs. It is, +therefore, impossible to eliminate them through architectural improvements, +dataset enhancements, or fact-checking mechanisms. Our analysis draws on +computational theory and Gödel's First Incompleteness Theorem, which references +the undecidability of problems like the Halting, Emptiness, and Acceptance +Problems. We demonstrate that every stage of the LLM process, from training data +compilation to fact retrieval, intent classification, and text generation, will +have a non-zero probability of producing hallucinations. This work introduces +the concept of Structural Hallucination as an intrinsic property of these +systems. By establishing the mathematical certainty of hallucinations, we +challenge the prevailing notion that they can be fully mitigated. + 
+
+
+
+
+ + ☆ Real-time optimal control of high-dimensional parametrized systems by + deep learning-based reduced order models + + +
+ Steering a system towards a desired target in a very short amount of time is +challenging from a computational standpoint. Indeed, the intrinsically +iterative nature of optimal control problems requires multiple simulations of +the physical system to be controlled. Moreover, the control action needs to be +updated whenever the underlying scenario undergoes variations. Full-order +models based on, e.g., the Finite Element Method, do not meet these +requirements due to the computational burden they usually entail. On the other +hand, conventional reduced order modeling techniques, such as the Reduced Basis +method, are intrusive, rely on a linear superimposition of modes, and lack +efficiency when addressing nonlinear time-dependent dynamics. In this work, we +propose a non-intrusive Deep Learning-based Reduced Order Modeling (DL-ROM) +technique for the rapid control of systems described in terms of parametrized +PDEs in multiple scenarios. In particular, optimal full-order snapshots are +generated and properly reduced by either Proper Orthogonal Decomposition or +deep autoencoders (or a combination thereof) while feedforward neural networks +are exploited to learn the map from scenario parameters to reduced optimal +solutions. Nonlinear dimensionality reduction therefore allows us to consider +state variables and control actions that are both low-dimensional and +distributed. After (i) data generation, (ii) dimensionality reduction, and +(iii) neural network training in the offline phase, optimal control strategies +can be rapidly retrieved in an online phase for any scenario of interest. The +computational speedup and the high accuracy obtained with the proposed approach +are assessed on different PDE-constrained optimization problems, ranging from +the minimization of energy dissipation in incompressible flows modelled through +Navier-Stokes equations to the thermal active cooling in heat transfer. + 
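The offline/online split described above can be illustrated in a few lines: compress optimal snapshots with POD (an SVD), then train a small feedforward network that maps scenario parameters to the POD coefficients, so a new scenario is handled with a single forward pass. The sketch below uses synthetic snapshots and is only indicative of the structure of such a DL-ROM pipeline, not the paper's implementation.

```python
import numpy as np
from numpy.linalg import svd
from sklearn.neural_network import MLPRegressor

rng = np.random.default_rng(0)

# Synthetic "optimal snapshots": solutions on a 200-point grid for 100 scenarios
# described by a 2-dimensional parameter mu.
x = np.linspace(0, 1, 200)
mus = rng.uniform(0.5, 2.0, size=(100, 2))
snapshots = np.stack([np.sin(m[0] * np.pi * x) * np.exp(-m[1] * x) for m in mus])

# Offline phase 1: POD via SVD, keeping r modes.
r = 8
U, S, Vt = svd(snapshots.T, full_matrices=False)
basis = U[:, :r]                                # (200, r) spatial modes
coeffs = snapshots @ basis                      # (100, r) reduced coordinates

# Offline phase 2: learn the map  mu -> reduced coefficients.
net = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=5000, random_state=0)
net.fit(mus, coeffs)

# Online phase: rapid reconstruction of the full field for a new scenario.
mu_new = np.array([[1.3, 0.9]])
u_pred = net.predict(mu_new) @ basis.T
u_true = np.sin(mu_new[0, 0] * np.pi * x) * np.exp(-mu_new[0, 1] * x)
print("relative L2 error:", np.linalg.norm(u_pred - u_true) / np.linalg.norm(u_true))
```

Replacing the POD step with a deep autoencoder, as the abstract mentions, changes only the compression/decompression functions around the same parameter-to-coefficients regression.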
+
+
+
+
+ + ☆ pFedGPA: Diffusion-based Generative Parameter Aggregation for + Personalized Federated Learning + + +
+ Federated Learning (FL) offers a decentralized approach to model training, +where data remains local and only model parameters are shared between the +clients and the central server. Traditional methods, such as Federated +Averaging (FedAvg), linearly aggregate these parameters which are usually +trained on heterogeneous data distributions, potentially overlooking the +complex, high-dimensional nature of the parameter space. This can result in +degraded performance of the aggregated model. While personalized FL approaches +can mitigate the heterogeneous data issue to some extent, the limitation of +linear aggregation remains unresolved. To alleviate this issue, we investigate +the generative approach of diffusion model and propose a novel generative +parameter aggregation framework for personalized FL, \texttt{pFedGPA}. In this +framework, we deploy a diffusion model on the server to integrate the diverse +parameter distributions and propose a parameter inversion method to efficiently +generate a set of personalized parameters for each client. This inversion +method transforms the uploaded parameters into a latent code, which is then +aggregated through denoising sampling to produce the final personalized +parameters. By encoding the dependence of a client's model parameters on the +specific data distribution using the high-capacity diffusion model, +\texttt{pFedGPA} can effectively decouple the complexity of the overall +distribution of all clients' model parameters from the complexity of each +individual client's parameter distribution. Our experimental results +consistently demonstrate the superior performance of the proposed method across +multiple datasets, surpassing baseline approaches. + +
+
+
+
+
+ + ☆ MANA-Net: Mitigating Aggregated Sentiment Homogenization with News + Weighting for Enhanced Market Prediction CIKM 24 + + +
+ It is widely acknowledged that extracting market sentiments from news data +benefits market predictions. However, existing methods of using financial +sentiments remain simplistic, relying on equal-weight and static aggregation to +manage sentiments from multiple news items. This leads to a critical issue +termed ``Aggregated Sentiment Homogenization'', which has been explored through +our analysis of a large financial news dataset from industry practice. This +phenomenon occurs when aggregating numerous sentiments, causing representations +to converge towards the mean values of sentiment distributions and thereby +smoothing out unique and important information. Consequently, the aggregated +sentiment representations lose much predictive value of news data. To address +this problem, we introduce the Market Attention-weighted News Aggregation +Network (MANA-Net), a novel method that leverages a dynamic market-news +attention mechanism to aggregate news sentiments for market prediction. +MANA-Net learns the relevance of news sentiments to price changes and assigns +varying weights to individual news items. By integrating the news aggregation +step into the networks for market prediction, MANA-Net allows for trainable +sentiment representations that are optimized directly for prediction. We +evaluate MANA-Net using the S&P 500 and NASDAQ 100 indices, along with +financial news spanning from 2003 to 2018. Experimental results demonstrate +that MANA-Net outperforms various recent market prediction methods, enhancing +Profit & Loss by 1.1% and the daily Sharpe ratio by 0.252. + +
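The central mechanism, replacing equal-weight sentiment averaging with market-conditioned attention weights trained end-to-end with the predictor, can be sketched compactly. The module below is an illustrative reimplementation of that idea, not the authors' MANA-Net code; the layer sizes and feature dimensions are invented.

```python
import torch
import torch.nn as nn

class AttentionNewsAggregator(nn.Module):
    """Aggregate per-news sentiment vectors with market-conditioned attention,
    then predict the next-period return from the weighted summary."""

    def __init__(self, sent_dim=8, market_dim=4, hidden=32):
        super().__init__()
        self.score = nn.Sequential(
            nn.Linear(sent_dim + market_dim, hidden), nn.Tanh(), nn.Linear(hidden, 1)
        )
        self.predict = nn.Linear(sent_dim, 1)

    def forward(self, news_sent, market_state):
        # news_sent: (B, N, sent_dim), market_state: (B, market_dim)
        m = market_state.unsqueeze(1).expand(-1, news_sent.size(1), -1)
        scores = self.score(torch.cat([news_sent, m], dim=-1)).squeeze(-1)   # (B, N)
        weights = torch.softmax(scores, dim=-1)              # learned, non-uniform weights
        pooled = (weights.unsqueeze(-1) * news_sent).sum(1)  # avoids mean-collapse
        return self.predict(pooled).squeeze(-1), weights

# Toy forward pass: 2 trading days, 5 news items each.
model = AttentionNewsAggregator()
pred, w = model(torch.randn(2, 5, 8), torch.randn(2, 4))
print(pred.shape, w.shape)   # torch.Size([2]) torch.Size([2, 5])
```

Because the weights are produced inside the prediction network, training on a return or P&L objective pushes them to emphasize the news items that actually move prices, which is the homogenization fix the abstract argues for.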
+
+ comment: Accepted by CIKM 24 +
+
+
+
+
+ + ☆ Segmentation by Factorization: Unsupervised Semantic Segmentation for + Pathology by Factorizing Foundation Model Features + + +
+ We introduce Segmentation by Factorization (F-SEG), an unsupervised +segmentation method for pathology that generates segmentation masks from +pre-trained deep learning models. F-SEG allows the use of pre-trained deep +neural networks, including recently developed pathology foundation models, for +semantic segmentation. It achieves this without requiring additional training +or finetuning, by factorizing the spatial features extracted by the models into +segmentation masks and their associated concept features. We create generic +tissue phenotypes for H&E images by training clustering models for multiple +numbers of clusters on features extracted from several deep learning models on +The Cancer Genome Atlas Program (TCGA), and then show how the clusters can be +used for factorizing corresponding segmentation masks using off-the-shelf deep +learning models. Our results show that F-SEG provides robust unsupervised +segmentation capabilities for H&E pathology images, and that the segmentation +quality is greatly improved by utilizing pathology foundation models. We +discuss and propose methods for evaluating the performance of unsupervised +segmentation in pathology. + +
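The factorization step can be approximated with off-the-shelf tools: extract spatial features from a frozen backbone, then run a non-negative factorization (or clustering) over the spatial positions so that each factor yields a soft segmentation mask and an associated concept vector. The sketch below mimics that recipe with random features standing in for a pathology foundation model; it is not the F-SEG code.

```python
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)

# Stand-in for a frozen backbone's spatial feature map: H x W positions, C channels.
H, W, C = 32, 32, 64
features = np.abs(rng.normal(size=(H * W, C)))   # non-negative, as NMF requires

# Factorize positions x channels into K concepts:
#   features ~= masks @ concepts, with masks: (H*W, K), concepts: (K, C)
K = 4
nmf = NMF(n_components=K, init="nndsvda", max_iter=500, random_state=0)
masks = nmf.fit_transform(features)              # per-position concept strengths
concepts = nmf.components_                       # per-concept feature signatures

# Soft masks per concept, reshaped back to image layout; argmax gives a hard
# unsupervised segmentation.
soft_masks = masks.reshape(H, W, K)
hard_seg = soft_masks.argmax(-1)
print(soft_masks.shape, hard_seg.shape, np.bincount(hard_seg.ravel()))
```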
+
+
+
+
+ + ☆ Extracting the U.S. building types from OpenStreetMap data + + +
+ Building type information is crucial for population estimation, traffic +planning, urban planning, and emergency response applications. Although +essential, such data is often not readily available. To alleviate this problem, +this work creates a comprehensive dataset by providing +residential/non-residential building classification covering the entire United +States. We propose and utilize an unsupervised machine learning method to +classify building types based on building footprints and available +OpenStreetMap information. The classification result is validated using +authoritative ground truth data for select counties in the U.S. The validation +shows a high precision for non-residential building classification and a high +recall for residential buildings. We identified various approaches to improving +the quality of the classification, such as removing sheds and garages from the +dataset. Furthermore, analyzing the misclassifications revealed that they are +mainly due to missing and scarce metadata in OSM. A major result of this work +is the resulting dataset, which classifies 67,705,475 buildings. We hope that this +data is of value to the scientific community, including urban and +transportation planners. + 
+
+
+
+
+ + ☆ Zero-shot Outlier Detection via Prior-data Fitted Networks: Model + Selection Bygone! + + +
+ Outlier detection (OD) has a vast literature as it finds numerous +applications in environmental monitoring, cybersecurity, finance, and medicine +to name a few. Since OD is an inherently unsupervised task, model selection is a key +bottleneck for OD (both algorithm and hyperparameter selection) without label +supervision. There is a long list of techniques to choose from -- both +classical algorithms and deep neural architectures -- and while several studies +report their hyperparameter sensitivity, the literature is quite slim on +unsupervised model selection -- limiting the effective use of OD in practice. +In this paper we present FoMo-0D for zero/0-shot OD, exploring a transformative +new direction that bypasses the hurdle of model selection altogether (!), thus +breaking new ground. The fundamental idea behind FoMo-0D is the Prior-data +Fitted Networks, recently introduced by Muller et al. (2022), which trains a +Transformer model on a large body of synthetically generated data from a prior +data distribution. In essence, FoMo-0D is a pretrained Foundation Model for +zero/0-shot OD on tabular data, which can directly predict the (outlier/inlier) +label of any test data at inference time, by merely a single forward pass -- +making obsolete the need for choosing an algorithm/architecture, tuning its +associated hyperparameters, and even training any model parameters when given a +new OD dataset. Extensive experiments on 57 public benchmark datasets against +26 baseline methods show that FoMo-0D performs statistically no differently from +the second-best baseline, while significantly outperforming the majority of the +baselines, with an average inference time of 7.7 ms per test sample. + 
+
+ comment: preprint +
+
+
+
+
+ + ☆ Unlearning or Concealment? A Critical Analysis and Evaluation Metrics + for Unlearning in Diffusion Models + + +
+ Recent research has seen significant interest in methods for concept removal +and targeted forgetting in diffusion models. In this paper, we conduct a +comprehensive white-box analysis to expose significant vulnerabilities in +existing diffusion model unlearning methods. We show that the objective +functions used for unlearning in the existing methods lead to a decoupling of the +targeted concepts (meant to be forgotten) from the corresponding prompts. This +is concealment and not actual unlearning, which was the original goal. The +ineffectiveness of current methods stems primarily from their narrow focus on +reducing generation probabilities for specific prompt sets, neglecting the +diverse modalities of intermediate guidance employed during the inference +process. The paper presents a rigorous theoretical and empirical examination of +four commonly used techniques for unlearning in diffusion models. We introduce +two new evaluation metrics: Concept Retrieval Score (CRS) and Concept +Confidence Score (CCS). These metrics are based on a successful adversarial +attack setup that can recover forgotten concepts from unlearned diffusion +models. The CRS measures the similarity between the latent representations of +the unlearned and fully trained models after unlearning. It reports the extent +of retrieval of the forgotten concepts with an increasing amount of guidance. The +CCS quantifies the confidence of the model in assigning the target concept to +the manipulated data. It reports the probability that the unlearned model's +generations are aligned with the original domain knowledge with an increasing +amount of guidance. Evaluating existing unlearning methods with our proposed +stringent metrics for diffusion models reveals significant shortcomings in +their ability to truly unlearn concepts. Source Code: +https://respailab.github.io/unlearning-or-concealment + 
+
+
+
+
+ + ☆ K-Fold Causal BART for CATE Estimation + + +
+ This research aims to propose and evaluate a novel model named K-Fold Causal +Bayesian Additive Regression Trees (K-Fold Causal BART) for improved estimation +of Average Treatment Effects (ATE) and Conditional Average Treatment Effects +(CATE). The study employs synthetic and semi-synthetic datasets, including the +widely recognized Infant Health and Development Program (IHDP) benchmark +dataset, to validate the model's performance. Despite promising results in +synthetic scenarios, the IHDP dataset reveals that the proposed model is not +state-of-the-art for ATE and CATE estimation. Nonetheless, the research +provides several novel insights: 1. The ps-BART model is likely the preferred +choice for CATE and ATE estimation due to better generalization compared to the +other benchmark models - including the Bayesian Causal Forest (BCF) model, +which is considered by many the current best model for CATE estimation, 2. The +BCF model's performance deteriorates significantly with increasing treatment +effect heterogeneity, while the ps-BART model remains robust, 3. Models tend to +be overconfident in CATE uncertainty quantification when treatment effect +heterogeneity is low, 4. A second K-Fold method is unnecessary for avoiding +overfitting in CATE estimation, as it adds computational costs without +improving performance, 5. Detailed analysis reveals the importance of +understanding dataset characteristics and using nuanced evaluation methods, 6. +The conclusion of Curth et al. (2021) that indirect strategies for CATE +estimation are superior for the IHDP dataset is contradicted by the results of +this research. These findings challenge existing assumptions and suggest +directions for future research to enhance causal inference methodologies. + +
+
+
+
+
+ + ☆ Real-Time Human Action Recognition on Embedded Platforms + + +
+ With advancements in computer vision and deep learning, video-based human +action recognition (HAR) has become practical. However, due to the complexity +of the computation pipeline, running HAR on live video streams incurs excessive +delays on embedded platforms. This work tackles the real-time performance +challenges of HAR with four contributions: 1) an experimental study identifying +a standard Optical Flow (OF) extraction technique as the latency bottleneck in +a state-of-the-art HAR pipeline, 2) an exploration of the latency-accuracy +tradeoff between the standard and deep learning approaches to OF extraction, +which highlights the need for a novel, efficient motion feature extractor, 3) +the design of Integrated Motion Feature Extractor (IMFE), a novel single-shot +neural network architecture for motion feature extraction with drastic +improvement in latency, 4) the development of RT-HARE, a real-time HAR system +tailored for embedded platforms. Experimental results on an Nvidia Jetson +Xavier NX platform demonstrated that RT-HARE realizes real-time HAR at a video +frame rate of 30 frames per second while delivering high levels of recognition +accuracy. + +
+
+
+
+
+ + ☆ Adversarial Attacks on Data Attribution + + +
+ Data attribution aims to quantify the contribution of individual training +data points to the outputs of an AI model, which has been used to measure the +value of training data and compensate data providers. Given the impact on +financial decisions and compensation mechanisms, a critical question arises +concerning the adversarial robustness of data attribution methods. However, +there has been little to no systematic research addressing this issue. In this +work, we aim to bridge this gap by detailing a threat model with clear +assumptions about the adversary's goal and capabilities, and by proposing +principled adversarial attack methods on data attribution. We present two such +methods, Shadow Attack and Outlier Attack, both of which generate manipulated +datasets to adversarially inflate the compensation. The Shadow Attack leverages +knowledge about the data distribution in the AI applications, and derives +adversarial perturbations through "shadow training", a technique commonly used +in membership inference attacks. In contrast, the Outlier Attack does not +assume any knowledge about the data distribution and relies solely on black-box +queries to the target model's predictions. It exploits an inductive bias +present in many data attribution methods - outlier data points are more likely +to be influential - and employs adversarial examples to generate manipulated +datasets. Empirically, in image classification and text generation tasks, the +Shadow Attack can inflate the data-attribution-based compensation by at least +200%, while the Outlier Attack achieves compensation inflation ranging from +185% to as much as 643%. + +
+
+
+
+
+ + ☆ Interactive incremental learning of generalizable skills with local + trajectory modulation + + +
+ The problem of generalization in learning from demonstration (LfD) has +received considerable attention over the years, particularly within the context +of movement primitives, where a number of approaches have emerged. Recently, +two important approaches have gained recognition. While one leverages +via-points to adapt skills locally by modulating demonstrated trajectories, +another relies on so-called task-parameterized models that encode movements +with respect to different coordinate systems, using a product of probabilities +for generalization. While the former are well-suited to precise, local +modulations, the latter aim at generalizing over large regions of the workspace +and often involve multiple objects. Addressing the quality of generalization by +leveraging both approaches simultaneously has received little attention. In +this work, we propose an interactive imitation learning framework that +simultaneously leverages local and global modulations of trajectory +distributions. Building on the kernelized movement primitives (KMP) framework, +we introduce novel mechanisms for skill modulation from direct human corrective +feedback. Our approach particularly exploits the concept of via-points to +incrementally and interactively 1) improve the model accuracy locally, 2) add +new objects to the task during execution and 3) extend the skill into regions +where demonstrations were not provided. We evaluate our method on a bearing +ring-loading task using a torque-controlled, 7-DoF, DLR SARA robot. + +
+
+ comment: 21 pages, 16 figures +
+
+
+
+
+ + ☆ Optimal Projections for Classification with Naive Bayes + + +
+ In the Naive Bayes classification model the class conditional densities are +estimated as the products of their marginal densities along the cardinal basis +directions. We study the problem of obtaining an alternative basis for this +factorisation with the objective of enhancing the discriminatory power of the +associated classification model. We formulate the problem as a projection +pursuit to find the optimal linear projection on which to perform +classification. Optimality is determined based on the multinomial likelihood +within which probabilities are estimated using the Naive Bayes factorisation of +the projected data. Projection pursuit offers the added benefits of dimension +reduction and visualisation. We discuss an intuitive connection with class +conditional independent components analysis, and show how this is realised +visually in practical applications. The performance of the resulting +classification models is investigated using a large collection of (162) +publicly available benchmark data sets and in comparison with relevant +alternatives. We find that the proposed approach substantially outperforms +other popular probabilistic discriminant analysis models and is highly +competitive with Support Vector Machines. + +
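A toy rendering of the projection-pursuit idea: parameterize a linear projection, fit Gaussian Naive Bayes on the projected data, and score the projection by the class log-likelihood, then optimize over projections. The sketch below uses crude random search rather than a proper optimizer, purely to make the objective concrete; it is not the paper's procedure.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB

rng = np.random.default_rng(0)
X, y = make_classification(n_samples=600, n_features=10, n_informative=4,
                           random_state=0)

def projected_nb_loglik(W, X, y):
    """Fit Gaussian NB on the projected data XW and return the mean
    log-likelihood of the true classes (the projection-pursuit objective)."""
    Z = X @ W
    nb = GaussianNB().fit(Z, y)
    logp = nb.predict_log_proba(Z)
    return logp[np.arange(len(y)), y].mean()

def random_projection(d_in, d_out):
    W, _ = np.linalg.qr(rng.normal(size=(d_in, d_out)))   # orthonormal columns
    return W

# Crude projection pursuit: keep the best of many random 2-D projections.
best_W, best_score = None, -np.inf
for _ in range(300):
    W = random_projection(X.shape[1], 2)
    score = projected_nb_loglik(W, X, y)
    if score > best_score:
        best_W, best_score = W, score

baseline = projected_nb_loglik(np.eye(X.shape[1]), X, y)  # NB on the original axes
print(f"axis-aligned NB log-lik: {baseline:.3f}, best 2-D projection: {best_score:.3f}")
```

A 2-D projection found this way also supports the visualisation benefit the abstract mentions, since the projected data can be plotted directly with its fitted class-conditional densities.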
+
+
+
+
+ + ☆ Forward KL Regularized Preference Optimization for Aligning Diffusion + Policies + + +
+ Diffusion models have achieved remarkable success in sequential +decision-making by leveraging the highly expressive model capabilities in +policy learning. A central problem for learning diffusion policies is to align +the policy output with human intents in various tasks. To achieve this, +previous methods conduct return-conditioned policy generation or Reinforcement +Learning (RL)-based policy optimization, while they both rely on pre-defined +reward functions. In this work, we propose a novel framework, Forward KL +regularized Preference optimization for aligning Diffusion policies, to align +the diffusion policy with preferences directly. We first train a diffusion +policy from the offline dataset without considering the preference, and then +align the policy to the preference data via direct preference optimization. +During the alignment phase, we formulate direct preference learning in a +diffusion policy, where the forward KL regularization is employed in preference +optimization to avoid generating out-of-distribution actions. We conduct +extensive experiments for MetaWorld manipulation and D4RL tasks. The results +show our method exhibits superior alignment with preferences and outperforms +previous state-of-the-art algorithms. + +
+
+
+
+
+ + ☆ Joint Input and Output Coordination for Class-Incremental Learning IJCAI 2024 + + +
+ Incremental learning is nontrivial due to severe catastrophic forgetting. +Although storing a small amount of data on old tasks during incremental +learning is a feasible solution, current strategies still fail to 1) adequately +address the class bias problem, 2) alleviate the mutual interference +between new and old tasks, and 3) consider the problem of class bias within +tasks. This motivates us to propose a joint input and output coordination +(JIOC) mechanism to address these issues. This mechanism assigns different +weights to different categories of data according to the gradient of the output +score, and uses knowledge distillation (KD) to reduce the mutual interference +between the outputs of old and new tasks. The proposed mechanism is general and +flexible, and can be incorporated into different incremental learning +approaches that use memory storage. Extensive experiments show that our +mechanism can significantly improve their performance. + 
+
+ comment: 11 pages, 4 figures. Accepted by IJCAI 2024 
+
+
+
+
+ + ☆ Normalizing Energy Consumption for Hardware-Independent Evaluation + + +
+ The increasing use of machine learning (ML) models in signal processing has +raised concerns about their environmental impact, particularly during +resource-intensive training phases. In this study, we present a novel +methodology for normalizing energy consumption across different hardware +platforms to facilitate fair and consistent comparisons. We evaluate different +normalization strategies by measuring the energy used to train different ML +architectures on different GPUs, focusing on audio tagging tasks. Our approach +shows that the number of reference points, the type of regression, and the +inclusion of computational metrics significantly influence the normalization +process. We find that the appropriate selection of two reference points +provides robust normalization, while incorporating the number of floating-point +operations and parameters improves the accuracy of energy consumption +predictions. By supporting more accurate energy consumption evaluation, our +methodology promotes the development of environmentally sustainable ML +practices. + 
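One plausible reading of the normalization strategy is a regression that predicts measured energy from per-GPU reference-point measurements plus computational descriptors (FLOPs, parameter count), so runs on different hardware can be compared through their measured-to-predicted ratio. The sketch below uses entirely invented numbers and feature choices; the actual reference workloads and regression design are the paper's.

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Invented per-GPU "reference point" measurements: energy (kWh) of two small
# fixed reference workloads run on each hardware platform.
gpu_refs = {"gpu_a": (0.8, 2.1), "gpu_b": (1.3, 3.5), "gpu_c": (0.5, 1.4)}

# Invented training-run log: (gpu, GFLOPs, params in millions, measured kWh).
runs = [("gpu_a", 120.0, 6.0, 14.2), ("gpu_b", 120.0, 6.0, 23.9),
        ("gpu_c", 120.0, 6.0, 9.1),  ("gpu_a", 480.0, 25.0, 55.7),
        ("gpu_b", 480.0, 25.0, 93.0), ("gpu_c", 480.0, 25.0, 35.8)]

# Features: the two hardware reference points plus computational metrics.
X = np.array([[*gpu_refs[g], flops, params] for g, flops, params, _ in runs])
y = np.array([e for *_, e in runs])

reg = LinearRegression().fit(X, y)

# "Normalized" energy: ratio of measured to hardware-predicted energy; a value
# near 1.0 means the run cost what the hardware profile would suggest.
normalized = y / reg.predict(X)
for (g, flops, params, e), n in zip(runs, normalized):
    print(f"{g}: measured={e:5.1f} kWh, normalized={n:.2f}")
```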
+
+
+
+
+ + ☆ When resampling/reweighting improves feature learning in imbalanced + classification?: A toy-model study + + +
+ A toy model of binary classification is studied with the aim of clarifying +the class-wise resampling/reweighting effect on the feature learning +performance in the presence of class imbalance. In the analysis, a +high-dimensional limit of the feature is taken while keeping the ratio of the +dataset size to the feature dimension finite, and the non-rigorous replica method +from statistical mechanics is employed. The result shows that there exists a +case in which the no resampling/reweighting situation gives the best feature +learning performance irrespective of the choice of losses or classifiers, +supporting recent findings in Cao et al. (2019); Kang et al. (2019). It is also +revealed that the key to the result is the symmetry of the loss and the problem +setting. Inspired by this, we propose a further simplified model exhibiting the +same property for the multiclass setting. These results clarify when the class-wise +resampling/reweighting becomes effective in imbalanced classification. + 
+
+ comment: 30 pages, 14 figures +
+
+
+
+
+ + ☆ SynMorph: Generating Synthetic Face Morphing Dataset with Mated Samples + + +
+ Face morphing attack detection (MAD) algorithms have become essential to +overcome the vulnerability of face recognition systems. To address the lack of +large-scale and publicly available datasets due to privacy concerns and +restrictions, in this work we propose a new method to generate a synthetic face +morphing dataset with 2450 identities and more than 100k morphs. The proposed +synthetic face morphing dataset is unique for its high-quality samples, +different types of morphing algorithms, and the generalization for both single +and differential morphing attack detection algorithms. For experiments, we +apply face image quality assessment and vulnerability analysis to evaluate the +proposed synthetic face morphing dataset from the perspective of biometric +sample quality and morphing attack potential on face recognition systems. The +results are benchmarked against an existing SOTA synthetic dataset and a +representative non-synthetic dataset, and indicate an improvement compared with the SOTA. +Additionally, we design different protocols and study the applicability of +using the proposed synthetic dataset for training morphing attack detection +algorithms. + 
+
+
+
+
+ + ☆ Approximation Bounds for Recurrent Neural Networks with Application to + Regression + + +
+ We study the approximation capacity of deep ReLU recurrent neural networks +(RNNs) and explore the convergence properties of nonparametric least squares +regression using RNNs. We derive upper bounds on the approximation error of +RNNs for H\"older smooth functions, in the sense that the output at each time +step of an RNN can approximate a H\"older function that depends only on past +and current information, termed a past-dependent function. This allows a +carefully constructed RNN to simultaneously approximate a sequence of +past-dependent H\"older functions. We apply these approximation results to +derive non-asymptotic upper bounds for the prediction error of the empirical +risk minimizer in the regression problem. Our error bounds achieve the minimax optimal +rate under both exponentially $\beta$-mixing and i.i.d. data assumptions, +improving upon existing ones. Our results provide statistical guarantees on the +performance of RNNs. + 
+
+
+
+
+ + ☆ Learning to Model Graph Structural Information on MLPs via Graph + Structure Self-Contrasting + + +
+ Recent years have witnessed great success in handling graph-related tasks +with Graph Neural Networks (GNNs). However, most existing GNNs are based on +message passing to perform feature aggregation and transformation, where the +structural information is explicitly involved in the forward propagation by +coupling with node features through graph convolution at each layer. As a +result, subtle feature noise or structure perturbation may cause severe error +propagation, resulting in extremely poor robustness. In this paper, we rethink +the roles played by graph structural information in graph data training and +identify that message passing is not the only path to modeling structural +information. Inspired by this, we propose a simple but effective Graph +Structure Self-Contrasting (GSSC) framework that learns graph structural +information without message passing. The proposed framework is based purely on +Multi-Layer Perceptrons (MLPs), where the structural information is only +implicitly incorporated as prior knowledge to guide the computation of +supervision signals, substituting the explicit message propagation as in GNNs. +Specifically, it first applies structural sparsification to remove potentially +uninformative or noisy edges in the neighborhood, and then performs structural +self-contrasting in the sparsified neighborhood to learn robust node +representations. Finally, structural sparsification and self-contrasting are +formulated as a bi-level optimization problem and solved in a unified +framework. Extensive experiments have qualitatively and quantitatively +demonstrated that the GSSC framework can produce truly encouraging performance +with better generalization and robustness than other leading competitors. + +
+
+
+
+
+ + ☆ SciAgents: Automating scientific discovery through multi-agent + intelligent graph reasoning + + +
+ A key challenge in artificial intelligence is the creation of systems capable +of autonomously advancing scientific understanding by exploring novel domains, +identifying complex patterns, and uncovering previously unseen connections in +vast scientific data. In this work, we present SciAgents, an approach that +leverages three core concepts: (1) the use of large-scale ontological knowledge +graphs to organize and interconnect diverse scientific concepts, (2) a suite of +large language models (LLMs) and data retrieval tools, and (3) multi-agent +systems with in-situ learning capabilities. Applied to biologically inspired +materials, SciAgents reveals hidden interdisciplinary relationships that were +previously considered unrelated, achieving a scale, precision, and exploratory +power that surpasses traditional human-driven research methods. The framework +autonomously generates and refines research hypotheses, elucidating underlying +mechanisms, design principles, and unexpected material properties. By +integrating these capabilities in a modular fashion, the intelligent system +yields material discoveries, critiques and improves existing hypotheses, retrieves +up-to-date data about existing research, and highlights their strengths and +limitations. Our case studies demonstrate scalable capabilities to combine +generative AI, ontological representations, and multi-agent modeling, +harnessing a `swarm of intelligence' similar to biological systems. This +provides new avenues for materials discovery and accelerates the development of +advanced materials by unlocking Nature's design principles. + 
+
+
+
+
+ + ☆ CoBo: Collaborative Learning via Bilevel Optimization + + +
+ Collaborative learning is an important tool to train multiple clients more +effectively by enabling communication among clients. Identifying helpful +clients, however, is challenging and often introduces significant +overhead. In this paper, we model client-selection and model-training as two +interconnected optimization problems, proposing a novel bilevel optimization +problem for collaborative learning. We introduce CoBo, a scalable and elastic +SGD-type alternating optimization algorithm that efficiently addresses these +problems with theoretical convergence guarantees. Empirically, CoBo achieves +superior performance, surpassing popular personalization algorithms by 9.3% in +accuracy on a task with high heterogeneity, involving datasets distributed +among 80 clients. + 
+
+
+
+
+ + ☆ Interpolation, Extrapolation, Hyperpolation: Generalising into new + dimensions + + +
+ This paper introduces the concept of hyperpolation: a way of generalising +from a limited set of data points that is a peer to the more familiar concepts +of interpolation and extrapolation. Hyperpolation is the task of estimating the +value of a function at new locations that lie outside the subspace (or +manifold) of the existing data. We shall see that hyperpolation is possible and +explore its links to creativity in the arts and sciences. We will also examine +the role of hyperpolation in machine learning and suggest that the lack of +fundamental creativity in current AI systems is deeply connected to their +limited ability to hyperpolate. + +
+
+ comment: 22 pages, 8 figures +
+
+
+
+
+ + ☆ A general reduced-order neural operator for spatio-temporal predictive + learning on complex spatial domains + + +
+ Predictive learning for spatio-temporal processes (PL-STP) on complex spatial +domains plays a critical role in various scientific and engineering fields, +with its essence being the construction of operators between +infinite-dimensional function spaces. This paper focuses on unequal-domain +mappings in PL-STP, categorising them into increase-domain and +decrease-domain mappings. Recent advances in deep learning have revealed the +great potential of neural operators (NOs) to learn operators directly from +observational data. However, existing NOs require the input space and output space +to be the same domain, which poses challenges in ensuring predictive accuracy +and stability for unequal-domain mappings. To this end, this study presents a +general reduced-order neural operator named Reduced-Order Neural Operator on +Riemannian Manifolds (RO-NORM), which consists of two parts: the unequal-domain +encoder/decoder and the same-domain approximator. Motivated by the variable +separation in classical modal decomposition, the unequal-domain encoder/decoder +uses the pre-computed bases to reformulate the spatio-temporal function as a +sum of products between spatial (or temporal) bases and corresponding +temporally (or spatially) distributed weight functions, so that the original +unequal-domain mapping can be converted into a same-domain mapping. +Consequently, the same-domain approximator NORM is applied to model the +transformed mapping. The performance of our proposed method has been evaluated +on six benchmark cases, including parametric PDEs, engineering and biomedical +applications, and compared with four baseline algorithms: DeepONet, +POD-DeepONet, PCA-Net, and vanilla NORM. The experimental results demonstrate +the superiority of RO-NORM in prediction accuracy and training efficiency for +PL-STP. + 
+
+
+
+
+ + ☆ Optimizing VarLiNGAM for Scalable and Efficient Time Series Causal + Discovery + + +
+ Causal discovery is designed to identify causal relationships in data, a task +that has become increasingly complex due to the computational demands of +traditional methods such as VarLiNGAM, which combines the Vector Autoregressive +Model with the Linear Non-Gaussian Acyclic Model for time series data. + This study is dedicated to optimising causal discovery specifically for time +series data, which is common in practical applications. Time series causal +discovery is particularly challenging due to the need to account for temporal +dependencies and potential time lag effects. By designing a specialised dataset +generator and reducing the computational complexity of the VarLiNGAM model from +\( O(m^3 \cdot n) \) to \( O(m^3 + m^2 \cdot n) \), this study significantly +improves the feasibility of processing large datasets. The proposed methods +have been validated on advanced computational platforms and tested across +simulated, real-world, and large-scale datasets, showcasing enhanced efficiency +and performance. The optimised algorithm achieved a 7- to 13-times speedup +compared with the original algorithm and around a 4.5-times speedup compared with +the GPU-accelerated version on large-scale datasets with feature sizes between +200 and 400. + Our methods aim to push the boundaries of current causal discovery +capabilities, making them more robust, scalable, and applicable to real-world +scenarios, thus facilitating breakthroughs in various fields such as healthcare +and finance. + 
+
+
+
+
+ + ☆ Using machine learning for fault detection in lighthouse light sensors + + +
+ Lighthouses play a crucial role in ensuring maritime safety by signaling +hazardous areas such as dangerous coastlines, shoals, reefs, and rocks, along +with aiding harbor entries and aerial navigation. This is achieved through the +use of photoresistor sensors that activate or deactivate based on the time of +day. However, a significant issue is the potential malfunction of these +sensors, leading to the gradual misalignment of the light's operational timing. +This paper introduces an innovative machine learning-based approach for +automatically detecting such malfunctions. We evaluate four distinct +algorithms: decision trees, random forest, extreme gradient boosting, and +multi-layer perceptron. Our findings indicate that the multi-layer perceptron +is the most effective, capable of detecting timing discrepancies as small as +10-15 minutes. This accuracy makes it a highly efficient tool for automating +the detection of faults in lighthouse light sensors. + +
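The detection task reduces to a small tabular classification problem: given features describing when the photoresistor switched the light on and off relative to the expected times, decide whether the sensor has drifted. The snippet below trains the multi-layer perceptron variant on synthetic switch-time offsets; the feature definitions and drift magnitudes are invented for illustration, not taken from the paper's data.

```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(0)

def simulate_days(n, drift_minutes):
    """Offsets (minutes) of switch-on after sunset and switch-off before
    sunrise, for a sensor whose timing drifts by `drift_minutes`."""
    on_offset = rng.normal(loc=drift_minutes, scale=3.0, size=n)
    off_offset = rng.normal(loc=-drift_minutes, scale=3.0, size=n)
    return np.column_stack([on_offset, off_offset])

healthy = simulate_days(500, drift_minutes=0.0)
faulty = simulate_days(500, drift_minutes=12.0)   # roughly the 10-15 min misalignment regime
X = np.vstack([healthy, faulty])
y = np.array([0] * len(healthy) + [1] * len(faulty))

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
clf = MLPClassifier(hidden_layer_sizes=(32, 16), max_iter=2000, random_state=0)
clf.fit(X_tr, y_tr)
print("held-out accuracy:", clf.score(X_te, y_te))
```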
+
+
+
+
+ + ☆ CRADLE-VAE: Enhancing Single-Cell Gene Perturbation Modeling with + Counterfactual Reasoning-based Artifact Disentanglement + + +
+ Predicting cellular responses to various perturbations is a critical focus in +drug discovery and personalized therapeutics, with deep learning models playing +a significant role in this endeavor. Single-cell datasets contain technical +artifacts that may hinder the predictability of such models, raising +quality control issues that receive considerable attention in this area. To address this, we +propose CRADLE-VAE, a causal generative framework tailored for single-cell gene +perturbation modeling, enhanced with counterfactual reasoning-based artifact +disentanglement. Throughout training, CRADLE-VAE models the underlying latent +distribution of technical artifacts and perturbation effects present in +single-cell datasets. It employs counterfactual reasoning to effectively +disentangle such artifacts by modulating the latent basal spaces and learns +robust features for generating cellular response data with improved quality. +Experimental results demonstrate that this approach improves not only treatment +effect estimation performance but also generative quality. The +CRADLE-VAE codebase is publicly available at +https://github.com/dmis-lab/CRADLE-VAE. + 
+
+
+
+
+ + ☆ Advancing Machine Learning for Stellar Activity and Exoplanet Period + Rotation + + +
+ This study applied machine learning models to estimate stellar rotation +periods from corrected light curve data obtained by the NASA Kepler mission. +Traditional methods often struggle to estimate rotation periods accurately due +to noise and variability in the light curve data. The workflow involved using +initial period estimates from the LS-Periodogram and Transit Least Squares +techniques, followed by splitting the data into training, validation, and +testing sets. We employed several machine learning algorithms, including +Decision Tree, Random Forest, K-Nearest Neighbors, and Gradient Boosting, and +also utilized a Voting Ensemble approach to improve prediction accuracy and +robustness. + The analysis included data from multiple Kepler IDs, providing detailed +metrics on orbital periods and planet radii. Performance evaluation showed that +the Voting Ensemble model yielded the most accurate results, with an RMSE +approximately 50\% lower than the Decision Tree model and 17\% better than the +K-Nearest Neighbors model. The Random Forest model performed comparably to the +Voting Ensemble, indicating high accuracy. In contrast, the Gradient Boosting +model exhibited a worse RMSE compared to the other approaches. Comparisons of +the predicted rotation periods to the photometric reference periods showed +close alignment, suggesting the machine learning models achieved high +prediction accuracy. The results indicate that machine learning, particularly +ensemble methods, can effectively solve the problem of accurately estimating +stellar rotation periods, with significant implications for advancing the study +of exoplanets and stellar astrophysics. + +
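The ensembling step is straightforward to reproduce with scikit-learn: wrap the individual regressors in a VotingRegressor and compare RMSEs against each base model. The example below uses synthetic light-curve-style features rather than the Kepler data, so the numbers say nothing about the paper's results; it only shows the model-comparison scaffolding.

```python
import numpy as np
from sklearn.ensemble import (GradientBoostingRegressor, RandomForestRegressor,
                              VotingRegressor)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)

# Synthetic stand-in: features derived from a light curve (e.g. periodogram
# peak, amplitude, noise level) and a "true" rotation period in days.
n = 1000
X = rng.normal(size=(n, 5))
y = 10 + 3 * X[:, 0] - 2 * X[:, 1] + 0.5 * rng.standard_normal(n)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

models = {
    "tree": DecisionTreeRegressor(random_state=0),
    "rf": RandomForestRegressor(n_estimators=200, random_state=0),
    "knn": KNeighborsRegressor(n_neighbors=7),
    "gb": GradientBoostingRegressor(random_state=0),
}
models["vote"] = VotingRegressor(list(models.items()))   # average of the four base models

for name, model in models.items():
    model.fit(X_tr, y_tr)
    rmse = mean_squared_error(y_te, model.predict(X_te)) ** 0.5
    print(f"{name:5s} RMSE = {rmse:.3f}")
```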
+
+ comment: 15 pages, 8 figures. Submitted for publication in A&A +
+
+
+
+
+ + ☆ Retrofitting Temporal Graph Neural Networks with Transformer + + +
+ Temporal graph neural networks (TGNNs) outperform regular GNNs by +incorporating time information into graph-based operations. However, TGNNs +adopt specialized models (e.g., TGN, TGAT, and APAN) and require tailored +training frameworks (e.g., TGL and ETC). In this paper, we propose TF-TGN, +which uses a Transformer decoder as the backbone model for TGNNs to benefit from +the Transformer codebase for efficient training. In particular, the Transformer +has achieved tremendous success in language modeling, and thus the community +developed high-performance kernels (e.g., flash-attention and memory-efficient +attention) and efficient distributed training schemes (e.g., PyTorch FSDP, +DeepSpeed, and Megatron-LM). We observe that TGNN resembles language modeling, +i.e., the message aggregation operation between chronologically occurring nodes +and their temporal neighbors in TGNNs can be structured as sequence modeling. +Besides this similarity, we also incorporate a series of algorithm designs +including suffix infilling, temporal graph attention with self-loop, and causal +masking self-attention to make TF-TGN work. During training, existing systems +are slow in transforming the graph topology and conducting graph sampling. As +such, we propose methods to parallelize the CSR format conversion and graph +sampling. We also adapt the Transformer codebase to train TF-TGN efficiently with +multiple GPUs. We experiment with 9 graphs and compare with 2 state-of-the-art +TGNN training frameworks. The results show that TF-TGN can accelerate training +by over 2.20x while providing comparable or even superior accuracy to existing +SOTA TGNNs. TF-TGN is available at https://github.com/qianghuangwhu/TF-TGN. + 
+
+ comment: Under review (conference) 
+
+
+
+
+ + ☆ Reinforcement Learning for Variational Quantum Circuits Design + + +
+ Variational Quantum Algorithms have emerged as promising tools for solving +optimization problems on quantum computers. These algorithms leverage a +parametric quantum circuit called ansatz, where its parameters are adjusted by +a classical optimizer with the goal of optimizing a certain cost function. +However, a significant challenge lies in designing effective circuits for +addressing specific problems. In this study, we leverage the powerful and +flexible Reinforcement Learning paradigm to train an agent capable of +autonomously generating quantum circuits that can be used as ansatzes in +variational algorithms to solve optimization problems. The agent is trained on +diverse problem instances, including Maximum Cut, Maximum Clique and Minimum +Vertex Cover, built from different graph topologies and sizes. Our analysis of +the circuits generated by the agent and the corresponding solutions shows that +the proposed method is able to generate effective ansatzes. While our goal is +not to propose any new specific ansatz, we observe how the agent has discovered +a novel family of ansatzes effective for Maximum Cut problems, which we call +$R_{yz}$-connected. We study the characteristics of one of these ansatzes by +comparing it against state-of-the-art quantum algorithms across instances of +varying graph topologies, sizes, and problem types. Our results indicate that +the $R_{yz}$-connected circuit achieves high approximation ratios for Maximum +Cut problems, further validating our proposed agent. In conclusion, our study +highlights the potential of Reinforcement Learning techniques in assisting +researchers to design effective quantum circuits which could have applications +in a wide number of tasks. + +
+
+
+
+
+ + ☆ Beyond Flatland: A Geometric Take on Matching Methods for Treatment + Effect Estimation + + +
+ Matching is a popular approach in causal inference to estimate treatment +effects by pairing treated and control units that are most similar in terms of +their covariate information. However, classic matching methods completely +ignore the geometry of the data manifold, which is crucial to define a +meaningful distance for matching, and struggle when covariates are noisy and +high-dimensional. In this work, we propose GeoMatching, a matching method to +estimate treatment effects that takes into account the intrinsic data geometry +induced by existing causal mechanisms among the confounding variables. First, +we learn a low-dimensional, latent Riemannian manifold that accounts for +uncertainty and geometry of the original input data. Second, we estimate +treatment effects via matching in the latent space based on the learned latent +Riemannian metric. We provide theoretical insights and empirical results in +synthetic and real-world scenarios, demonstrating that GeoMatching yields more +effective treatment effect estimators, even as we increase input +dimensionality, in the presence of outliers, or in semi-supervised scenarios. + +
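+ A much-simplified sketch of matching in a latent space (not GeoMatching
+itself): the learned latent Riemannian metric and encoder are replaced by an
+identity map and Euclidean distance purely for illustration.
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+def matched_att(X_treated, y_treated, X_control, y_control, encode=lambda x: x):
+    z_t, z_c = encode(X_treated), encode(X_control)
+    nn = NearestNeighbors(n_neighbors=1).fit(z_c)  # 1-NN matching in latent space
+    _, idx = nn.kneighbors(z_t)
+    matched = y_control[idx.ravel()]
+    return float(np.mean(y_treated - matched))     # ATT estimate
+
+rng = np.random.default_rng(0)
+Xt, Xc = rng.normal(size=(50, 5)), rng.normal(size=(200, 5))
+yt, yc = rng.normal(1.0, 1.0, 50), rng.normal(0.0, 1.0, 200)
+print(matched_att(Xt, yt, Xc, yc))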
+
+
+
+
+ + ☆ State-Novelty Guided Action Persistence in Deep Reinforcement Learning + + +
+ While a powerful and promising approach, deep reinforcement learning (DRL)
+still suffers from sample inefficiency, which can be notably improved by
+resorting to more sophisticated techniques to address the
+exploration-exploitation dilemma. One such technique relies on action
+persistence (i.e., repeating an action over multiple steps). However, previous
+work exploiting action persistence either applies a fixed strategy or learns
+additional value functions (or policies) for selecting the repetition number.
+In this paper, we propose a novel method to dynamically adjust the action
+persistence based on the current exploration status of the state space. In
+this way, our method does not require training additional value functions or
+policies. Moreover, the use of a smooth scheduling of the repeat probability
+allows a more effective balance between exploration and exploitation.
+Furthermore, our method can be seamlessly integrated into various basic
+exploration strategies to incorporate temporal persistence. Finally, extensive
+experiments on different DMControl tasks demonstrate that our state-novelty
+guided action persistence method significantly improves sample efficiency.
+
+
+
+ comment: Under review +
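+ A generic sketch of novelty-driven action repetition (the direction and shape
+of the schedule below are our own assumptions, not necessarily the paper's
+rule): a count-based novelty proxy modulates the probability of repeating the
+previous action through a smooth schedule.
+
+import math
+import random
+from collections import defaultdict
+
+visit_counts = defaultdict(int)
+
+def repeat_probability(state_key, max_repeat_prob=0.8, scale=5.0):
+    visit_counts[state_key] += 1
+    novelty = 1.0 / math.sqrt(visit_counts[state_key])  # 1.0 on first visit, decays after
+    # One possible smooth schedule: repeat more in well-explored regions.
+    return max_repeat_prob * (1.0 - math.exp(-scale * (1.0 - novelty)))
+
+def select_action(state_key, previous_action, base_policy):
+    if previous_action is not None and random.random() < repeat_probability(state_key):
+        return previous_action       # persist the previous action
+    return base_policy(state_key)    # otherwise query the base exploration policy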
+
+
+
+
+ + ☆ HyperSMOTE: A Hypergraph-based Oversampling Approach for Imbalanced Node + Classifications + + +
+ Hypergraphs are increasingly utilized in both unimodal and multimodal data
+scenarios due to their superior ability to model and extract higher-order
+relationships among nodes, compared to traditional graphs. However, current
+hypergraph models are encountering challenges related to imbalanced data, as
+this imbalance can lead to biases in the model towards the more prevalent
+classes. While existing techniques, such as GraphSMOTE, have improved
+classification accuracy for minority samples in graph data, they still fall
+short when addressing the unique structure of hypergraphs. Inspired by the
+SMOTE concept, we propose HyperSMOTE as a solution to alleviate the class
+imbalance issue in hypergraph learning. This method involves a two-step
+process: first synthesizing minority-class nodes and then integrating these
+nodes into the original hypergraph. We synthesize new nodes based on samples
+from minority classes and their neighbors. At the same time, to solve the
+problem of integrating the new nodes into the hypergraph, we train a decoder
+based on the original hypergraph incidence matrix to adaptively associate the
+augmented nodes with hyperedges. We conduct extensive evaluations on multiple
+single-modality datasets, such as Cora, Cora-CA and Citeseer, as well as the
+multimodal conversation dataset MELD, to verify the effectiveness of
+HyperSMOTE, showing average accuracy gains of 3.38% and 2.97%, respectively.
+
+
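+ A minimal sketch of the SMOTE-style synthesis step only (the decoder that
+assigns augmented nodes to hyperedges is out of scope here; the neighbor map
+and sampling scheme are illustrative assumptions):
+
+import numpy as np
+
+def synthesize_minority_nodes(X, minority_idx, neighbors, n_new, seed=0):
+    """Interpolate between minority nodes and their sampled neighbors."""
+    rng = np.random.default_rng(seed)
+    new_feats = []
+    for _ in range(n_new):
+        i = rng.choice(minority_idx)
+        j = rng.choice(neighbors[i]) if neighbors.get(i) else i
+        lam = rng.uniform()
+        new_feats.append(X[i] + lam * (X[j] - X[i]))  # SMOTE-style interpolation
+    return np.stack(new_feats)
+
+X = np.random.rand(6, 4)                 # toy node features
+neighbors = {2: [3, 4], 3: [2], 4: [2]}  # toy minority neighborhood map
+print(synthesize_minority_nodes(X, [2, 3, 4], neighbors, n_new=3).shape)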
+
+
+
+
+ + ☆ Sequential Posterior Sampling with Diffusion Models + + +
+ Diffusion models have quickly risen in popularity for their ability to model
+complex distributions and perform effective posterior sampling. Unfortunately,
+the iterative nature of these generative models makes them computationally
+expensive and unsuitable for real-time sequential inverse problems such as
+ultrasound imaging. Considering the strong temporal structure across sequences
+of frames, we propose a novel approach that models the transition dynamics to
+improve the efficiency of sequential diffusion posterior sampling in
+conditional image synthesis. Through modeling sequence data using a video
+vision transformer (ViViT) transition model based on previous diffusion
+outputs, we can initialize the reverse diffusion trajectory at a lower noise
+scale, greatly reducing the number of iterations required for convergence. We
+demonstrate the effectiveness of our approach on a real-world dataset of high
+frame rate cardiac ultrasound images and show that it achieves the same
+performance as a full diffusion trajectory while accelerating inference
+25$\times$, enabling real-time posterior sampling. Furthermore, we show that
+the addition of a transition model improves the PSNR by up to 8% in cases with
+severe motion. Our method opens up new possibilities for real-time
+applications of diffusion models in imaging and other domains requiring
+real-time inference.
+
+
+
+ comment: 5 pages, 4 figures, preprint +
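+ A toy sketch of the efficiency idea (our reading, with a stub denoiser and
+standard DDPM schedules; it is not the paper's implementation): instead of
+starting the reverse diffusion from pure noise at step T, the transition
+model's prediction of the next frame is forward-noised to an intermediate step
+t_start < T and denoised from there, so only t_start reverse steps are needed.
+
+import numpy as np
+
+T = 1000
+betas = np.linspace(1e-4, 0.02, T)
+alphas = 1.0 - betas
+alpha_bar = np.cumprod(alphas)
+
+def denoiser(x, t):                  # placeholder epsilon-prediction network
+    return np.zeros_like(x)
+
+def sample_from_prediction(x_pred, t_start, rng=np.random.default_rng(0)):
+    eps = rng.standard_normal(x_pred.shape)
+    # Forward-noise the predicted frame to noise level t_start.
+    x = np.sqrt(alpha_bar[t_start]) * x_pred + np.sqrt(1 - alpha_bar[t_start]) * eps
+    # Standard DDPM reverse updates, run for t_start steps instead of T.
+    for t in range(t_start, 0, -1):
+        eps_hat = denoiser(x, t)
+        x = (x - betas[t] / np.sqrt(1 - alpha_bar[t]) * eps_hat) / np.sqrt(alphas[t])
+        if t > 1:
+            x += np.sqrt(betas[t]) * rng.standard_normal(x.shape)
+    return x
+
+frame = sample_from_prediction(np.zeros((64, 64)), t_start=200)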
+
+
+
+
+ + ☆ Shaking Up VLMs: Comparing Transformers and Structured State Space + Models for Vision & Language Modeling + + +
+ This study explores replacing Transformers in Visual Language Models (VLMs)
+with Mamba, a recent structured state space model (SSM) that demonstrates
+promising performance in sequence modeling. We test models up to 3B parameters
+under controlled conditions, showing that Mamba-based VLMs outperform
+Transformer-based VLMs in captioning, question answering, and reading
+comprehension. However, we find that Transformers achieve greater performance
+in visual grounding and that the performance gap widens with scale. We explore
+two hypotheses to explain this phenomenon: 1) the effect of task-agnostic
+visual encoding on the updates of the hidden states, and 2) the difficulty in
+performing visual grounding from the perspective of in-context multimodal
+retrieval. Our results indicate that a task-aware encoding yields minimal
+performance gains on grounding; however, Transformers significantly outperform
+Mamba at in-context multimodal retrieval. Overall, Mamba shows promising
+performance on tasks where the correct output relies on a summary of the image
+but struggles when retrieval of explicit information from the context is
+required.
+
+
+
+
+
+
+ + ☆ A Novel Representation of Periodic Pattern and Its Application to + Untrained Anomaly Detection + + +
+ There are a variety of industrial products that possess periodic textures or
+surfaces, such as carbon fiber textiles and display panels. Traditional
+image-based quality inspection methods for these products require identifying
+the periodic patterns from normal images (without anomaly and noise) and
+subsequently detecting anomaly pixels with inconsistent appearances. However,
+it remains challenging to accurately extract the periodic pattern from a single
+image in the presence of unknown anomalies and measurement noise. To deal with
+this challenge, this paper proposes a novel self-representation of the periodic
+image defined on a set of continuous parameters. In this way, periodic pattern
+learning can be embedded into a joint optimization framework, named
+periodic-sparse decomposition, that simultaneously models the sparse anomalies
+and Gaussian noise. Finally, for the real-world industrial images that may not
+strictly satisfy the periodic assumption, we propose a novel pixel-level
+anomaly scoring strategy to enhance the performance of anomaly detection. Both
+simulated and real-world case studies demonstrate the effectiveness of the
+proposed methodology for periodic pattern learning and anomaly detection.
+
+
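+ For intuition, a decomposition of this kind can be written (the paper's exact
+formulation may differ) as $\min_{\theta, S} \|Y - P(\theta) - S\|_F^2 +
+\lambda \|S\|_1$, where $Y$ is the observed image, $P(\theta)$ is the periodic
+self-representation with continuous parameters $\theta$, $S$ collects the
+sparse anomalies, and the squared Frobenius residual absorbs the Gaussian
+noise.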
+
+
+
+
+ + ☆ BAMDP Shaping: a Unified Theoretical Framework for Intrinsic Motivation + and Reward Shaping + + +
+ Intrinsic motivation (IM) and reward shaping are common methods for guiding
+the exploration of reinforcement learning (RL) agents by adding pseudo-rewards.
+Designing these rewards is challenging, however, and they can
+counter-intuitively harm performance. To address this, we characterize them as
+reward shaping in Bayes-Adaptive Markov Decision Processes (BAMDPs), which
+formalizes the value of exploration by formulating the RL process as updating a
+prior over possible MDPs through experience. RL algorithms can be viewed as
+BAMDP policies; instead of attempting to find optimal algorithms by solving
+BAMDPs directly, we use this formulation as a theoretical framework for
+understanding how pseudo-rewards guide suboptimal algorithms. By decomposing
+BAMDP state value into the value of the information collected plus the prior
+value of the physical state, we show how pseudo-rewards can help by
+compensating for RL algorithms' misestimation of these two terms, yielding a
+new typology of IM and reward shaping approaches. We carefully extend the
+potential-based shaping theorem to BAMDPs to prove that when pseudo-rewards are
+BAMDP Potential-based shaping Functions (BAMPFs), they preserve optimal, or
+approximately optimal, behavior of RL algorithms; otherwise, they can corrupt
+even optimal learners. Finally, we give guidance on how to design or convert
+existing pseudo-rewards to BAMPFs by expressing assumptions about the
+environment as potential functions on BAMDP states.
+
+
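+ For reference, the classical potential-based shaping function takes the form
+$F(s, a, s') = \gamma \Phi(s') - \Phi(s)$ for a potential $\Phi$ over states;
+the BAMPF condition described above applies the same construction with the
+potential defined on BAMDP (belief-augmented) states rather than on physical
+states alone.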
+
+
+
+
+ + ☆ Attention Based Machine Learning Methods for Data Reduction with + Guaranteed Error Bounds + + +
+ Scientific applications in fields such as high energy physics, computational
+fluid dynamics, and climate science generate vast amounts of data at high
+velocities. This exponential growth in data production is surpassing the
+advancements in computing power, network capabilities, and storage capacities.
+To address this challenge, data compression or reduction techniques are
+crucial. These scientific datasets have underlying data structures that consist
+of structured and block-structured multidimensional meshes, where each grid
+point corresponds to a tensor. It is important that data reduction techniques
+leverage the strong spatial and temporal correlations that are ubiquitous in
+these applications. Additionally, applications such as CFD process tensors
+comprising more than a hundred species and their attributes at each grid point.
+Reduction techniques should be able to leverage interrelationships between the
+elements in each tensor. In this paper, we propose an attention-based
+hierarchical compression method utilizing a block-wise compression setup. We
+introduce an attention-based hyper-block autoencoder to capture inter-block
+correlations, followed by a block-wise encoder to capture block-specific
+information. A PCA-based post-processing step is employed to guarantee error
+bounds for each data block. Our method effectively captures both spatiotemporal
+and inter-variable correlations within and between data blocks. Compared to the
+state-of-the-art SZ3, our method achieves up to 8 times higher compression
+ratios on the multi-variable S3D dataset. When evaluated on single-variable
+setups using the E3SM and XGC datasets, our method still achieves up to 3 times
+and 2 times higher compression ratios, respectively.
+
+
+
+
+
+
+ + ☆ IndicVoices-R: Unlocking a Massive Multilingual Multi-speaker Speech + Corpus for Scaling Indian TTS + + +
+ Recent advancements in text-to-speech (TTS) synthesis show that large-scale
+models trained with extensive web data produce highly natural-sounding output.
+However, such data is scarce for Indian languages due to the lack of
+high-quality, manually subtitled data on platforms like LibriVox or YouTube. To
+address this gap, we enhance existing large-scale ASR datasets containing
+natural conversations collected in low-quality environments to generate
+high-quality TTS training data. Our pipeline leverages the cross-lingual
+generalization of denoising and speech enhancement models trained on English
+and applied to Indian languages. This results in IndicVoices-R (IV-R), the
+largest multilingual Indian TTS dataset derived from an ASR dataset, with 1,704
+hours of high-quality speech from 10,496 speakers across 22 Indian languages.
+IV-R matches the quality of gold-standard TTS datasets like LJSpeech, LibriTTS,
+and IndicTTS. We also introduce the IV-R Benchmark, the first to assess
+zero-shot, few-shot, and many-shot speaker generalization capabilities of TTS
+models on Indian voices, ensuring diversity in age, gender, and style. We
+demonstrate that fine-tuning an English pre-trained model on a combined dataset
+of high-quality IndicTTS and our IV-R dataset results in better zero-shot
+speaker generalization compared to fine-tuning on the IndicTTS dataset alone.
+Further, our evaluation reveals limited zero-shot generalization for Indian
+voices in TTS models trained on prior datasets, which we improve by fine-tuning
+the model on our data containing a diverse set of speakers across language
+families. We open-source all data and code, releasing the first TTS model for
+all 22 official Indian languages.
+
+
+
+
+
+
+ + ☆ Recursive Nested Filtering for Efficient Amortized Bayesian Experimental + Design + + +
+ This paper introduces the Inside-Out Nested Particle Filter (IO-NPF), a +novel, fully recursive, algorithm for amortized sequential Bayesian +experimental design in the non-exchangeable setting. We frame policy +optimization as maximum likelihood estimation in a non-Markovian state-space +model, achieving (at most) $\mathcal{O}(T^2)$ computational complexity in the +number of experiments. We provide theoretical convergence guarantees and +introduce a backward sampling algorithm to reduce trajectory degeneracy. IO-NPF +offers a practical, extensible, and provably consistent approach to sequential +Bayesian experimental design, demonstrating improved efficiency over existing +methods. + +
+
+
+
+
+ + ☆ On the Convergence Analysis of Over-Parameterized Variational + Autoencoders: A Neural Tangent Kernel Perspective + + +
+ Variational Auto-Encoders (VAEs) have emerged as powerful probabilistic +models for generative tasks. However, their convergence properties have not +been rigorously proven. The challenge of proving convergence is inherently +difficult due to the highly non-convex nature of the training objective and the +implementation of a Stochastic Neural Network (SNN) within VAE architectures. +This paper addresses these challenges by characterizing the optimization +trajectory of SNNs utilized in VAEs through the lens of Neural Tangent Kernel +(NTK) techniques. These techniques govern the optimization and generalization +behaviors of ultra-wide neural networks. We provide a mathematical proof of VAE +convergence under mild assumptions, thus advancing the theoretical +understanding of VAE optimization dynamics. Furthermore, we establish a novel +connection between the optimization problem faced by over-parameterized SNNs +and the Kernel Ridge Regression (KRR) problem. Our findings not only contribute +to the theoretical foundation of VAEs but also open new avenues for +investigating the optimization of generative models using advanced kernel +methods. Our theoretical claims are verified by experimental simulations. + +
+
+ comment: Accepted by Machine Learning journal +
+
+
+
+
+ + ☆ TriplePlay: Enhancing Federated Learning with CLIP for Non-IID Data and + Resource Efficiency + + +
+ The rapid advancement and increasing complexity of pretrained models,
+exemplified by CLIP, offer significant opportunities as well as challenges for
+Federated Learning (FL), a critical component of privacy-preserving artificial
+intelligence. This research delves into the intricacies of integrating large
+foundation models like CLIP within FL frameworks to enhance privacy,
+efficiency, and adaptability across heterogeneous data landscapes. It
+specifically addresses the challenges posed by non-IID data distributions, the
+computational and communication overheads of leveraging such complex models,
+and the skewed representation of classes within datasets. We propose
+TriplePlay, a framework that integrates CLIP as an adapter to enhance FL's
+adaptability and performance across diverse data distributions. This approach
+addresses the long-tail distribution challenge to ensure fairness while
+reducing resource demands through quantization and low-rank adaptation
+techniques. Our simulation results demonstrate that TriplePlay effectively
+decreases GPU usage costs and speeds up the learning process, achieving
+convergence with reduced communication overhead.
+
+
+
+
+
+
+ + ☆ GDFlow: Anomaly Detection with NCDE-based Normalizing Flow for Advanced + Driver Assistance System + + +
+ For electric vehicles, the Adaptive Cruise Control (ACC) in Advanced Driver +Assistance Systems (ADAS) is designed to assist braking based on driving +conditions, road inclines, predefined deceleration strengths, and user braking +patterns. However, the driving data collected during the development of ADAS +are generally limited and lack diversity. This deficiency leads to late or +aggressive braking for different users. Crucially, it is necessary to +effectively identify anomalies, such as unexpected or inconsistent braking +patterns in ADAS, especially given the challenge of working with unlabelled, +limited, and noisy datasets from real-world electric vehicles. In order to +tackle the aforementioned challenges in ADAS, we propose Graph Neural +Controlled Differential Equation Normalizing Flow (GDFlow), a model that +leverages Normalizing Flow (NF) with Neural Controlled Differential Equations +(NCDE) to learn the distribution of normal driving patterns continuously. +Compared to the traditional clustering or anomaly detection algorithms, our +approach effectively captures the spatio-temporal information from different +sensor data and more accurately models continuous changes in driving patterns. +Additionally, we introduce a quantile-based maximum likelihood objective to +improve the likelihood estimate of the normal data near the boundary of the +distribution, enhancing the model's ability to distinguish between normal and +anomalous patterns. We validate GDFlow using real-world electric vehicle +driving data that we collected from Hyundai IONIQ5 and GV80EV, achieving +state-of-the-art performance compared to six baselines across four dataset +configurations of different vehicle types and drivers. Furthermore, our model +outperforms the latest anomaly detection methods across four time series +benchmark datasets. Our approach demonstrates superior efficiency in inference +time compared to existing methods. + +
+
+
+
+
+ + ☆ Robust Non-adaptive Group Testing under Errors in Group Membership + Specifications + + +
+ Given $p$ samples, each of which may or may not be defective, group testing
+(GT) aims to determine their defect status by performing tests on $n < p$
+`groups', where a group is formed by mixing a subset of the $p$ samples.
+Assuming that the number of defective samples is very small compared to $p$, GT
+algorithms have provided excellent recovery of the status of all $p$ samples
+with even a small number of groups. Most existing methods, however, assume that
+the group memberships are accurately specified. This assumption may not hold in
+all applications, due to various resource constraints. Such errors could occur,
+e.g., when a technician preparing the groups in a laboratory unknowingly mixes
+together an incorrect subset of samples compared to what was specified. We
+develop a new GT method, the Debiased Robust Lasso Test Method (DRLT), that
+handles such group membership specification errors. The proposed DRLT method is
+based on an approach to debias, or reduce the inherent bias in, estimates
+produced by Lasso, a popular and effective sparse regression technique. We also
+provide theoretical upper bounds on the reconstruction error produced by our
+estimator. Our approach is then combined with two carefully designed hypothesis
+tests respectively for (i) the identification of defective samples in the
+presence of errors in group membership specifications, and (ii) the
+identification of groups with erroneous membership specifications. The DRLT
+approach extends the literature on bias mitigation of statistical estimators
+such as the Lasso to handle the important case when some of the measurements
+contain outliers, due to factors such as group membership specification errors.
+We present numerical results that show that our approach outperforms several
+baselines and robust regression techniques for identification of defective
+samples as well as erroneously specified groups.
+
+
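+ A small sketch of the underlying sparse-regression view of group testing (the
+DRLT debiasing step and the two hypothesis tests are not reproduced here;
+group sizes and noise levels are illustrative):
+
+import numpy as np
+from sklearn.linear_model import Lasso
+
+rng = np.random.default_rng(0)
+p, n, k = 200, 60, 5                             # samples, groups, defectives
+A = (rng.random((n, p)) < 0.05).astype(float)    # specified group-membership matrix
+x_true = np.zeros(p)
+x_true[rng.choice(p, k, replace=False)] = 1.0    # sparse defect indicator vector
+y = A @ x_true + 0.01 * rng.standard_normal(n)   # noisy pooled measurements
+
+x_hat = Lasso(alpha=0.01, positive=True).fit(A, y).coef_
+print(sorted(np.flatnonzero(x_hat > 0.5)), sorted(np.flatnonzero(x_true)))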
+
+
+
+
+ + ☆ Graffin: Stand for Tails in Imbalanced Node Classification + + +
+ Graph representation learning (GRL) models have succeeded in many scenarios.
+Real-world graphs have imbalanced distributions, such as of node labels and
+degrees, which poses a critical challenge to GRL. Imbalanced inputs can lead
+to imbalanced outputs. However, most existing works ignore this and assume that
+the distribution of input graphs is balanced, which does not align with real
+situations and results in worse model performance on tail data. The dominance
+of head data leaves tail data underrepresented when training graph neural
+networks (GNNs). Thus, we propose Graffin, a pluggable tail-data augmentation
+module, to address these issues. Inspired by recurrent neural networks
+(RNNs), Graffin flows head features into tail data through graph serialization
+techniques to alleviate the imbalance of tail representations. Local and
+global structures are fused to form the node representation under the combined
+effect of neighborhood and sequence information, which enriches the semantics
+of tail data. We validate the performance of Graffin on four real-world
+datasets in node classification tasks. Results show that Graffin can improve
+adaptation to tail data without significantly degrading overall model
+performance.
+
+
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ A Multi-Modal Deep Learning Based Approach for House Price Prediction + + +
+ Accurate prediction of house price, a vital aspect of the residential real
+estate sector, is of substantial interest for a wide range of stakeholders.
+However, predicting house prices is a complex task due to the significant
+variability influenced by factors such as house features, location,
+neighborhood, and many others. Despite numerous attempts utilizing a wide array
+of algorithms, including recent deep learning techniques, to predict house
+prices accurately, existing approaches have fallen short of considering a wide
+range of factors such as textual and visual features. This paper addresses this
+gap by comprehensively incorporating attributes, such as features, textual
+descriptions, geo-spatial neighborhood, and house images, typically showcased
+in real estate listings, in a house price prediction system. Specifically, we
+propose a multi-modal deep learning approach that leverages different types of
+data to learn a more accurate representation of the house. In particular, we
+learn a joint embedding of raw house attributes, geo-spatial neighborhood, and,
+most importantly, the textual description and images representing the house,
+and finally use a downstream regression model to predict the house price from
+this jointly learned embedding vector. Our experimental results with a
+real-world dataset show that the text embedding of the house advertisement
+description and the image embedding of the house pictures, in addition to raw
+attributes and geo-spatial embedding, can significantly improve house price
+prediction accuracy. The relevant source code and dataset are publicly
+accessible at the following URL: https://github.com/4P0N/mhpp
+
+
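+ A toy PyTorch sketch of the joint-embedding idea (modality encoders are
+replaced by pre-extracted feature vectors and all dimensions are illustrative;
+this is not the released implementation):
+
+import torch
+import torch.nn as nn
+
+class HousePriceModel(nn.Module):
+    def __init__(self, attr_dim, text_dim, img_dim, geo_dim, emb_dim=128):
+        super().__init__()
+        self.proj = nn.ModuleDict({
+            "attr": nn.Linear(attr_dim, emb_dim),
+            "text": nn.Linear(text_dim, emb_dim),
+            "img": nn.Linear(img_dim, emb_dim),
+            "geo": nn.Linear(geo_dim, emb_dim),
+        })
+        self.regressor = nn.Sequential(
+            nn.Linear(4 * emb_dim, 256), nn.ReLU(), nn.Linear(256, 1))
+
+    def forward(self, attr, text, img, geo):
+        # Concatenate per-modality embeddings into one joint vector.
+        z = torch.cat([self.proj["attr"](attr), self.proj["text"](text),
+                       self.proj["img"](img), self.proj["geo"](geo)], dim=-1)
+        return self.regressor(z).squeeze(-1)  # predicted price
+
+model = HousePriceModel(attr_dim=20, text_dim=768, img_dim=512, geo_dim=16)
+price = model(torch.randn(4, 20), torch.randn(4, 768),
+              torch.randn(4, 512), torch.randn(4, 16))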
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ ICPR 2024 Competition on Safe Segmentation of Drive Scenes in + Unstructured Traffic and Adverse Weather Conditions ICPR + + +
+ The ICPR 2024 Competition on Safe Segmentation of Drive Scenes in +Unstructured Traffic and Adverse Weather Conditions served as a rigorous +platform to evaluate and benchmark state-of-the-art semantic segmentation +models under challenging conditions for autonomous driving. Over several +months, participants were provided with the IDD-AW dataset, consisting of 5000 +high-quality RGB-NIR image pairs, each annotated at the pixel level and +captured under adverse weather conditions such as rain, fog, low light, and +snow. A key aspect of the competition was the use and improvement of the Safe +mean Intersection over Union (Safe mIoU) metric, designed to penalize unsafe +incorrect predictions that could be overlooked by traditional mIoU. This +innovative metric emphasized the importance of safety in developing autonomous +driving systems. The competition showed significant advancements in the field, +with participants demonstrating models that excelled in semantic segmentation +and prioritized safety and robustness in unstructured and adverse conditions. +The results of the competition set new benchmarks in the domain, highlighting +the critical role of safety in deploying autonomous vehicles in real-world +scenarios. The contributions from this competition are expected to drive +further innovation in autonomous driving technology, addressing the critical +challenges of operating in diverse and unpredictable environments. + +
+
+ comment: 15 pages, 7 figures, ICPR Competition Paper +
+
+
+
+
+ + ☆ Sample-Efficient Bayesian Optimization with Transfer Learning for + Heterogeneous Search Spaces + + +
+ Bayesian optimization (BO) is a powerful approach to sample-efficient +optimization of black-box functions. However, in settings with very few +function evaluations, a successful application of BO may require transferring +information from historical experiments. These related experiments may not have +exactly the same tunable parameters (search spaces), motivating the need for BO +with transfer learning for heterogeneous search spaces. In this paper, we +propose two methods for this setting. The first approach leverages a Gaussian +process (GP) model with a conditional kernel to transfer information between +different search spaces. Our second approach treats the missing parameters as +hyperparameters of the GP model that can be inferred jointly with the other GP +hyperparameters or set to fixed values. We show that these two methods perform +well on several benchmark problems. + +
+
+
+
+
+ + ☆ Tele-LLMs: A Series of Specialized Large Language Models for + Telecommunications + + +
+ The emergence of large language models (LLMs) has significantly impacted +various fields, from natural language processing to sectors like medicine and +finance. However, despite their rapid proliferation, the applications of LLMs +in telecommunications remain limited, often relying on general-purpose models +that lack domain-specific specialization. This lack of specialization results +in underperformance, particularly when dealing with telecommunications-specific +technical terminology and their associated mathematical representations. This +paper addresses this gap by first creating and disseminating Tele-Data, a +comprehensive dataset of telecommunications material curated from relevant +sources, and Tele-Eval, a large-scale question-and-answer dataset tailored to +the domain. Through extensive experiments, we explore the most effective +training techniques for adapting LLMs to the telecommunications domain, ranging +from examining the division of expertise across various telecommunications +aspects to employing parameter-efficient techniques. We also investigate how +models of different sizes behave during adaptation and analyze the impact of +their training data on this behavior. Leveraging these findings, we develop and +open-source Tele-LLMs, the first series of language models ranging from 1B to +8B parameters, specifically tailored for telecommunications. Our evaluations +demonstrate that these models outperform their general-purpose counterparts on +Tele-Eval while retaining their previously acquired capabilities, thus avoiding +the catastrophic forgetting phenomenon. + +
+
+
+
+
+ + ☆ Closed-Form Interpretation of Neural Network Latent Spaces with Symbolic + Gradients + + +
+ It has been demonstrated in many scientific fields that artificial neural +networks like autoencoders or Siamese networks encode meaningful concepts in +their latent spaces. However, there does not exist a comprehensive framework +for retrieving this information in a human-readable form without prior +knowledge. In order to extract these concepts, we introduce a framework for +finding closed-form interpretations of neurons in latent spaces of artificial +neural networks. The interpretation framework is based on embedding trained +neural networks into an equivalence class of functions that encode the same +concept. We interpret these neural networks by finding an intersection between +the equivalence class and human-readable equations defined by a symbolic search +space. The approach is demonstrated by retrieving invariants of matrices and +conserved quantities of dynamical systems from latent spaces of Siamese neural +networks. + +
+
+
+
+
+ + ☆ Resource-Efficient Generative AI Model Deployment in Mobile Edge + Networks + + +
+ The surging development of Artificial Intelligence-Generated Content (AIGC)
+marks a transformative era of content creation and production. Edge servers
+promise attractive benefits, e.g., reduced service delay and backhaul traffic
+load, for hosting AIGC services compared to cloud-based solutions. However, the
+scarcity of available resources on the edge poses significant challenges to
+deploying generative AI models. In this paper, by characterizing the resource
+and delay demands of typical generative AI models, we find that the consumption
+of storage and GPU memory, as well as the model switching delay represented by
+I/O delay during the preloading phase, are significant and vary across models.
+These multidimensional coupling factors render it difficult to make efficient
+edge model deployment decisions. Hence, we present a collaborative edge-cloud
+framework aiming to properly manage generative AI model deployment on the edge.
+Specifically, we formulate the edge model deployment problem, considering the
+heterogeneous features of models, as an optimization problem and propose a
+model-level decision selection algorithm to solve it. It enables pooled
+resource sharing and optimizes the trade-off between resource consumption and
+delay in edge generative AI model deployment. Simulation results validate the
+efficacy of the proposed algorithm compared with baselines, demonstrating its
+potential to reduce overall costs by providing feature-aware model deployment
+decisions.
+
+
+
+
+
+
+ + ☆ TERD: A Unified Framework for Safeguarding Diffusion Models Against + Backdoors + + +
+ Diffusion models have achieved notable success in image generation, but they +remain highly vulnerable to backdoor attacks, which compromise their integrity +by producing specific undesirable outputs when presented with a pre-defined +trigger. In this paper, we investigate how to protect diffusion models from +this dangerous threat. Specifically, we propose TERD, a backdoor defense +framework that builds unified modeling for current attacks, which enables us to +derive an accessible reversed loss. A trigger reversion strategy is further +employed: an initial approximation of the trigger through noise sampled from a +prior distribution, followed by refinement through differential multi-step +samplers. Additionally, with the reversed trigger, we propose backdoor +detection from the noise space, introducing the first backdoor input detection +approach for diffusion models and a novel model detection algorithm that +calculates the KL divergence between reversed and benign distributions. +Extensive evaluations demonstrate that TERD secures a 100% True Positive Rate +(TPR) and True Negative Rate (TNR) across datasets of varying resolutions. TERD +also demonstrates nice adaptability to other Stochastic Differential Equation +(SDE)-based models. Our code is available at https://github.com/PKU-ML/TERD. + +
+
+
+
+
+ + ☆ Mpox Narrative on Instagram: A Labeled Multilingual Dataset of Instagram + Posts on Mpox for Sentiment, Hate Speech, and Anxiety Analysis + + +
+ The world is currently experiencing an outbreak of mpox, which has been
+declared a Public Health Emergency of International Concern by WHO. No prior
+work related to social media mining has focused on the development of a dataset
+of Instagram posts about the mpox outbreak. The work presented in this paper
+aims to address this research gap and makes two scientific contributions to
+this field. First, it presents a multilingual dataset of 60,127 Instagram posts
+about mpox, published between July 23, 2022, and September 5, 2024. The
+dataset, available at https://dx.doi.org/10.21227/7fvc-y093, contains Instagram
+posts about mpox in 52 languages. For each of these posts, the Post ID, Post
+Description, Date of publication, language, and translated version of the post
+(translation to English was performed using the Google Translate API) are
+presented as separate attributes in the dataset. After developing this dataset,
+sentiment analysis, hate speech detection, and anxiety or stress detection were
+performed. This process included classifying each post into (i) one of the
+sentiment classes, i.e., fear, surprise, joy, sadness, anger, disgust, or
+neutral, (ii) hate or not hate, and (iii) anxiety/stress detected or no
+anxiety/stress detected. These results are presented as separate attributes in
+the dataset. Second, this paper presents the results of performing sentiment
+analysis, hate speech analysis, and anxiety or stress analysis. The
+distribution of the sentiment classes - fear, surprise, joy, sadness, anger,
+disgust, and neutral - was observed to be 27.95%, 2.57%, 8.69%, 5.94%, 2.69%,
+1.53%, and 50.64%, respectively. In terms of hate speech detection, 95.75% of
+the posts did not contain hate, and the remaining 4.25% of the posts contained
+hate. Finally, 72.05% of the posts did not indicate any anxiety/stress, and the
+remaining 27.95% of the posts represented some form of anxiety/stress.
+
+
+
+
+
+
+ + ☆ Towards Fast Rates for Federated and Multi-Task Reinforcement Learning + + +
+ We consider a setting involving $N$ agents, where each agent interacts with
+an environment modeled as a Markov Decision Process (MDP). The agents' MDPs
+differ in their reward functions, capturing heterogeneous objectives/tasks. The
+collective goal of the agents is to communicate intermittently via a central
+server to find a policy that maximizes the average of long-term cumulative
+rewards across environments. The limited existing work on this topic either
+provides only asymptotic rates, generates biased policies, or fails to
+establish any benefits of collaboration. In response, we propose Fast-FedPG - a
+novel federated policy gradient algorithm with a carefully designed
+bias-correction mechanism. Under a gradient-domination condition, we prove that
+our algorithm guarantees (i) fast linear convergence with exact gradients, and
+(ii) sub-linear rates that enjoy a linear speedup w.r.t. the number of agents
+with noisy, truncated policy gradients. Notably, in each case, the convergence
+is to a globally optimal policy with no heterogeneity-induced bias. In the
+absence of gradient-domination, we establish convergence to a first-order
+stationary point at a rate that continues to benefit from collaboration.
+
+
+
+ comment: Accepted to the Decision and Control Conference (CDC), 2024 +
+
+
+
+
+ + ☆ Efficiently Learning Markov Random Fields from Dynamics + + +
+ An important task in high-dimensional statistics is learning the parameters +or dependency structure of an undirected graphical model, or Markov random +field (MRF). Much of the prior work on this problem assumes access to i.i.d. +samples from the MRF distribution and state-of-the-art algorithms succeed using +$n^{\Theta(k)}$ runtime, where $n$ is the dimension and $k$ is the order of the +interactions. However, well-known reductions from the sparse parity with noise +problem imply that given i.i.d. samples from a sparse, order-$k$ MRF, any +learning algorithm likely requires $n^{\Omega(k)}$ time, impeding the potential +for significant computational improvements. In this work, we demonstrate that +these fundamental barriers for learning MRFs can surprisingly be completely +circumvented when learning from natural, dynamical samples. We show that in +bounded-degree MRFs, the dependency structure and parameters can be recovered +using a trajectory of Glauber dynamics of length $O(n \log n)$ with runtime +$O(n^2 \log n)$. The implicit constants depend only on the degree and +non-degeneracy parameters of the model, but not the dimension $n$. In +particular, learning MRFs from dynamics is $\textit{provably computationally +easier}$ than learning from i.i.d. samples under standard hardness assumptions. + +
+
+ comment: 40 pages, 3 figures +
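+ For concreteness, a sketch of the data-generating process only (a Glauber
+trajectory for a pairwise Ising-type MRF; the recovery algorithm itself is not
+reproduced, and the coupling scale is arbitrary):
+
+import numpy as np
+
+def glauber_trajectory(J, h, steps, seed=0):
+    rng = np.random.default_rng(seed)
+    n = len(h)
+    x = rng.choice([-1, 1], size=n)
+    traj = [x.copy()]
+    for _ in range(steps):
+        i = rng.integers(n)                      # resample one site uniformly
+        field = h[i] + J[i] @ x                  # local field (J has zero diagonal)
+        p_plus = 1.0 / (1.0 + np.exp(-2.0 * field))
+        x[i] = 1 if rng.random() < p_plus else -1
+        traj.append(x.copy())
+    return np.array(traj)
+
+n = 10
+J = np.triu(np.random.default_rng(1).normal(0, 0.3, (n, n)), k=1)
+J = J + J.T                                      # symmetric, zero-diagonal couplings
+traj = glauber_trajectory(J, np.zeros(n), steps=int(50 * n * np.log(n)))
+print(traj.shape)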
+
+
+
+
+ + ☆ Learning Submodular Sequencing from Samples + + +
+ This paper addresses the problem of sequential submodular maximization: +selecting and ranking items in a sequence to optimize some composite submodular +function. In contrast to most of the previous works, which assume access to the +utility function, we assume that we are given only a set of samples. Each +sample includes a random sequence of items and its associated utility. We +present an algorithm that, given polynomially many samples drawn from a +two-stage uniform distribution, achieves an approximation ratio dependent on +the curvature of individual submodular functions. Our results apply in a wide +variety of real-world scenarios, such as ranking products in online retail +platforms, where complete knowledge of the utility function is often impossible +to obtain. Our algorithm gives an empirically useful solution in such contexts, +thus proving that limited data can be of great use in sequencing tasks. From a +technical perspective, our results extend prior work on ``optimization from +samples'' by generalizing from optimizing a set function to a +sequence-dependent function. + +
+
+
+
+
+ + ☆ Towards Automated Machine Learning Research + + +
+ This paper explores a top-down approach to automating incremental advances in +machine learning research through component-level innovation, facilitated by +Large Language Models (LLMs). Our framework systematically generates novel +components, validates their feasibility, and evaluates their performance +against existing baselines. A key distinction of this approach lies in how +these novel components are generated. Unlike traditional AutoML and NAS +methods, which often rely on a bottom-up combinatorial search over predefined, +hardcoded base components, our method leverages the cross-domain knowledge +embedded in LLMs to propose new components that may not be confined to any +hard-coded predefined set. By incorporating a reward model to prioritize +promising hypotheses, we aim to improve the efficiency of the hypothesis +generation and evaluation process. We hope this approach offers a new avenue +for exploration and contributes to the ongoing dialogue in the field. + +
+
+
+
+
+ + ☆ Label-free evaluation of lung and heart transplant biopsies using + virtual staining + + +
+ Organ transplantation serves as the primary therapeutic strategy for +end-stage organ failures. However, allograft rejection is a common complication +of organ transplantation. Histological assessment is essential for the timely +detection and diagnosis of transplant rejection and remains the gold standard. +Nevertheless, the traditional histochemical staining process is time-consuming, +costly, and labor-intensive. Here, we present a panel of virtual staining +neural networks for lung and heart transplant biopsies, which digitally convert +autofluorescence microscopic images of label-free tissue sections into their +brightfield histologically stained counterparts, bypassing the traditional +histochemical staining process. Specifically, we virtually generated +Hematoxylin and Eosin (H&E), Masson's Trichrome (MT), and Elastic Verhoeff-Van +Gieson (EVG) stains for label-free transplant lung tissue, along with H&E and +MT stains for label-free transplant heart tissue. Subsequent blind evaluations +conducted by three board-certified pathologists have confirmed that the virtual +staining networks consistently produce high-quality histology images with high +color uniformity, closely resembling their well-stained histochemical +counterparts across various tissue features. The use of virtually stained +images for the evaluation of transplant biopsies achieved comparable diagnostic +outcomes to those obtained via traditional histochemical staining, with a +concordance rate of 82.4% for lung samples and 91.7% for heart samples. +Moreover, virtual staining models create multiple stains from the same +autofluorescence input, eliminating structural mismatches observed between +adjacent sections stained in the traditional workflow, while also saving +tissue, expert time, and staining costs. + +
+
+ comment: 21 Pages, 5 Figures +
+
+
+
+
+ + ♻ ☆ Pre-processing and Compression: Understanding Hidden Representation + Refinement Across Imaging Domains via Intrinsic Dimension + + +
+ In recent years, there has been interest in how geometric properties such as +intrinsic dimension (ID) of a neural network's hidden representations change +through its layers, and how such properties are predictive of important model +behavior such as generalization ability. However, evidence has begun to emerge +that such behavior can change significantly depending on the domain of the +network's training data, such as natural versus medical images. Here, we +further this inquiry by exploring how the ID of a network's learned +representations changes through its layers, in essence, characterizing how the +network successively refines the information content of input data to be used +for predictions. Analyzing eleven natural and medical image datasets across six +network architectures, we find that how ID changes through the network differs +noticeably between natural and medical image models. Specifically, medical +image models peak in representation ID earlier in the network, implying a +difference in the image features and their abstractness that are typically used +for downstream tasks in these domains. Additionally, we discover a strong +correlation of this peak representation ID with the ID of the data in its input +space, implying that the intrinsic information content of a model's learned +representations is guided by that of the data it was trained on. Overall, our +findings emphasize notable discrepancies in network behavior between natural +and non-natural imaging domains regarding hidden representation information +content, and provide further insights into how a network's learned features are +shaped by its training data. + +
+
+
+
+
+ + ♻ ☆ Bi-Directional Transformers vs. word2vec: Discovering Vulnerabilities in + Lifted Compiled Code + + +
+ Detecting vulnerabilities within compiled binaries is challenging due to lost +high-level code structures and other factors such as architectural +dependencies, compilers, and optimization options. To address these obstacles, +this research explores vulnerability detection using natural language +processing (NLP) embedding techniques with word2vec, BERT, and RoBERTa to learn +semantics from intermediate representation (LLVM IR) code. Long short-term +memory (LSTM) neural networks were trained on embeddings from encoders created +using approximately 48k LLVM functions from the Juliet dataset. This study is +pioneering in its comparison of word2vec models with multiple bidirectional +transformer (BERT, RoBERTa) embeddings built using LLVM code to train neural +networks to detect vulnerabilities in compiled binaries. word2vec Skip-Gram +models achieved 92% validation accuracy in detecting vulnerabilities, +outperforming word2vec Continuous Bag of Words (CBOW), BERT, and RoBERTa. This +suggests that complex contextual embeddings may not provide advantages over +simpler word2vec models for this task when a limited number (e.g. 48K) of data +samples are used to train the bidirectional transformer-based models. The +comparative results provide novel insights into selecting optimal embeddings +for learning compiler-independent semantic code representations to advance +machine learning detection of vulnerabilities in compiled binaries. + +
+
+ comment: Updated with improvements
+
+
+
+
+
+ + ♻ ☆ Improving Antibody Design with Force-Guided Sampling in Diffusion Models + + +
+ Antibodies, crucial for immune defense, primarily rely on +complementarity-determining regions (CDRs) to bind and neutralize antigens, +such as viruses. The design of these CDRs determines the antibody's affinity +and specificity towards its target. Generative models, particularly denoising +diffusion probabilistic models (DDPMs), have shown potential to advance the +structure-based design of CDR regions. However, only a limited dataset of bound +antibody-antigen structures is available, and generalization to +out-of-distribution interfaces remains a challenge. Physics based force-fields, +which approximate atomic interactions, offer a coarse but universal source of +information to better mold designs to target interfaces. Integrating this +foundational information into diffusion models is, therefore, highly desirable. +Here, we propose a novel approach to enhance the sampling process of diffusion +models by integrating force field energy-based feedback. Our model, DiffForce, +employs forces to guide the diffusion sampling process, effectively blending +the two distributions. Through extensive experiments, we demonstrate that our +method guides the model to sample CDRs with lower energy, enhancing both the +structure and sequence of the generated antibodies. + +
+
+
+
+
+ + ♻ ☆ Cross-Input Certified Training for Universal Perturbations ECCV '24 + + +
+ Existing work in trustworthy machine learning primarily focuses on +single-input adversarial perturbations. In many real-world attack scenarios, +input-agnostic adversarial attacks, e.g. universal adversarial perturbations +(UAPs), are much more feasible. Current certified training methods train models +robust to single-input perturbations but achieve suboptimal clean and UAP +accuracy, thereby limiting their applicability in practical applications. We +propose a novel method, CITRUS, for certified training of networks robust +against UAP attackers. We show in an extensive evaluation across different +datasets, architectures, and perturbation magnitudes that our method +outperforms traditional certified training methods on standard accuracy (up to +10.3\%) and achieves SOTA performance on the more practical certified UAP +accuracy metric. + +
+
+ comment: 23 pages, 6 figures, ECCV '24 +
+
+
+
+
+ + ♻ ☆ Instruct-SkillMix: A Powerful Pipeline for LLM Instruction Tuning + + +
+ We introduce Instruct-SkillMix, an automated approach for creating diverse, +high quality SFT data. The Instruct-SkillMix pipeline involves two stages, each +leveraging an existing powerful LLM: (1) Skill extraction: uses the LLM to +extract core "skills" for instruction-following, either from existing datasets, +or by directly prompting the model; (2) Data generation: uses the powerful LLM +to generate (instruction, response) data that exhibit a randomly chosen pair of +these skills. Here, the use of random skill combinations promotes diversity and +difficulty. + Vanilla SFT (i.e., no PPO, DPO, or RL methods) on data generated from +Instruct-SkillMix leads to strong gains on instruction following benchmarks +such as AlpacaEval 2.0, MT-Bench, and WildBench. With just $4$K examples, +LLaMA-3-8B-Base achieves 42.76% length-controlled win rate on AlpacaEval 2.0. +To our knowledge, this achieves state-of-the-art performance among all models +that have only undergone SFT (no RL methods) and competes with proprietary +models such as Claude 3 Opus and LLaMA-3.1-405B-Instruct. + Ablation studies also suggest plausible reasons for why creating open +instruction-tuning datasets via naive crowd-sourcing has proved difficult. +Introducing low quality answers ("shirkers") in $20\%$ of Instruct-SkillMix +examples causes performance to plummet, sometimes catastrophically. + The Instruct-SkillMix pipeline is flexible and is adaptable to other +settings. + +
+
+
+
+
+ + ♻ ☆ Enhancing Accuracy in Deep Learning Using Random Matrix Theory + + +
+ We explore the applications of random matrix theory (RMT) in the training of
+deep neural networks (DNNs), focusing on layer pruning, that is, reducing the
+number of DNN parameters (weights). Our numerical results show that this
+pruning leads to a drastic reduction of parameters while not reducing the
+accuracy of DNNs and CNNs. Moreover, pruning fully connected DNNs actually
+increases the accuracy and decreases the variance for random initializations.
+Our numerics indicate that this enhancement in accuracy is due to the
+simplification of the loss landscape. We next provide a rigorous mathematical
+underpinning of these numerical results by proving the RMT-based Pruning
+Theorem. Our results offer valuable insights into the practical application of
+RMT for the creation of more efficient and accurate deep-learning models.
+
+
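+ A generic sketch in the spirit of RMT-based pruning (the threshold rule below
+is our own illustrative choice, not the criterion from the RMT-based Pruning
+Theorem): singular values of a weight matrix that fall below the
+Marchenko-Pastur bulk edge expected of a pure-noise matrix of the same shape
+are discarded.
+
+import numpy as np
+
+def mp_prune(W, sigma=None):
+    n, m = W.shape
+    U, s, Vt = np.linalg.svd(W, full_matrices=False)
+    if sigma is None:
+        sigma = np.median(s) / np.sqrt(max(n, m))  # crude noise-scale estimate
+    edge = sigma * (np.sqrt(n) + np.sqrt(m))       # MP bulk edge for singular values
+    keep = s > edge
+    return (U[:, keep] * s[keep]) @ Vt[keep], int(keep.sum())
+
+W = np.random.default_rng(0).normal(0, 0.05, (256, 128))
+W[:, :4] += 1.0                                    # inject a low-rank signal
+W_pruned, kept_rank = mp_prune(W)
+print(kept_rank, W.shape, W_pruned.shape)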
+
+
+
+
+ + ♻ ☆ Mixture of Experts with Mixture of Precisions for Tuning Quality of + Service + + +
+ The increasing demand for deploying large Mixture-of-Experts (MoE) models in
+resource-constrained environments necessitates efficient approaches to address
+the challenges of their high memory and computational requirements. Moreover,
+given that tasks come with different user-defined constraints and the available
+resources change over time in multi-tenant environments, it is necessary to
+design an approach that provides a flexible configuration space. This paper
+presents an adaptive serving approach for the efficient deployment of MoE
+models, capitalizing on partial quantization of the experts. By dynamically
+determining the number of quantized experts and their distribution across CPU
+and GPU, our approach explores the Pareto frontier and offers a fine-grained
+range of configurations for tuning throughput and model quality. Our evaluation
+on an NVIDIA A100 GPU using a Mixtral 8x7B MoE model for three language
+modelling benchmarks demonstrates that the throughput of token generation can
+be adjusted from 0.63 to 13.00 tokens per second. This enhancement comes with a
+marginal perplexity increase of 3.81 to 4.00, 13.59 to 14.17, and 7.24 to 7.40
+for the WikiText2, PTB, and C4 datasets, respectively, under maximum
+quantization. These results highlight the practical applicability of our
+approach in dynamic and accuracy-sensitive applications where both memory usage
+and output quality are important.
+
+
+
+
+
+
+ + ♻ ☆ Deep Convolutional Autoencoder for Assessment of Drive-Cycle Anomalies + in Connected Vehicle Sensor Data SC + + +
+ This work investigates a practical and novel method for automated +unsupervised fault detection in vehicles using a fully convolutional +autoencoder. The results demonstrate the algorithm we developed can detect +anomalies which correspond to powertrain faults by learning patterns in the +multivariate time-series data of hybrid-electric vehicle powertrain sensors. +Data was collected by engineers at Ford Motor Company from numerous sensors +over several drive cycle variations. This study provides evidence of the +anomaly detecting capability of our trained autoencoder and investigates the +suitability of our autoencoder relative to other unsupervised methods for +automatic fault detection in this data set. Preliminary results of testing the +autoencoder on the powertrain sensor data indicate the data reconstruction +approach availed by the autoencoder is a robust technique for identifying the +abnormal sequences in the multivariate series. These results support that +irregularities in hybrid-electric vehicles' powertrains are conveyed via sensor +signals in the embedded electronic communication system, and therefore can be +identified mechanistically with a trained algorithm. Additional unsupervised +methods are tested and show the autoencoder performs better at fault detection +than outlier detectors and other novel deep learning techniques. + +
+
+ comment: SSCI2022, 7 pages, 3 Tables, 3 Figures +
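+ A generic sketch of this class of detector (channel counts, kernel sizes, and
+window length are illustrative; this is not the model trained on the Ford
+data): a 1D convolutional autoencoder over multivariate sensor windows whose
+reconstruction error serves as the anomaly score.
+
+import torch
+import torch.nn as nn
+
+class ConvAE(nn.Module):
+    def __init__(self, n_sensors):
+        super().__init__()
+        self.encoder = nn.Sequential(
+            nn.Conv1d(n_sensors, 32, 5, stride=2, padding=2), nn.ReLU(),
+            nn.Conv1d(32, 16, 5, stride=2, padding=2), nn.ReLU())
+        self.decoder = nn.Sequential(
+            nn.ConvTranspose1d(16, 32, 5, stride=2, padding=2, output_padding=1),
+            nn.ReLU(),
+            nn.ConvTranspose1d(32, n_sensors, 5, stride=2, padding=2,
+                               output_padding=1))
+
+    def forward(self, x):            # x: (batch, n_sensors, window_len)
+        return self.decoder(self.encoder(x))
+
+def anomaly_score(model, x):
+    with torch.no_grad():
+        return ((model(x) - x) ** 2).mean(dim=(1, 2))  # per-window reconstruction MSE
+
+model = ConvAE(n_sensors=8)
+scores = anomaly_score(model, torch.randn(4, 8, 128))  # flag windows with high scores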
+
+
+
+
+ + ♻ ☆ Kernel-U-Net: Multivariate Time Series Forecasting using Custom Kernels + + +
+ The time series forecasting task predicts future trends based on historical
+information. Transformer-based U-Net architectures, despite their success in
+medical image segmentation, have limitations in both expressiveness and
+computation efficiency in time series forecasting, as evidenced by YFormer. To
+tackle these challenges, we introduce Kernel-U-Net, a flexible and
+kernel-customizable U-shape neural network architecture. The Kernel-U-Net
+encoder compresses the input series into latent vectors, and its symmetric
+decoder subsequently expands these vectors into output series. Specifically,
+Kernel-U-Net separates the procedure of partitioning input time series into
+patches from kernel manipulation, thereby providing the convenience of
+customizing the executed kernels. Our method offers two primary advantages: 1)
+Flexibility in kernel customization to adapt to specific datasets; and 2)
+Enhanced computational efficiency, with the complexity of the Transformer layer
+reduced to linear. Experiments on seven real-world datasets demonstrate that
+Kernel-U-Net's performance either exceeds or meets that of the existing
+state-of-the-art model in the majority of cases in channel-independent
+settings. The source code for Kernel-U-Net will be made publicly available for
+further research and application.
+
+
+
+
+
+
+ + ♻ ☆ The Influence of Faulty Labels in Data Sets on Human Pose Estimation + + +
+ In this study we provide empirical evidence demonstrating that the quality of +training data impacts model performance in Human Pose Estimation (HPE). +Inaccurate labels in widely used data sets, ranging from minor errors to severe +mislabeling, can negatively influence learning and distort performance metrics. +We perform an in-depth analysis of popular HPE data sets to show the extent and +nature of label inaccuracies. Our findings suggest that accounting for the +impact of faulty labels will facilitate the development of more robust and +accurate HPE models for a variety of real-world applications. We show improved +performance with cleansed data. + +
+
+ comment: 15 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ QEDCartographer: Automating Formal Verification Using Reward-Free + Reinforcement Learning ICSE + + +
+ Formal verification is a promising method for producing reliable software, +but the difficulty of manually writing verification proofs severely limits its +utility in practice. Recent methods have automated some proof synthesis by +guiding a search through the proof space using a theorem prover. Unfortunately, +the theorem prover provides only the crudest estimate of progress, resulting in +effectively undirected search. To address this problem, we create +QEDCartographer, an automated proof-synthesis tool that combines supervised and +reinforcement learning to more effectively explore the proof space. +QEDCartographer incorporates the proofs' branching structure, enabling +reward-free search and overcoming the sparse reward problem inherent to formal +verification. We evaluate QEDCartographer using the CoqGym benchmark of 68.5K +theorems from 124 open-source Coq projects. QEDCartographer fully automatically +proves 21.4% of the test-set theorems. Previous search-based proof-synthesis +tools Tok, Tac, ASTactic, Passport, and Proverbot9001, which rely only on +supervised learning, prove 9.6%, 9.8%, 10.9%, 12.5%, and 19.8%, respectively. +Diva, which combines 62 tools, proves 19.2%. Comparing to the most effective +prior tool, Proverbot9001, QEDCartographer produces 34% shorter proofs 29% +faster, on average over the theorems both tools prove. Together, +QEDCartographer and non-learning-based CoqHammer prove 30.3% of the theorems, +while CoqHammer alone proves 26.6%. Our work demonstrates that reinforcement +learning is a fruitful research direction for improving proof-synthesis tools' +search mechanisms. + +
+
+ comment: Published in the International Conference on Software Engineering + (ICSE) 2025: Alex Sanchez-Stern, Abhishek Varghese, Zhanna Kaufman, Dylan + Zhang, Talia Ringer, and Yuriy Brun, QEDCartographer: Automating Formal + Verification Using Reward-Free Reinforcement Learning, in Proceedings of the + 47th International Conference on Software Engineering (ICSE), 2025 +
+
+
+
+
+ + ♻ ☆ Large Language Models Synergize with Automated Machine Learning + + +
+ Recently, program synthesis driven by large language models (LLMs) has become +increasingly popular. However, program synthesis for machine learning (ML) +tasks still poses significant challenges. This paper explores a novel form of +program synthesis, targeting ML programs, by combining LLMs and automated +machine learning (autoML). Specifically, our goal is to fully automate the +generation and optimization of the code of the entire ML workflow, from data +preparation to modeling and post-processing, utilizing only textual +descriptions of the ML tasks. To manage the length and diversity of ML +programs, we propose to break each ML program into smaller, manageable parts. +Each part is generated separately by the LLM, with careful consideration of +their compatibilities. To ensure compatibilities, we design a testing technique +for ML programs. Unlike traditional program synthesis, which typically relies +on binary evaluations (i.e., correct or incorrect), evaluating ML programs +necessitates more than just binary judgments. Our approach automates the +numerical evaluation and optimization of these programs, selecting the best +candidates through autoML techniques. In experiments across various ML tasks, +our method outperforms existing methods in 10 out of 12 tasks for generating ML +programs. In addition, autoML significantly improves the performance of the +generated ML programs. In experiments, given the textual task description, our +method, Text-to-ML, generates the complete and optimized ML program in a fully +autonomous process. The implementation of our method is available at +https://github.com/JLX0/llm-automl. + +
+
+ comment: published at TMLR +
+
+
+
+
+ + ♻ ☆ The Principle of Uncertain Maximum Entropy + + +
+ The principle of maximum entropy is a well-established technique for choosing +a distribution that matches available information while minimizing bias. It +finds broad use across scientific disciplines and in machine learning. However, +the principle as classically defined is susceptible to noise and error in observations. +This forces real-world practitioners to use relaxed versions of the principle +in an ad hoc way, negatively impacting interpretation. To address this +situation, we present a new principle we call uncertain maximum entropy that +generalizes the classic principle and provides interpretable solutions +irrespective of the observational methods in use. We introduce a convex +approximation and expectation-maximization based algorithm for finding +solutions to our new principle. Finally, we contrast this new technique with +two simpler, generally applicable solutions and show, both theoretically and +experimentally, that our technique provides superior accuracy. + +
+
+
+
+
+ + ♻ ☆ Adaptive Online Learning of Quantum States + + +
+ The problem of efficient quantum state learning, also called shadow +tomography, aims to comprehend an unknown $d$-dimensional quantum state through +POVMs. Yet, these states are rarely static; they evolve due to factors such as +measurements, environmental noise, or inherent Hamiltonian state transitions. +This paper leverages techniques from adaptive online learning to keep pace with +such state changes. + The key metrics considered for learning in these mutable environments are +enhanced notions of regret, specifically adaptive and dynamic regret. We +present adaptive and dynamic regret bounds for online shadow tomography, which +are polynomial in the number of qubits and sublinear in the number of +measurements. To support our theoretical findings, we include numerical +experiments that validate our proposed models. + +
+
+ comment: 28 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ LoQT: Low-Rank Adapters for Quantized Pre-Training + + +
+ Training of large neural networks requires significant computational +resources. Despite advances using low-rank adapters and quantization, +pretraining of models such as LLMs on consumer hardware has not been possible +without model sharding, offloading during training, or per-layer gradient +updates. To address these limitations, we propose LoQT, a method for +efficiently training quantized models. LoQT uses gradient-based tensor +factorization to initialize low-rank trainable weight matrices that are +periodically merged into quantized full-rank weight matrices. Our approach is +suitable for both pretraining and fine-tuning of models, which we demonstrate +experimentally for language modeling and downstream task adaptation. We find +that LoQT enables efficient training of models up to 7B parameters on a +consumer-grade 24GB GPU. We also demonstrate the feasibility of training a 13B +parameter model using per-layer gradient updates on the same hardware. + +
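A hedged sketch of the training loop suggested by this abstract: a frozen, quantized full-rank weight plus trainable low-rank factors that are periodically merged back into the base weight and re-quantized. The int8 quantizer, merge period, and LoRA-style re-initialization are illustrative assumptions; the paper's gradient-based factorization for initialization and its downcasting operator are not reproduced here.

```python
# Sketch: train low-rank A, B on top of a frozen quantized weight; periodically
# merge A @ B into the weight, re-quantize, and restart the low-rank correction.
import torch

def quantize_int8(w):
    scale = w.abs().max() / 127.0 + 1e-12
    q = torch.clamp((w / scale).round(), -127, 127).to(torch.int8)
    return q, scale

def dequantize(q, scale):
    return q.float() * scale

d_out, d_in, rank = 64, 64, 4
base = 0.02 * torch.randn(d_out, d_in)
q, scale = quantize_int8(base)                      # frozen, quantized full-rank weight
A = torch.nn.Parameter(0.01 * torch.randn(d_out, rank))
B = torch.nn.Parameter(torch.zeros(rank, d_in))     # LoRA-style init: A @ B starts at zero
opt = torch.optim.Adam([A, B], lr=1e-3)

x = torch.randn(256, d_in)
target = torch.randn(256, d_out)

for step in range(1, 201):
    w_eff = dequantize(q, scale) + A @ B            # only A and B receive gradients
    loss = ((x @ w_eff.T - target) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()

    if step % 50 == 0:                              # periodic merge and re-quantization
        with torch.no_grad():
            q, scale = quantize_int8(dequantize(q, scale) + A @ B)
            A.copy_(0.01 * torch.randn(d_out, rank))
            B.zero_()
```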
+
+
+
+
+ + ♻ ☆ Online Residual Learning from Offline Experts for Pedestrian Tracking + + +
+ In this paper, we consider the problem of predicting unknown targets from +data. We propose Online Residual Learning (ORL), a method that combines online +adaptation with offline-trained predictions. At a lower level, we employ +multiple offline predictions generated before or at the beginning of the +prediction horizon. We augment every offline prediction by learning their +respective residual error concerning the true target state online, using the +recursive least squares algorithm. At a higher level, we treat the augmented +lower-level predictors as experts, adopting the Prediction with Expert Advice +framework. We utilize an adaptive softmax weighting scheme to form an aggregate +prediction and provide guarantees for ORL in terms of regret. We employ ORL to +boost performance in the setting of online pedestrian trajectory prediction. +Based on data from the Stanford Drone Dataset, we show that ORL can demonstrate +best-of-both-worlds performance. + +
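A minimal sketch of the two-level scheme described above, under simplifying assumptions: scalar targets, a bias-plus-prediction feature vector for each expert's residual model, and a softmax-of-cumulative-loss weighting in place of the paper's exact adaptive scheme.

```python
# Sketch: augment each offline predictor with an online RLS model of its residual,
# then aggregate the augmented predictors with exponentiated-loss weights.
import numpy as np

rng = np.random.default_rng(0)
T, n_experts = 200, 3
truth = np.cumsum(rng.normal(size=T))                   # unknown target trajectory
offline = truth[None, :] + np.array([[2.0], [-1.0], [0.5]]) \
          + rng.normal(scale=0.3, size=(n_experts, T))  # biased offline predictions

eta, lam = 2.0, 0.99                                    # softmax rate, RLS forgetting factor
weights = np.ones(n_experts) / n_experts
theta = np.zeros((n_experts, 2))                        # residual model per expert
P = np.stack([np.eye(2) * 100.0 for _ in range(n_experts)])
losses = np.zeros(n_experts)
preds = np.zeros(T)

for t in range(T):
    phi = np.stack([np.array([1.0, offline[i, t]]) for i in range(n_experts)])
    corrected = offline[:, t] + np.einsum('ij,ij->i', theta, phi)
    preds[t] = weights @ corrected                      # aggregate prediction
    y = truth[t]                                        # observe the true target

    # higher level: exponential weights over the augmented experts
    losses += (corrected - y) ** 2
    weights = np.exp(-eta * losses / (t + 1))
    weights /= weights.sum()

    # lower level: recursive least squares update of each expert's residual model
    for i in range(n_experts):
        r = y - offline[i, t]                           # residual to be learned online
        k = P[i] @ phi[i] / (lam + phi[i] @ P[i] @ phi[i])
        theta[i] += k * (r - theta[i] @ phi[i])
        P[i] = (P[i] - np.outer(k, phi[i] @ P[i])) / lam

print("aggregate MSE:", np.mean((preds - truth) ** 2))
```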
+
+ comment: Accepted to CDC 2024, v2: fixed certain typos +
+
+
+
+
+ + ♻ ☆ PINN surrogate of Li-ion battery models for parameter inference. Part + II: Regularization and application of the pseudo-2D model + + +
+ Bayesian parameter inference is useful to improve Li-ion battery diagnostics +and can help formulate battery aging models. However, it is computationally +intensive and cannot be easily repeated for multiple cycles, multiple operating +conditions, or multiple replicate cells. To reduce the computational cost of +Bayesian calibration, numerical solvers for physics-based models can be +replaced with faster surrogates. A physics-informed neural network (PINN) is +developed as a surrogate for the pseudo-2D (P2D) battery model calibration. For +the P2D surrogate, additional training regularization was needed as compared to +the PINN single-particle model (SPM) developed in Part I. Both the PINN SPM and +P2D surrogate models are exercised for parameter inference and compared to data +obtained from a direct numerical solution of the governing equations. A +parameter inference study highlights the ability to use these PINNs to +calibrate scaling parameters for the cathode Li diffusion and the anode +exchange current density. By realizing computational speed-ups of 2250x for the +P2D model, as compared to using standard integration methods, the PINN +surrogates enable rapid state-of-health diagnostics. In the low-data +availability scenario, the testing error was estimated at 2 mV for the SPM +surrogate and 10 mV for the P2D surrogate, which could be mitigated with +additional data. + +
+
+
+
+
+ + ♻ ☆ AttentionX: Exploiting Consensus Discrepancy In Attention from A + Distributed Optimization Perspective + + +
+ In this paper, we extend the standard Attention in the transformer by exploiting +the consensus discrepancy from a distributed optimization perspective, referred +to as AttentionX. It is noted that the primal-dual method of multipliers (PDMM) +\cite{Zhang16PDMM} is designed to iteratively solve a broad class of +distributed optimization problems over a peer-to-peer (P2P) network, where +neighbouring nodes gradually reach consensus as specified by predefined linear +edge-constraints in the optimization process. In particular, at each iteration +of PDMM, each node in a network first performs information-gathering from +neighbours and then performs local information-fusion. From a high-level point +of view, the $KQ$-softmax-based weighted summation of $V$-representations in +Attention corresponds to information-gathering from neighbours, while the +feature-processing via the feed-forward network (FFN) in the transformer +corresponds to local information fusion. PDMM exploits the Lagrangian +multipliers to capture the historical consensus discrepancy in the form of +residual errors of the linear edge-constraints, which plays a crucial role in +the algorithm's convergence. Inspired by PDMM, we propose AttentionX to +incorporate the consensus discrepancy in the output update-expression of the +standard Attention. The consensus discrepancy in AttentionX refers to the +difference between the weighted summation of $V$-representations and the scaled +$V$-representations themselves. Experiments on ViT and nanoGPT show promising +performance. + +
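A hedged sketch of the output update suggested by this abstract: standard scaled dot-product attention plus a consensus-discrepancy term, the difference between the softmax-weighted summation of V and a scaled V. The mixing coefficients alpha and beta are illustrative assumptions.

```python
# Sketch: standard attention output augmented with a PDMM-inspired consensus
# discrepancy term (weighted summation of V minus scaled V).
import torch
import torch.nn.functional as F

def attention_x(Q, K, V, alpha=1.0, beta=0.5):
    d = Q.shape[-1]
    attn = F.softmax(Q @ K.transpose(-2, -1) / d ** 0.5, dim=-1)
    agg = attn @ V                      # information-gathering from "neighbours"
    discrepancy = agg - beta * V        # consensus discrepancy
    return agg + alpha * discrepancy    # output update incorporating the discrepancy

Q = torch.randn(2, 10, 16)
K = torch.randn(2, 10, 16)
V = torch.randn(2, 10, 16)
print(attention_x(Q, K, V).shape)       # torch.Size([2, 10, 16])
```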
+
+
+
+
+ + ♻ ☆ Categorical data clustering: 25 years beyond K-modes + + +
+ The clustering of categorical data is a common and important task in computer +science, offering profound implications across a spectrum of applications. +Unlike purely numerical data, categorical data often lack inherent ordering as +in nominal data, or have varying levels of order as in ordinal data, thus +requiring specialized methodologies for efficient organization and analysis. +This review provides a comprehensive synthesis of categorical data clustering +in the past twenty-five years, starting from the introduction of K-modes. It +elucidates the pivotal role of categorical data clustering in diverse fields +such as health sciences, natural sciences, social sciences, education, +engineering and economics. Practical comparisons are conducted for algorithms +having public implementations, highlighting distinguishing clustering +methodologies and revealing the performance of recent algorithms on several +benchmark categorical datasets. Finally, challenges and opportunities in the +field are discussed. + +
+
+
+
+
+ + ♻ ☆ A Review of Graph Neural Networks in Epidemic Modeling + + +
+ Since the onset of the COVID-19 pandemic, there has been a growing interest +in studying epidemiological models. Traditional mechanistic models +mathematically describe the transmission mechanisms of infectious diseases. +However, they often suffer from limitations of oversimplified or fixed +assumptions, which could cause sub-optimal predictive power and inefficiency in +capturing complex relation information. Consequently, Graph Neural +Networks(GNNs) have emerged as a progressively popular tool in epidemic +research. In this paper, we endeavor to furnish a comprehensive review of GNNs +in epidemic tasks and highlight potential future directions. To accomplish this +objective, we introduce hierarchical taxonomies for both epidemic tasks and +methodologies, offering a trajectory of development within this domain. For +epidemic tasks, we establish a taxonomy akin to those typically employed within +the epidemic domain. For methodology, we categorize existing work into Neural +Models and Hybrid Models. Following this, we perform an exhaustive and +systematic examination of the methodologies, encompassing both the tasks and +their technical details. Furthermore, we discuss the limitations of existing +methods from diverse perspectives and systematically propose future research +directions. This survey aims to bridge literature gaps and promote the +progression of this promising field, with a list of relevant papers at +https://github.com/Emory-Melody/awesome-epidemic-modeling-papers. We hope that +it will facilitate synergies between the communities of GNNs and epidemiology, +and contribute to their collective progress. + +
+
+
+
+
+ + ♻ ☆ EpiLearn: A Python Library for Machine Learning in Epidemic Modeling + + +
+ EpiLearn is a Python toolkit developed for modeling, simulating, and +analyzing epidemic data. Although there exist several packages that also deal +with epidemic modeling, they are often restricted to mechanistic models or +traditional statistical tools. As machine learning continues to shape the +world, the gap between these packages and the latest models has become larger. +To bridge the gap and inspire innovative research in epidemic modeling, +EpiLearn not only provides support for evaluating epidemic models based on +machine learning, but also incorporates comprehensive tools for analyzing +epidemic data, such as simulation, visualization, transformations, etc. For the +convenience of both epidemiologists and data scientists, we provide a unified +framework for training and evaluation of epidemic models on two tasks: +Forecasting and Source Detection. To facilitate the development of new models, +EpiLearn follows a modular design, making it flexible and easy to use. In +addition, an interactive web application is also developed to visualize the +real-world or simulated epidemic data. Our package is available at +https://github.com/Emory-Melody/EpiLearn. + +
+
+
+
+
+ + ♻ ☆ On the Computational Entanglement of Distant Features in Adversarial + Machine Learning + + +
+ In this research, we introduce 'computational entanglement', a phenomenon in +overparameterized neural networks where the model exploits noise patterns in +ways conceptually linked to the effects of length contraction. More specifically, +our findings demonstrate that overparameterized feedforward linear networks can +easily achieve zero loss by fitting random noise, even with test samples that +were never encountered during training. This phenomenon accompanies length +contraction, where trained and test samples converge at the same point within a +spacetime diagram. Unlike most models that rely on supervised learning, our +method operates unsupervised, without the need for labels or gradient-based +optimization. Additionally, we show a novel application of computational +entanglement: transforming adversarial examples, highly non-robust inputs whose +perturbations are imperceptible to human observers, into outputs that are recognizable and robust. +This challenges conventional views on non-robust features in adversarial +example generation, providing new insights into the underlying mechanisms. Our +results emphasize the importance of computational entanglement for enhancing +model robustness and understanding neural networks in adversarial contexts. + +
+
+ comment: abstract updated +
+
+
+
+
+ + ♻ ☆ R-SFLLM: Jamming Resilient Framework for Split Federated Learning with + Large Language Models + + +
+ Split federated learning (SFL) is a compute-efficient paradigm in distributed +machine learning (ML), where components of large ML models are outsourced to +remote servers. A significant challenge in SFL, particularly when deployed over +wireless channels, is the susceptibility of transmitted model parameters to +adversarial jamming that could jeopardize the learning process. This is +particularly pronounced for word embedding parameters in large language models +(LLMs), which are crucial for language understanding. In this paper, rigorous +insights are provided into the influence of jamming LLM word embeddings in SFL +by deriving an expression for the ML training loss divergence and showing that +it is upper-bounded by the mean squared error (MSE). Based on this analysis, a +physical layer framework is developed for resilient SFL with LLMs (R-SFLLM) +over wireless networks. R-SFLLM leverages wireless sensing data to gather +information on the jamming directions-of-arrival (DoAs) for the purpose of +devising a novel, sensing-assisted anti-jamming strategy while jointly +optimizing beamforming, user scheduling, and resource allocation. Extensive +experiments using BERT and RoBERTa models demonstrate R-SFLLM's effectiveness, +achieving close-to-baseline performance across various natural language +processing (NLP) tasks and datasets. The proposed methodology further +introduces an adversarial training component, where controlled noise exposure +significantly enhances the LLM's resilience to perturbed parameters during +training. The results show that more noise-sensitive models, such as RoBERTa, +benefit from this feature, especially when resource allocation is unfair. It is +also shown that worst-case jamming in particular translates into worst-case +model outcomes, thereby necessitating the need for jamming-resilient SFL +protocols. + +
+
+
+
+
+ + ♻ ☆ Double Machine Learning for Static Panel Models with Fixed Effects + + +
+ Recent advances in causal inference have seen the development of methods +which make use of the predictive power of machine learning algorithms. In this +paper, we use these algorithms to approximate high-dimensional and non-linear +nuisance functions of the confounders and double machine learning (DML) to make +inferences about the effects of policy interventions from panel data. We +propose new estimators by extending correlated random effects, within-group and +first-difference estimation for linear models to an extension of Robinson +(1988)'s partially linear regression model to static panel data models with +individual fixed effects and unspecified non-linear confounding effects. We +provide an illustrative example of DML for observational panel data showing the +impact of the introduction of the minimum wage on voting behaviour in the UK. + +
+
+
+
+
+ + ♻ ☆ Rolling Diffusion Models + + +
+ Diffusion models have recently been increasingly applied to temporal data +such as video, fluid mechanics simulations, or climate data. These methods +generally treat subsequent frames equally regarding the amount of noise in the +diffusion process. This paper explores Rolling Diffusion: a new approach that +uses a sliding window denoising process. It ensures that the diffusion process +progressively corrupts through time by assigning more noise to frames that +appear later in a sequence, reflecting greater uncertainty about the future as +the generation process unfolds. Empirically, we show that when the temporal +dynamics are complex, Rolling Diffusion is superior to standard diffusion. In +particular, this result is demonstrated in a video prediction task using the +Kinetics-600 video dataset and in a chaotic fluid dynamics forecasting +experiment. + +
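A minimal sketch of a sliding-window noise assignment consistent with this description: frames later in the window receive higher noise levels, so corruption progresses through time. The linear ramp and mixing formula are illustrative choices, not the paper's exact schedule.

```python
# Sketch: per-frame noise levels in a rolling window, and a simple corruption step.
import numpy as np

def rolling_noise_levels(window: int, t: float) -> np.ndarray:
    """Noise level in [0, 1] for each frame position k at denoising progress t in [0, 1];
    later positions stay noisier longer."""
    k = np.arange(window)
    local = t + k / window              # shift the schedule by frame position
    return np.clip(local, 0.0, 1.0)

def corrupt(frames: np.ndarray, t: float, rng=np.random.default_rng(0)):
    """frames: (window, H, W). Mix each frame with Gaussian noise per its level."""
    levels = rolling_noise_levels(frames.shape[0], t)
    noise = rng.normal(size=frames.shape)
    lv = levels[:, None, None]
    return np.sqrt(1.0 - lv) * frames + np.sqrt(lv) * noise

frames = np.zeros((4, 8, 8))
noisy = corrupt(frames, t=0.25)
print(rolling_noise_levels(4, 0.25))    # [0.25 0.5  0.75 1.  ]
```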
+
+
+
+
+ + ♻ ☆ Explaining Learned Reward Functions with Counterfactual Trajectories + + +
+ Learning rewards from human behaviour or feedback is a promising approach to +aligning AI systems with human values but fails to consistently extract correct +reward functions. Interpretability tools could enable users to understand and +evaluate possible flaws in learned reward functions. We propose Counterfactual +Trajectory Explanations (CTEs) to interpret reward functions in reinforcement +learning by contrasting an original with a counterfactual partial trajectory +and the rewards they each receive. We derive six quality criteria for CTEs and +propose a novel Monte-Carlo-based algorithm for generating CTEs that optimises +these quality criteria. Finally, we measure how informative the generated +explanations are to a proxy-human model by training it on CTEs. CTEs are +demonstrably informative for the proxy-human model, increasing the similarity +between its predictions and the reward function on unseen trajectories. +Further, it learns to accurately judge differences in rewards between +trajectories and generalises to out-of-distribution examples. Although CTEs do +not lead to a perfect understanding of the reward, our method, and more +generally the adaptation of XAI methods, are presented as a fruitful approach +for interpreting learned reward functions. + +
+
+
+
+
+ + ♻ ☆ Clustering Time-Evolving Networks Using the Spatio-Temporal Graph + Laplacian + + +
+ Time-evolving graphs arise frequently when modeling complex dynamical systems +such as social networks, traffic flow, and biological processes. Developing +techniques to identify and analyze communities in these time-varying graph +structures is an important challenge. In this work, we generalize existing +spectral clustering algorithms from static to dynamic graphs using canonical +correlation analysis (CCA) to capture the temporal evolution of clusters. Based +on this extended canonical correlation framework, we define the spatio-temporal +graph Laplacian and investigate its spectral properties. We connect these +concepts to dynamical systems theory via transfer operators, and illustrate the +advantages of our method on benchmark graphs by comparison with existing +methods. We show that the spatio-temporal graph Laplacian allows for a clear +interpretation of cluster structure evolution over time for directed and +undirected graphs. + +
+
+
+
+
+ + ♻ ☆ WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild + + +
+ The increasing availability of real-world conversation data offers exciting +opportunities for researchers to study user-chatbot interactions. However, the +sheer volume of this data makes manually examining individual conversations +impractical. To overcome this challenge, we introduce WildVis, an interactive +tool that enables fast, versatile, and large-scale conversation analysis. +WildVis provides search and visualization capabilities in the text and +embedding spaces based on a list of criteria. To manage million-scale datasets, +we implemented optimizations including search index construction, embedding +precomputation and compression, and caching to ensure responsive user +interactions within seconds. We demonstrate WildVis' utility through three case +studies: facilitating chatbot misuse research, visualizing and comparing topic +distributions across datasets, and characterizing user-specific conversation +patterns. WildVis is open-source and designed to be extendable, supporting +additional datasets and customized search and visualization functionalities. + +
+
+
+
+
+ + ♻ ☆ On the Expressivity of Recurrent Neural Cascades with Identity KR 2024 + + +
+ Recurrent Neural Cascades (RNC) are the class of recurrent neural networks +with no cyclic dependencies among recurrent neurons. Their subclass RNC+ with +positive recurrent weights has been shown to be closely connected to the +star-free regular languages, which are the expressivity of many +well-established temporal logics. The existing expressivity results show that +the regular languages captured by RNC+ are the star-free ones, and they leave +open the possibility that RNC+ may capture languages beyond regular. We exclude +this possibility for languages that include an identity element, i.e., an input +that can occur an arbitrary number of times without affecting the output. +Namely, in the presence of an identity element, we show that the languages +captured by RNC+ are exactly the star-free regular languages. Identity elements +are ubiquitous in temporal patterns, and hence our results apply to a large +number of applications. The implications of our results go beyond expressivity. +At their core, we establish a close structural correspondence between RNC+ and +semiautomata cascades, showing that every neuron can be equivalently captured +by a three-state semiautomaton. A notable consequence of this result is that +RNC+ are no more succinct than cascades of three-state semiautomata. + +
+
+ comment: Full version with appendix of a paper with the same title that will + appear in the proceedings of KR 2024 +
+
+
+
+
+ + ♻ ☆ How adversarial attacks can disrupt seemingly stable accurate + classifiers + + +
+ Adversarial attacks dramatically change the output of an otherwise accurate +learning system using a seemingly inconsequential modification to a piece of +input data. Paradoxically, empirical evidence indicates that even systems which +are robust to large random perturbations of the input data remain susceptible +to small, easily constructed, adversarial perturbations of their inputs. Here, +we show that this may be seen as a fundamental feature of classifiers working +with high dimensional input data. We introduce a simple generic and +generalisable framework for which key behaviours observed in practical systems +arise with high probability -- notably the simultaneous susceptibility of the +(otherwise accurate) model to easily constructed adversarial attacks, and +robustness to random perturbations of the input data. We confirm that the same +phenomena are directly observed in practical neural networks trained on +standard image classification problems, where even large additive random noise +fails to trigger the adversarial instability of the network. A surprising +takeaway is that even small margins separating a classifier's decision surface +from training and testing data can hide adversarial susceptibility from being +detected using randomly sampled perturbations. Counterintuitively, using +additive noise during training or testing is therefore inefficient for +eradicating or detecting adversarial examples, and more demanding adversarial +training is required. + +
+
+ comment: 11 pages, 8 figures, additional supplementary materials +
+
+
+
+
+ + ♻ ☆ Intrinsic Bayesian Cramér-Rao Bound with an Application to Covariance + Matrix Estimation + + +
+ This paper presents a new performance bound for estimation problems where the +parameter to estimate lies in a Riemannian manifold (a smooth manifold endowed +with a Riemannian metric) and follows a given prior distribution. In this +setup, the chosen Riemannian metric induces a geometry for the parameter +manifold, as well as an intrinsic notion of the estimation error measure. +Performance bounds for such an error measure were previously obtained in the +non-Bayesian case (when the unknown parameter is assumed to be deterministic), and are +referred to as the \textit{intrinsic} Cram\'er-Rao bound. The presented result then +appears either as: \textit{a}) an extension of the intrinsic Cram\'er-Rao bound +to the Bayesian estimation framework; \textit{b}) a generalization of the +Van-Trees inequality (Bayesian Cram\'er-Rao bound) that accounts for the +aforementioned geometric structures. In a second part, we leverage this +formalism to study the problem of covariance matrix estimation when the data +follow a Gaussian distribution whose covariance matrix is drawn from an +inverse Wishart distribution. Performance bounds for this problem are obtained +for both the mean squared error (Euclidean metric) and the natural Riemannian +distance for Hermitian positive definite matrices (affine invariant metric). +Numerical simulations illustrate that assessing the error with the affine +invariant metric is revealing of interesting properties of the maximum a +posteriori and minimum mean square error estimators, which are not observed when +using the Euclidean metric. + +
+
+
+
+
+ + ♻ ☆ Deep Oscillatory Neural Network + + +
+ We propose a novel, brain-inspired deep neural network model known as the +Deep Oscillatory Neural Network (DONN). Deep neural networks like the Recurrent +Neural Networks indeed possess sequence processing capabilities but the +internal states of the network are not designed to exhibit brain-like +oscillatory activity. With this motivation, the DONN is designed to have +oscillatory internal dynamics. Neurons of the DONN are either nonlinear neural +oscillators or traditional neurons with sigmoidal or ReLU activation. The +neural oscillator used in the model is the Hopf oscillator, with the dynamics +described in the complex domain. Input can be presented to the neural +oscillator in three possible modes. The sigmoid and ReLU neurons also use +complex-valued extensions. All the weight stages are also complex-valued. +Training follows the general principle of weight change by minimizing the +output error and therefore has an overall resemblance to complex +backpropagation. A generalization of DONN to convolutional networks known as +the Oscillatory Convolutional Neural Network is also proposed. The two proposed +oscillatory networks are applied to a variety of benchmark problems in signal +and image/video processing. The performance of the proposed models is either +comparable or superior to published results on the same data sets. + +
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning for Traveling Purchaser Problems + + +
+ The traveling purchaser problem (TPP) is an important combinatorial +optimization problem with broad applications. Due to the coupling between +routing and purchasing, existing works on TPPs commonly address route +construction and purchase planning simultaneously, which, however, leads to +exact methods with high computational cost and heuristics with sophisticated +design but limited performance. In sharp contrast, we propose a novel approach +based on deep reinforcement learning (DRL), which addresses route construction +and purchase planning separately, while evaluating and optimizing the solution +from a global perspective. The key components of our approach include a +bipartite graph representation for TPPs to capture the market-product +relations, and a policy network that extracts information from the bipartite +graph and uses it to sequentially construct the route. One significant benefit +of our framework is that we can efficiently construct the route using the +policy network, and once the route is determined, the associated purchasing +plan can be easily derived through linear programming, while, leveraging DRL, +we can train the policy network to optimize the global solution objective. +Furthermore, by introducing a meta-learning strategy, the policy network can be +trained stably on large-sized TPP instances, and generalize well across +instances of varying sizes and distributions, even to much larger instances +that are never seen during training. Experiments on various synthetic TPP +instances and the TPPLIB benchmark demonstrate that our DRL-based approach can +significantly outperform well-established TPP heuristics, reducing the +optimality gap by 40%-90%, and also showing an advantage in runtime, especially +on large-sized instances. + +
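A hedged sketch of the "route fixed, purchasing by linear programming" step described above, using scipy.optimize.linprog on synthetic prices, availabilities, and demands; the hard-coded route stands in for the output of the policy network.

```python
# Sketch: given a route over markets, buy each product at minimum cost subject
# to demand and per-market availability via a linear program.
import numpy as np
from scipy.optimize import linprog

n_markets, n_products = 5, 3
rng = np.random.default_rng(1)
price = rng.uniform(1.0, 5.0, size=(n_markets, n_products))
avail = rng.integers(1, 5, size=(n_markets, n_products)).astype(float)
demand = np.array([3.0, 2.0, 4.0])
route = [0, 2, 4]                               # markets chosen by the (assumed) policy network

# Decision variables: x[m, p] = quantity of product p bought at visited market m.
c = price[route].ravel()
A_ub, b_ub = [], []
for p in range(n_products):                     # -sum_m x[m, p] <= -demand[p]
    row = np.zeros(len(route) * n_products)
    row[p::n_products] = -1.0
    A_ub.append(row); b_ub.append(-demand[p])
bounds = [(0.0, avail[m, p]) for m in route for p in range(n_products)]

res = linprog(c, A_ub=np.array(A_ub), b_ub=np.array(b_ub), bounds=bounds, method="highs")
print("feasible:", res.success,
      "purchase cost:", None if not res.success else round(res.fun, 2))
```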
+
+
+
+
+ + ♻ ☆ Jailbreaking Text-to-Image Models with LLM-Based Agents + + +
+ Recent advancements have significantly improved automated task-solving +capabilities using autonomous agents powered by large language models (LLMs). +However, most LLM-based agents focus on dialogue, programming, or specialized +domains, leaving their potential for addressing generative AI safety tasks +largely unexplored. In this paper, we propose Atlas, an advanced LLM-based +multi-agent framework targeting generative AI models, specifically focusing on +jailbreak attacks against text-to-image (T2I) models with built-in safety +filters. Atlas consists of two agents, namely the mutation agent and the +selection agent, each comprising four key modules: a vision-language model +(VLM) or LLM brain, planning, memory, and tool usage. The mutation agent uses +its VLM brain to determine whether a prompt triggers the T2I model's safety +filter. It then collaborates iteratively with the LLM brain of the selection +agent to generate new candidate jailbreak prompts with the highest potential to +bypass the filter. In addition to multi-agent communication, we leverage +in-context learning (ICL) memory mechanisms and the chain-of-thought (COT) +approach to learn from past successes and failures, thereby enhancing Atlas's +performance. Our evaluation demonstrates that Atlas successfully jailbreaks +several state-of-the-art T2I models equipped with multi-modal safety filters in +a black-box setting. Additionally, Atlas outperforms existing methods in both +query efficiency and the quality of generated images. This work convincingly +demonstrates the successful application of LLM-based agents in studying the +safety vulnerabilities of popular text-to-image generation models. We urge the +community to consider advanced techniques like ours in response to the rapidly +evolving text-to-image generation field. + +
+
+
+
+
+ + ♻ ☆ Reinforcement Unlearning NDSS 2025 + + +
+ Machine unlearning refers to the process of mitigating the influence of +specific training data on machine learning models based on removal requests +from data owners. However, one important area that has been largely overlooked +in the research of unlearning is reinforcement learning. Reinforcement learning +focuses on training an agent to make optimal decisions within an environment to +maximize its cumulative rewards. During the training, the agent tends to +memorize the features of the environment, which raises a significant concern +about privacy. As per data protection regulations, the owner of the environment +holds the right to revoke access to the agent's training data, thus +necessitating the development of a novel and pressing research field, known as +\emph{reinforcement unlearning}. Reinforcement unlearning focuses on revoking +entire environments rather than individual data samples. This unique +characteristic presents three distinct challenges: 1) how to propose unlearning +schemes for environments; 2) how to avoid degrading the agent's performance in +remaining environments; and 3) how to evaluate the effectiveness of unlearning. +To tackle these challenges, we propose two reinforcement unlearning methods. +The first method is based on decremental reinforcement learning, which aims to +erase the agent's previously acquired knowledge gradually. The second method +leverages environment poisoning attacks, which encourage the agent to learn +new, albeit incorrect, knowledge to remove the unlearning environment. +Particularly, to tackle the third challenge, we introduce the concept of +``environment inference attack'' to evaluate the unlearning outcomes. + +
+
+ comment: Accepted by NDSS 2025 +
+
+
+
+
+ + ♻ ☆ Understanding Fairness in Recommender Systems: A Healthcare Perspective + + +
+ Fairness in AI-driven decision-making systems has become a critical concern, +especially when these systems directly affect human lives. This paper explores +the public's comprehension of fairness in healthcare recommendations. We +conducted a survey where participants selected from four fairness metrics -- +Demographic Parity, Equal Accuracy, Equalized Odds, and Positive Predictive +Value -- across different healthcare scenarios to assess their understanding of +these concepts. Our findings reveal that fairness is a complex and often +misunderstood concept, with a generally low level of public understanding +regarding fairness metrics in recommender systems. This study highlights the +need for enhanced information and education on algorithmic fairness to support +informed decision-making in using these systems. Furthermore, the results +suggest that a one-size-fits-all approach to fairness may be insufficient, +pointing to the importance of context-sensitive designs in developing equitable +AI systems. + +
+
+ comment: Accepted to the 18th ACM Conference on Recommender Systems +
+
+
+
+
+ + ♻ ☆ FreeCG: Free the Design Space of Clebsch-Gordan Transform for Machine + Learning Force Fields + + +
+ Machine Learning Force Fields (MLFFs) are of great importance for chemistry, +physics, materials science, and many other related fields. The Clebsch-Gordan +Transform (CG transform) effectively encodes many-body interactions and is thus +an important building block for many models of MLFFs. However, the +permutation-equivariance requirement of MLFFs limits the design space of CG +transform, that is, intensive CG transform has to be conducted for each +neighboring edge and the operations should be performed in the same manner for +all edges. This constraint results in reduced expressiveness of the model while +simultaneously increasing computational demands. To overcome this challenge, we +first implement the CG transform layer on the permutation-invariant abstract +edges generated from real edge information. We show that this approach allows +complete freedom in the design of the layer without compromising the crucial +symmetry. Developing on this free design space, we further propose group CG +transform with sparse path, abstract edges shuffling, and attention enhancer to +form a powerful and efficient CG transform layer. Our method, known as FreeCG, +achieves state-of-the-art (SOTA) results in force prediction for MD17, rMD17, +MD22, and is well extended to property prediction in QM9 datasets with several +improvements greater than 15% and the maximum beyond 20%. The extensive +real-world applications showcase high practicality. FreeCG introduces a novel +paradigm for carrying out efficient and expressive CG transform in future +geometric neural network designs. To demonstrate this, the recent SOTA, +QuinNet, is also enhanced under our paradigm. Code will be publicly available. + +
+
+ comment: 25 pages, 8 tables, 11 figures +
+
+
+
+
+ + ♻ ☆ Local Universal Explainer (LUX) -- a rule-based explainer with factual, + counterfactual and visual explanations + + +
+ Explainable artificial intelligence (XAI) is one of the most intensively +developed areas of AI in recent years. It is also one of the most fragmented, +with multiple methods that focus on different aspects of explanations. This +makes it difficult to obtain the full spectrum of explanations at once in a compact +and consistent way. To address this issue, we present Local Universal Explainer +(LUX), which is a rule-based explainer that can generate factual, +counterfactual and visual explanations. It is based on a modified version of +decision tree algorithms that allows for oblique splits and integration with +feature importance XAI methods such as SHAP. In contrast to other algorithms, it +limits the use of data generation, focusing instead on selecting local concepts in +the form of high-density clusters of real data that have the highest impact on +forming the decision boundary of the explained model, and on generating artificial +samples with a novel SHAP-guided sampling algorithm. We tested our method on real +and synthetic datasets and compared it with state-of-the-art rule-based +explainers such as LORE, EXPLAN and Anchor. Our method outperforms the existing +approaches in terms of simplicity, fidelity, representativeness, and +consistency. + +
+
+
+
+
+ + ♻ ☆ CL4KGE: A Curriculum Learning Method for Knowledge Graph Embedding + + +
+ Knowledge graph embedding (KGE) constitutes a foundational task, directed +towards learning representations for entities and relations within knowledge +graphs (KGs), with the objective of crafting representations comprehensive +enough to approximate the logical and symbolic interconnections among entities. +In this paper, we define a metric Z-counts to measure the difficulty of +training each triple ($<$head entity, relation, tail entity$>$) in KGs with +theoretical analysis. Based on this metric, we propose \textbf{CL4KGE}, an +efficient \textbf{C}urriculum \textbf{L}earning based training strategy for +\textbf{KGE}. This method includes a difficulty measurer and a training +scheduler that aids in the training of KGE models. Our approach possesses the +flexibility to act as a plugin within a wide range of KGE models, with the +added advantage of adaptability to the majority of KGs in existence. The +proposed method has been evaluated on popular KGE models, and the results +demonstrate that it enhances the state-of-the-art methods. The use of Z-counts +as a metric has enabled the identification of challenging triples in KGs, which +helps in devising effective training strategies. + +
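A minimal sketch of the curriculum wrapper described above: a difficulty measurer scores each triple (a random placeholder stands in for the paper's Z-counts) and a scheduler releases progressively harder triples. The linear pacing function is an illustrative assumption.

```python
# Sketch: curriculum batching for KGE training driven by per-triple difficulty scores.
import random

triples = [(f"h{i}", f"r{i % 5}", f"t{i}") for i in range(100)]
random.seed(0)
difficulty = {trp: random.random() for trp in triples}     # stand-in for Z-counts

def curriculum_batches(triples, difficulty, epochs=5, batch_size=16):
    ranked = sorted(triples, key=lambda trp: difficulty[trp])
    for epoch in range(1, epochs + 1):
        frac = epoch / epochs                               # linear pacing function
        available = ranked[: max(batch_size, int(frac * len(ranked)))]
        random.shuffle(available)
        for i in range(0, len(available), batch_size):
            yield epoch, available[i:i + batch_size]

for epoch, batch in curriculum_batches(triples, difficulty):
    pass  # a KGE training step on `batch` would go here
print("last epoch seen:", epoch)
```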
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ ChatQA 2: Bridging the Gap to Proprietary LLMs in Long Context and RAG + Capabilities + + +
+ In this work, we introduce ChatQA 2, a Llama 3.0-based model with a 128K +context window, designed to bridge the gap between open-source LLMs and leading +proprietary models (e.g., GPT-4-Turbo) in long-context understanding and +retrieval-augmented generation (RAG) capabilities. These two capabilities are +essential for LLMs to process large volumes of information that cannot fit into +a single prompt and are complementary to each other, depending on the +downstream tasks and computational budgets. We present a detailed continued +training recipe to extend the context window of Llama3-70B-base from 8K to 128K +tokens, along with a three-stage instruction tuning process to enhance the +model's instruction-following, RAG performance, and long-context understanding +capabilities. Our results demonstrate that the Llama3-ChatQA-2-70B model +outperforms most existing state-of-the-art models, including +GPT-4-Turbo-2024-04-09, Qwen2-72B-Instruct, and Llama3.1-70B-Instruct, on +ultra-long tasks beyond 100K tokens, as well as on the RAG benchmark using only +a 4K context window, showing strong long-context capability across varying +sequence lengths. We further provide extensive comparisons between direct +long-context and RAG solutions using the same state-of-the-art long-context +LLMs. Interestingly, we find that the performance of strong long-context LLMs +using RAG improves when retrieving a larger number of chunks. With a large set +of top-k chunks, RAG consistently outperforms direct long-context solutions +using the same state-of-the-art long-context models (e.g., Llama3-ChatQA-2-70B +and Qwen2-72B-Instruct) on both 32K benchmarks and real-world 128K tasks. To +advance research in this field, we open-sourced the model weights, training +data, and the evaluation setup for the community: +https://chatqa2-project.github.io/ + +
+
+ comment: v2: major update with significantly improved results +
+
+
+
+
+ + ♻ ☆ Assessing the Impact of Upselling in Online Fantasy Sports + + +
+ This study explores the impact of upselling on user engagement. We model +users' deposit behaviour on the fantasy sports platform Dream11. Subsequently, +we develop an experimental framework to evaluate the effect of upselling using +an intensity parameter. Our live experiments on user deposit behaviour reveal +decreased user recall with heightened upselling intensity. Our findings +indicate that increased upselling intensity improves user deposit metrics and +concurrently diminishes user satisfaction and conversion rates. We conduct +robust counterfactual analysis and train causal meta-learners to personalise +users' upselling intensity levels to reach an optimal trade-off point. + +
+
+
+
+
+ + ♻ ☆ A Greedy Hierarchical Approach to Whole-Network Filter-Pruning in CNNs + + +
+ Deep convolutional neural networks (CNNs) have achieved impressive +performance in many computer vision tasks. However, their large model sizes +require heavy computational resources, making pruning redundant filters from +existing pre-trained CNNs an essential task in developing efficient models for +resource-constrained devices. Whole-network filter pruning algorithms prune +varying fractions of filters from each layer, hence providing greater +flexibility. Current whole-network pruning methods are either computationally +expensive due to the need to calculate the loss for each pruned filter using a +training dataset, or use various heuristic / learned criteria for determining +the pruning fractions for each layer. This paper proposes a two-level +hierarchical approach for whole-network filter pruning which is efficient and +uses the classification loss as the final criterion. The lower-level algorithm +(called filter-pruning) uses a sparse-approximation formulation based on linear +approximation of filter weights. We explore two algorithms: orthogonal matching +pursuit-based greedy selection and a greedy backward pruning approach. The +backward pruning algorithm uses a novel closed-form error criterion for +efficiently selecting the optimal filter at each stage, thus making the whole +algorithm much faster. The higher-level algorithm (called layer-selection) +greedily selects the best-pruned layer (pruning using the filter-selection +algorithm) using a global pruning criterion. We propose algorithms for two +different global-pruning criteria: (1) layer-wise relative error (HBGS), and +(2) final classification error (HBGTS). Our suite of algorithms outperforms +state-of-the-art pruning methods on ResNet18, ResNet32, ResNet56, VGG16, and +ResNext101. Our method reduces the RAM requirement for ResNext101 from 7.6 GB +to 1.5 GB and achieves a 94% reduction in FLOPS without losing accuracy on +CIFAR-10. + +
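A hedged sketch of the greedy backward filter-pruning idea at a single layer, with an explicit least-squares re-fit in place of the paper's closed-form error criterion; the per-filter response matrix is synthetic.

```python
# Sketch: repeatedly remove the filter whose deletion (after re-fitting the
# remaining filters) least increases the reconstruction error of the layer output.
import numpy as np

rng = np.random.default_rng(0)
n_samples, n_filters = 200, 8
A = rng.normal(size=(n_samples, n_filters))      # per-filter responses
y = A @ rng.normal(size=n_filters)               # original layer output to preserve

def refit_error(keep):
    sub = A[:, keep]
    coef, *_ = np.linalg.lstsq(sub, y, rcond=None)
    return float(np.sum((sub @ coef - y) ** 2))

keep = list(range(n_filters))
target = 4                                       # number of filters to retain
while len(keep) > target:
    errors = {f: refit_error([k for k in keep if k != f]) for f in keep}
    worst = min(errors, key=errors.get)          # filter whose removal hurts least
    keep.remove(worst)
    print(f"pruned filter {worst}, reconstruction error {errors[worst]:.4f}")
print("kept filters:", keep)
```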
+
+ comment: Accepted in TMLR 2024 +
+
+
+
+
+ + ♻ ☆ Disentangling Length from Quality in Direct Preference Optimization + + +
+ Reinforcement Learning from Human Feedback (RLHF) has been a crucial +component in the recent success of Large Language Models. However, RLHF is known +to exploit biases in human preferences, such as verbosity. A well-formatted and +eloquent answer is often more highly rated by users, even when it is less +helpful and objective. A number of approaches have been developed to control +those biases in the classical RLHF literature, but the problem remains +relatively under-explored for Direct Alignment Algorithms such as Direct +Preference Optimization (DPO). Unlike classical RLHF, DPO does not train a +separate reward model or use reinforcement learning directly, so previous +approaches developed to control verbosity cannot be directly applied to this +setting. Our work makes several contributions. For the first time, we study the +length problem in the DPO setting, showing significant exploitation in DPO and +linking it to out-of-distribution bootstrapping. We then develop a principled +but simple regularization strategy that prevents length exploitation, while +still maintaining improvements in model quality. We demonstrate these effects +across datasets on summarization and dialogue, where we achieve up to 20\% +improvement in win rates when controlling for length, despite the GPT4 judge's +well-known verbosity bias. + +
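A hedged sketch of one way to regularize DPO against length exploitation: subtract a length-difference term from the preference margin. This specific regularizer is an illustrative assumption, not necessarily the paper's strategy.

```python
# Sketch: DPO loss with a length-difference term inside the logit, so that a
# chosen response does not win merely because it is longer.
import torch
import torch.nn.functional as F

def length_regularized_dpo_loss(pi_chosen_logps, pi_rejected_logps,
                                ref_chosen_logps, ref_rejected_logps,
                                chosen_lens, rejected_lens,
                                beta=0.1, alpha=0.01):
    margin = (pi_chosen_logps - ref_chosen_logps) - (pi_rejected_logps - ref_rejected_logps)
    logits = beta * margin - alpha * (chosen_lens - rejected_lens)
    return -F.logsigmoid(logits).mean()

# Toy batch: summed log-probs under policy/reference models plus token counts.
loss = length_regularized_dpo_loss(
    torch.randn(4), torch.randn(4), torch.randn(4), torch.randn(4),
    chosen_lens=torch.tensor([120., 80., 200., 64.]),
    rejected_lens=torch.tensor([90., 100., 150., 60.]))
print(loss.item())
```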
+
+
+
+
+ + ♻ ☆ Boosting Certificate Robustness for Time Series Classification with + Efficient Self-Ensemble + + +
+ Recently, the issue of adversarial robustness in the time series domain has +garnered significant attention. However, the available defense mechanisms +remain limited, with adversarial training being the predominant approach, +though it does not provide theoretical guarantees. Randomized Smoothing has +emerged as a standout method due to its ability to certify a provable lower +bound on the robustness radius under $\ell_p$-ball attacks. Recognizing its +success, research in the time series domain has started focusing on these +aspects. However, existing research predominantly focuses on time series +forecasting, or on non-$\ell_p$ robustness via statistical feature +augmentation for time series classification~(TSC). Our review found that +Randomized Smoothing performs modestly in TSC, struggling to provide effective +assurances on datasets with poor robustness. Therefore, we propose a +self-ensemble method to enhance the lower bound of the probability confidence +of predicted labels by reducing the variance of classification margins, thereby +certifying a larger radius. This approach also addresses the computational +overhead issue of Deep Ensemble~(DE) while remaining competitive and, in some +cases, outperforming it in terms of robustness. Both theoretical analysis and +experimental results validate the effectiveness of our method, demonstrating +superior performance in robustness testing compared to baseline approaches. + +
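A hedged sketch of randomized-smoothing certification where the base prediction averages the logits of several sub-models as a simple stand-in for the proposed self-ensemble; the toy linear classifiers, the Hoeffding-style lower confidence bound, and the radius formula sigma * Phi^{-1}(p_lower) follow the standard smoothing recipe rather than the paper's exact procedure.

```python
# Sketch: certify a smoothed, ensembled time-series classifier under Gaussian noise.
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
n_classes, n_models, length = 3, 5, 50
W = rng.normal(size=(n_models, n_classes, length))   # toy linear sub-classifiers

def ensemble_predict(x):
    logits = np.einsum('mcl,l->mc', W, x).mean(axis=0)   # average over sub-models
    return int(np.argmax(logits))

def certify(x, sigma=0.5, n=1000, alpha=0.001):
    counts = np.zeros(n_classes)
    for _ in range(n):
        counts[ensemble_predict(x + sigma * rng.normal(size=length))] += 1
    top = int(np.argmax(counts))
    p_hat = counts[top] / n
    p_lower = p_hat - np.sqrt(np.log(1 / alpha) / (2 * n))   # Hoeffding lower bound
    if p_lower <= 0.5:
        return top, 0.0                                      # abstain from certification
    return top, sigma * norm.ppf(p_lower)

x = rng.normal(size=length)
label, radius = certify(x)
print(f"predicted class {label}, certified radius {radius:.3f}")
```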
+
+ comment: 6 figures, 4 tables, 10 pages +
+
+
+
+
+ + ♻ ☆ An Economic Solution to Copyright Challenges of Generative AI + + +
+ Generative artificial intelligence (AI) systems are trained on large data +corpora to generate new pieces of text, images, videos, and other media. There +is growing concern that such systems may infringe on the copyright interests of +training data contributors. To address the copyright challenges of generative +AI, we propose a framework that compensates copyright owners proportionally to +their contributions to the creation of AI-generated content. The metric for +contributions is quantitatively determined by leveraging the probabilistic +nature of modern generative AI models and using techniques from cooperative +game theory in economics. This framework enables a platform where AI developers +benefit from access to high-quality training data, thus improving model +performance. Meanwhile, copyright owners receive fair compensation, driving the +continued provision of relevant data for generative model training. Experiments +demonstrate that our framework successfully identifies the most relevant data +sources used in artwork generation, ensuring a fair and interpretable +distribution of revenues among copyright owners. + +
+
+ comment: Add additional experiments on language domain +
+
+
+
+
+ + ♻ ☆ PINN surrogate of Li-ion battery models for parameter inference. Part I: + Implementation and multi-fidelity hierarchies for the single-particle model + + +
+ To plan and optimize energy storage demands that account for Li-ion battery +aging dynamics, techniques need to be developed to diagnose battery internal +states accurately and rapidly. This study seeks to reduce the computational +resources needed to determine a battery's internal states by replacing +physics-based Li-ion battery models -- such as the single-particle model (SPM) +and the pseudo-2D (P2D) model -- with a physics-informed neural network (PINN) +surrogate. The surrogate model makes high-throughput techniques, such as +Bayesian calibration, tractable to determine battery internal parameters from +voltage responses. This manuscript is the first of a two-part series that +introduces PINN surrogates of Li-ion battery models for parameter inference +(i.e., state-of-health diagnostics). In this first part, a method is presented +for constructing a PINN surrogate of the SPM. A multi-fidelity hierarchical +training, where several neural nets are trained with multiple physics-loss +fidelities is shown to significantly improve the surrogate accuracy when only +training on the governing equation residuals. The implementation is made +available in a companion repository (https://github.com/NREL/pinnstripes). The +techniques used to develop a PINN surrogate of the SPM are extended in Part II +for the PINN surrogate for the P2D battery model, and explore the Bayesian +calibration capabilities of both surrogates. + +
+
+
+
+
+ + ♻ ☆ 360VFI: A Dataset and Benchmark for Omnidirectional Video Frame + Interpolation + + +
+ Head-mounted 360{\deg} displays and portable 360{\deg} cameras have +significantly progressed, providing viewers a realistic and immersive +experience. However, many omnidirectional videos have low frame rates that can +lead to visual fatigue, and the prevailing plane frame interpolation +methodologies are unsuitable for omnidirectional video interpolation because +they are designed solely for traditional videos. This paper introduces the +benchmark dataset, 360VFI, for Omnidirectional Video Frame Interpolation. We +present a practical implementation that introduces a distortion prior from +omnidirectional video into the network to modulate distortions. Specifically, +we propose a pyramid distortion-sensitive feature extractor that uses the +unique characteristics of equirectangular projection (ERP) format as prior +information. Moreover, we devise a decoder that uses an affine transformation +to further facilitate the synthesis of intermediate frames. 360VFI is the first +dataset and benchmark that explores the challenge of Omnidirectional Video +Frame Interpolation. Through our benchmark analysis, we present four different +distortion condition scenes in the proposed 360VFI dataset to evaluate the +challenges triggered by distortion during interpolation. Besides, experimental +results demonstrate that Omnidirectional Video Interpolation can be effectively +improved by modeling for omnidirectional distortion. + +
+
+ comment: This is a preprint version +
+
+
+
+
+ + ♻ ☆ Technical Report of HelixFold3 for Biomolecular Structure Prediction + + +
+ The AlphaFold series has transformed protein structure prediction with +remarkable accuracy, often matching experimental methods. AlphaFold2, +AlphaFold-Multimer, and the latest AlphaFold3 represent significant strides in +predicting single protein chains, protein complexes, and biomolecular +structures. While AlphaFold2 and AlphaFold-Multimer are open-sourced, +facilitating rapid and reliable predictions, AlphaFold3 remains partially +accessible through a limited online server and has not been open-sourced, +restricting further development. To address these challenges, the PaddleHelix +team is developing HelixFold3, aiming to replicate AlphaFold3's capabilities. +Using insights from previous models and extensive datasets, HelixFold3 achieves +an accuracy comparable to AlphaFold3 in predicting the structures of +conventional ligands, nucleic acids, and proteins. The initial release of +HelixFold3 is available as open source on GitHub for academic research, +promising to advance biomolecular research and accelerate discoveries. We also +provide online service at PaddleHelix website at +https://paddlehelix.baidu.com/app/all/helixfold3/forecast. + +
+
+
+
+
+ + ♻ ☆ Efficient Imitation Without Demonstrations via Value-Penalized Auxiliary + Control from Examples ICRA'25 + + +
+ Learning from examples of success is an appealing approach to reinforcement +learning, but it presents a challenging exploration problem, especially for +complex or long-horizon tasks. This work introduces value-penalized auxiliary +control from examples (VPACE), an algorithm that significantly improves +exploration in example-based control by adding examples of simple auxiliary +tasks. For instance, a manipulation task may have auxiliary examples of an +object being reached for, grasped, or lifted. We show that the na\"{i}ve +application of scheduled auxiliary control to example-based learning can lead +to value overestimation and poor performance. We resolve the problem with an +above-success-level value penalty. Across both simulated and real robotic +environments, we show that our approach substantially improves learning +efficiency for challenging tasks, while maintaining bounded value estimates. We +compare with existing approaches to example-based learning, inverse +reinforcement learning, and an exploration bonus. Preliminary results also +suggest that VPACE may learn more efficiently than the more common approaches +of using full trajectories or true sparse rewards. Videos, code, and datasets: +https://papers.starslab.ca/vpace. + +
+
+ comment: Submitted to IEEE International Conference on Robotics and Automation + (ICRA'25), Atlanta, USA, May 19-23, 2025 +
+
+
+
+
+ + ♻ ☆ A Fourier Approach to the Parameter Estimation Problem for + One-dimensional Gaussian Mixture Models + + +
+ The purpose of this paper is twofold. First, we propose a novel algorithm for +estimating parameters in one-dimensional Gaussian mixture models (GMMs). The +algorithm takes advantage of the Hankel structure inherent in the Fourier data +obtained from independent and identically distributed (i.i.d) samples of the +mixture. For GMMs with a unified variance, a singular value ratio functional +using the Fourier data is introduced and used to resolve the variance and +component number simultaneously. The consistency of the estimator is derived. +Compared to classic algorithms such as the method of moments and the maximum +likelihood method, the proposed algorithm does not require prior knowledge of +the number of Gaussian components or good initial guesses. Numerical +experiments demonstrate its superior performance in estimation accuracy and +computational cost. Second, we reveal that there exists a fundamental limit to +the problem of estimating the number of Gaussian components or model order in +the mixture model if the number of i.i.d samples is finite. For the case of a +single variance, we show that the model order can be successfully estimated +only if the minimum separation distance between the component means exceeds a +certain threshold value and can fail if below. We derive a lower bound for this +threshold value, referred to as the computational resolution limit, in terms of +the number of i.i.d samples, the variance, and the number of Gaussian +components. Numerical experiments confirm this phase transition phenomenon in +estimating the model order. Moreover, we demonstrate that our algorithm +achieves better scores in likelihood, AIC, and BIC when compared to the EM +algorithm. + +
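A hedged sketch of the Fourier/Hankel idea for model-order detection: divide the empirical characteristic function by the Gaussian envelope (the common variance is assumed known here for simplicity, whereas the paper estimates it), form a Hankel matrix on a uniform frequency grid, and read the component count off the singular-value ratios. The grid spacing and sizes are illustrative.

```python
# Sketch: estimate the number of components of a 1-D Gaussian mixture from the
# singular-value ratios of a Hankel matrix built from Fourier (characteristic
# function) data.
import numpy as np
from scipy.linalg import hankel

rng = np.random.default_rng(0)
means, weights, sigma = np.array([-3.0, 0.0, 3.0]), np.array([0.3, 0.4, 0.3]), 0.4
n = 50000
comp = rng.choice(len(means), size=n, p=weights)
samples = rng.normal(means[comp], sigma)

m, h = 6, 0.4                                  # Hankel half-size and frequency spacing
t = h * np.arange(2 * m + 1)
ecf = np.exp(1j * np.outer(t, samples)).mean(axis=1)      # empirical char. function
g = ecf / np.exp(-0.5 * sigma ** 2 * t ** 2)              # remove the Gaussian envelope
H = hankel(g[: m + 1], g[m:])                             # (m+1) x (m+1) Hankel matrix

s = np.linalg.svd(H, compute_uv=False)
ratios = s[:-1] / s[1:]
print("singular values:", np.round(s, 3))
print("estimated #components:", int(np.argmax(ratios)) + 1)  # typically 3 for this well-separated mixture
```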
+
+
+
+
+ + ♻ ☆ Shared Latent Space by Both Languages in Non-Autoregressive Neural + Machine Translation + + +
+ Non-autoregressive neural machine translation (NAT) offers a substantial
+translation speed-up compared to autoregressive neural machine translation (AT)
+at the cost of translation quality. Latent variable modeling has emerged as a
+promising approach to bridge this quality gap, particularly for addressing the
+chronic multimodality problem in NAT. Previous works that used latent variable
+modeling added an auxiliary model to estimate the posterior distribution of the
+latent variable conditioned on the source and target sentences. However, this
+causes several disadvantages, such as redundant information extraction in the
+latent variable, an increased number of parameters, and a tendency to ignore
+some information from the inputs. In this paper, we propose a novel latent
+variable modeling that integrates a dual reconstruction perspective and an
+advanced hierarchical latent modeling with a shared intermediate latent space
+across languages. This latent variable modeling hypothetically alleviates or
+prevents the above disadvantages. In our experimental results, we demonstrate
+comprehensively that our proposed approach infers superior latent variables,
+which lead to better translation quality. Finally, on benchmark translation
+tasks such as WMT, we demonstrate that our proposed method significantly
+improves translation quality compared to previous NAT baselines, including the
+state-of-the-art NAT model.
+
+
+
+
+
+ + ♻ ☆ A Survey on Self-Supervised Learning for Non-Sequential Tabular Data ACML-24 + + +
+ Self-supervised learning (SSL) has been incorporated into many
+state-of-the-art models in various domains, where SSL defines pretext tasks
+based on unlabeled datasets to learn contextualized and robust representations.
+Recently, SSL has become a new trend in exploring the representation learning
+capability in the realm of tabular data, which is more challenging due to not
+having explicit relations for learning descriptive representations. This survey
+aims to systematically review and summarize the recent progress and challenges
+of SSL for non-sequential tabular data (SSL4NS-TD). We first present a formal
+definition of NS-TD and clarify its correlation to related studies. Then, these
+approaches are categorized into three groups - predictive learning, contrastive
+learning, and hybrid learning - along with the motivations and strengths of
+representative methods in each direction. Moreover, application issues of
+SSL4NS-TD are presented, including automatic data engineering, cross-table
+transferability, and domain knowledge integration. In addition, we elaborate on
+existing benchmarks and datasets for NS-TD applications to analyze the
+performance of existing tabular models. Finally, we discuss the challenges of
+SSL4NS-TD and provide potential directions for future research. We expect our
+work to be useful in terms of encouraging more research on lowering the barrier
+to entry of SSL for the tabular domain, and of improving the foundations for
+implicit tabular data.
+
+
+ comment: ACML-24 Journal Track. The paper list can be found at + https://github.com/wwweiwei/awesome-self-supervised-learning-for-tabular-data +
+
+
+
+
+ + ♻ ☆ Spatial Craving Patterns in Marijuana Users: Insights from fMRI Brain + Connectivity Analysis with High-Order Graph Attention Neural Networks + + +
+ The excessive consumption of marijuana can induce substantial psychological
+and social consequences. In this investigation, we propose an elucidative
+framework termed high-order graph attention neural networks (HOGANN) for the
+classification of marijuana addiction, coupled with an analysis of localized
+brain network communities exhibiting abnormal activities among chronic
+marijuana users. HOGANN integrates dynamic intrinsic functional brain networks,
+estimated from functional magnetic resonance imaging (fMRI), using graph
+attention-based long short-term memory (GAT-LSTM) to capture temporal network
+dynamics. We employ a high-order attention module for information fusion and
+message passing among neighboring nodes, enhancing the network community
+analysis. Our model is validated across two distinct data cohorts, yielding
+substantially higher classification accuracy than benchmark algorithms.
+Furthermore, we discern the most pertinent subnetworks and cognitive regions
+affected by persistent marijuana consumption, indicating adverse effects on
+functional brain networks, particularly within the dorsal attention and
+frontoparietal networks. Intriguingly, our model demonstrates superior
+performance in cohorts exhibiting prolonged dependence, implying that prolonged
+marijuana usage induces more pronounced alterations in brain networks. The
+model proficiently identifies craving brain maps, thereby delineating critical
+brain regions for analysis.
+
+
+
+
+
+ + ♻ ☆ Keypoint Action Tokens Enable In-Context Imitation Learning in Robotics + + +
+ We show that off-the-shelf text-based Transformers, with no additional +training, can perform few-shot in-context visual imitation learning, mapping +visual observations to action sequences that emulate the demonstrator's +behaviour. We achieve this by transforming visual observations (inputs) and +trajectories of actions (outputs) into sequences of tokens that a +text-pretrained Transformer (GPT-4 Turbo) can ingest and generate, via a +framework we call Keypoint Action Tokens (KAT). Despite being trained only on +language, we show that these Transformers excel at translating tokenised visual +keypoint observations into action trajectories, performing on par or better +than state-of-the-art imitation learning (diffusion policies) in the low-data +regime on a suite of real-world, everyday tasks. Rather than operating in the +language domain as is typical, KAT leverages text-based Transformers to operate +in the vision and action domains to learn general patterns in demonstration +data for highly efficient imitation learning, indicating promising new avenues +for repurposing natural language models for embodied tasks. Videos are +available at https://www.robot-learning.uk/keypoint-action-tokens. + +
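+
+ A minimal sketch of the tokenisation idea described above: keypoints and
+actions are serialised as plain-text number sequences so that a text-only
+Transformer can complete the next action in context. The formatting below is an
+assumption, not the authors' exact interface, and the LLM call itself is
+omitted.
+
+    def build_kat_prompt(demos, observation, precision=1):
+        """Serialise (keypoints, action) demonstrations and a new observation into a
+        text prompt; the language model is asked to complete the missing action."""
+        def tok(vec):
+            return " ".join(f"{v:.{precision}f}" for v in vec)
+        lines = [f"keypoints: {tok(kp)} -> action: {tok(act)}" for kp, act in demos]
+        lines.append(f"keypoints: {tok(observation)} -> action:")
+        return "\n".join(lines)
+
+    demos = [([0.1, 0.2, 0.3], [0.5, 0.6]), ([0.2, 0.1, 0.4], [0.4, 0.7])]
+    print(build_kat_prompt(demos, [0.15, 0.15, 0.35]))
+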
+
+ comment: Published at Robotics: Science and Systems (RSS) 2024 +
+
+
+
+
+ + ♻ ☆ Adaptive Class Emergence Training: Enhancing Neural Network Stability + and Generalization through Progressive Target Evolution + + +
+ Recent advancements in artificial intelligence, particularly deep neural +networks, have pushed the boundaries of what is achievable in complex tasks. +Traditional methods for training neural networks in classification problems +often rely on static target outputs, such as one-hot encoded vectors, which can +lead to unstable optimization and difficulties in handling non-linearities +within data. In this paper, we propose a novel training methodology that +progressively evolves the target outputs from a null vector to one-hot encoded +vectors throughout the training process. This gradual transition allows the +network to adapt more smoothly to the increasing complexity of the +classification task, maintaining an equilibrium state that reduces the risk of +overfitting and enhances generalization. Our approach, inspired by concepts +from structural equilibrium in finite element analysis, has been validated +through extensive experiments on both synthetic and real-world datasets. The +results demonstrate that our method achieves faster convergence, improved +accuracy, and better generalization, especially in scenarios with high data +complexity and noise. This progressive training framework offers a robust +alternative to classical methods, opening new perspectives for more efficient +and stable neural network training. + +
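+
+ A minimal sketch of the progressive-target idea under a linear schedule
+assumption (the paper's schedule and loss may differ): targets grow from the
+null vector toward one-hot vectors as training progresses.
+
+    import torch
+    import torch.nn.functional as F
+
+    def progressive_target(labels, num_classes, epoch, total_epochs):
+        """Targets evolve from a null vector (alpha=0) to one-hot vectors (alpha=1)."""
+        alpha = min(1.0, epoch / total_epochs)
+        return alpha * F.one_hot(labels, num_classes).float()
+
+    def progressive_loss(logits, labels, epoch, total_epochs):
+        target = progressive_target(labels, logits.size(-1), epoch, total_epochs)
+        return F.mse_loss(torch.sigmoid(logits), target)  # regress toward the evolving target
+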
+
+ comment: 15 pages, 9 figures, 2 tables +
+
+
+
+
+
+
+
+ + Multimedia 15 + +
+
+
+ + ☆ A CLIP-based siamese approach for meme classification + + +
+ Memes are an increasingly prevalent element of online discourse in social
+networks, especially among young audiences. They carry ideas and messages that
+range from humorous to hateful, and are widely consumed. Their potentially high
+impact requires adequate means of control to moderate their use at large scale.
+In this work, we propose SimCLIP, a deep learning-based architecture for
+cross-modal understanding of memes, leveraging a pre-trained CLIP encoder to
+produce context-aware embeddings and a Siamese fusion technique to capture the
+interactions between text and image. We perform extensive experimentation on
+seven meme classification tasks across six datasets. We establish a new state
+of the art in Memotion7k with a 7.25% relative F1-score improvement, and
+achieve super-human performance on Harm-P with a 13.73% F1-score improvement.
+Our approach demonstrates the potential for compact meme classification models,
+enabling accurate and efficient meme monitoring. We share our code at
+https://github.com/jahuerta92/meme-classification-simclip
+
+
+
+
+
+ + ☆ A Toolkit for Joint Speaker Diarization and Identification with + Application to Speaker-Attributed ASR + + +
+ We present a modular toolkit to perform joint speaker diarization and speaker
+identification. The toolkit can leverage multiple models and algorithms, which
+are defined in a configuration file. Such flexibility allows our system to work
+properly in various conditions (e.g., multiple registered speakers' sets,
+acoustic conditions and languages) and across application domains (e.g., media
+monitoring, institutional, speech analytics). In this demonstration we show a
+practical use-case in which speaker-related information is used jointly with
+automatic speech recognition engines to generate speaker-attributed
+transcriptions. To achieve that, we employ a user-friendly web-based interface
+to process audio and video inputs with the chosen configuration.
+
+
+ comment: Show and Tell paper. Presented at Interspeech 2024 +
+
+
+
+
+ + ☆ Audio-Visual Speaker Diarization: Current Databases, Approaches and + Challenges + + +
+ Nowadays, the large amount of audio-visual content available has fostered the +need to develop new robust automatic speaker diarization systems to analyse and +characterise it. This kind of system helps to reduce the cost of doing this +process manually and allows the use of the speaker information for different +applications, as a huge quantity of information is present, for example, images +of faces, or audio recordings. Therefore, this paper aims to address a critical +area in the field of speaker diarization systems, the integration of +audio-visual content of different domains. This paper seeks to push beyond +current state-of-the-art practices by developing a robust audio-visual speaker +diarization framework adaptable to various data domains, including TV +scenarios, meetings, and daily activities. Unlike most of the existing +audio-visual speaker diarization systems, this framework will also include the +proposal of an approach to lead the precise assignment of specific identities +in TV scenarios where celebrities appear. In addition, in this work, we have +conducted an extensive compilation of the current state-of-the-art approaches +and the existing databases for developing audio-visual speaker diarization. + +
+
+
+
+
+ + ☆ CustomContrast: A Multilevel Contrastive Perspective For Subject-Driven + Text-to-Image Customization + + +
+ Subject-driven text-to-image (T2I) customization has drawn significant
+interest in academia and industry. This task enables pre-trained models to
+generate novel images based on unique subjects. Existing studies adopt a
+self-reconstructive perspective, focusing on capturing all details of a single
+image, which misconstrues the specific image's irrelevant attributes (e.g.,
+view, pose, and background) as the subject's intrinsic attributes. This
+misconstruction leads to overfitting or underfitting of irrelevant and
+intrinsic attributes of the subject, i.e., these attributes are
+over-represented or under-represented simultaneously, causing a trade-off
+between similarity and controllability. In this study, we argue that an ideal
+subject representation can be achieved by a cross-differential perspective,
+i.e., decoupling subject intrinsic attributes from irrelevant attributes via
+contrastive learning, which allows the model to focus more on intrinsic
+attributes through intra-consistency (features of the same subject are
+spatially closer) and inter-distinctiveness (features of different subjects
+have distinguished differences). Specifically, we propose CustomContrast, a
+novel framework, which includes a Multilevel Contrastive Learning (MCL)
+paradigm and a Multimodal Feature Injection (MFI) Encoder. The MCL paradigm is
+used to extract intrinsic features of subjects from high-level semantics to
+low-level appearance through crossmodal semantic contrastive learning and
+multiscale appearance contrastive learning. To facilitate contrastive learning,
+we introduce the MFI encoder to capture cross-modal representations. Extensive
+experiments show the effectiveness of CustomContrast in subject similarity and
+text controllability.
+
+
+
+
+
+ + ☆ Exploring Rich Subjective Quality Information for Image Quality + Assessment in the Wild + + +
+ Traditional in-the-wild image quality assessment (IQA) models are generally
+trained with the quality labels of mean opinion score (MOS), while missing the
+rich subjective quality information contained in the quality ratings, for
+example, the standard deviation of opinion scores (SOS) or even the
+distribution of opinion scores (DOS). In this paper, we propose a novel IQA
+method named RichIQA to explore the rich subjective rating information beyond
+MOS to predict image quality in the wild. RichIQA is characterized by two key
+novel designs: (1) a three-stage image quality prediction network which
+exploits the powerful feature representation capability of the Convolutional
+vision Transformer (CvT) and mimics the short-term and long-term memory
+mechanisms of the human brain; (2) a multi-label training strategy in which
+rich subjective quality information like MOS, SOS and DOS is concurrently used
+to train the quality prediction network. Powered by these two novel designs,
+RichIQA is able to predict the image quality in terms of a distribution, from
+which the mean image quality can be subsequently obtained. Extensive
+experimental results verify that the three-stage network is tailored to predict
+rich quality information, while the multi-label training strategy can fully
+exploit the potential within subjective quality ratings and enhance the
+prediction performance and generalizability of the network. RichIQA outperforms
+state-of-the-art competitors on multiple large-scale in-the-wild IQA databases
+with rich subjective rating labels. The code of RichIQA will be made publicly
+available on GitHub.
+
+
+
+
+
+ + ☆ Educational Virtual Field Trips based on Social VR and 360° Spaces + + +
+ Virtual field trips (VFTs) have proven to be valuable learning tools. Such
+applications are mostly based on 360° technology and are to be characterized as
+single-user applications in technological terms. In contrast, Social VR
+applications are characterized by multi-user capability and user-specific
+avatars. From a learning perspective, the concepts of collaborative learning
+and embodiment have long been proposed as conducive to learning. Both concepts
+might be supported using Social VR. However, little is currently known about
+the use of Social VR for VFTs. Accordingly, the research questions are to what
+extent VFTs can be implemented in Social VR environments and how these Social
+VR-based VFTs are perceived by learners. This article presents an evaluation
+study on the development and evaluation of a VFT environment using the Social
+VR platform Mozilla Hubs. It describes the design decisions to create the
+environment and evaluation results from a mixed-method study (N=16) using a
+questionnaire and focus group discussions. The study highlighted the
+opportunities offered by Social VR-based VFTs but also revealed several
+challenges that need to be addressed to embrace the potential of Social
+VR-based VFTs to be utilized regularly in education.
+
+
+ comment: 9 pages, 7 figures, 1 table, submitted to Games and Learning Alliance + Conference +
+
+
+
+
+ + ☆ A Survey of Multimodal Composite Editing and Retrieval + + +
+ In the real world, where information is abundant and diverse across different
+modalities, understanding and utilizing various data types to improve retrieval
+systems is a key focus of research. Multimodal composite retrieval integrates
+diverse modalities such as text, images, and audio to provide more accurate,
+personalized, and contextually relevant results. To facilitate a deeper
+understanding of this promising direction, this survey explores multimodal
+composite editing and retrieval in depth, covering image-text composite
+editing, image-text composite retrieval, and other multimodal composite
+retrieval. In this survey, we systematically organize the application
+scenarios, methods, benchmarks, experiments, and future directions. Multimodal
+learning is a hot topic in the large model era, and several surveys on
+multimodal learning and vision-language models with transformers have been
+published in the PAMI journal. To the best of our knowledge, this survey is the
+first comprehensive review of the literature on multimodal composite retrieval,
+which is a timely complement to existing reviews on multimodal fusion. To help
+readers quickly track this field, we build the project page for this survey,
+which can be found at
+https://github.com/fuxianghuang1/Multimodal-Composite-Editing-and-Retrieval.
+
+
+ comment: 22 pages, 3 figures, and 11 tables +
+
+
+
+
+ + ☆ Look One and More: Distilling Hybrid Order Relational Knowledge for + Cross-Resolution Image Recognition AAAI 2020 + + +
+ In spite of the great success in many image recognition tasks achieved by
+recent deep models, directly applying them to recognize low-resolution images
+may suffer from low accuracy due to the loss of informative details during
+resolution degradation. However, these images are still recognizable for
+subjects who are familiar with the corresponding high-resolution ones. Inspired
+by that, we propose a teacher-student learning approach to facilitate
+low-resolution image recognition via hybrid order relational knowledge
+distillation. The approach consists of three streams: the teacher stream is
+pretrained to recognize high-resolution images with high accuracy, the student
+stream is learned to identify low-resolution images by mimicking the teacher's
+behaviors, and the extra assistant stream is introduced as a bridge to help
+knowledge transfer from the teacher to the student. To extract sufficient
+knowledge for reducing the loss in accuracy, the learning of the student is
+supervised with multiple losses, which preserves the similarities in various
+order relational structures. In this way, the capability of recovering missing
+details of familiar low-resolution images can be effectively enhanced, leading
+to a better knowledge transfer. Extensive experiments on metric learning,
+low-resolution image classification and low-resolution face recognition tasks
+show the effectiveness of our approach while using reduced models.
+
+
+ comment: Accepted by AAAI 2020 +
+
+
+
+
+ + ☆ KAN-Based Fusion of Dual-Domain for Audio-Driven Facial Landmarks + Generation + + +
+ Audio-driven talking face generation is a widely researched topic due to its
+high applicability. Reconstructing a talking face using audio significantly
+contributes to fields such as education, healthcare, online conversations,
+virtual assistants, and virtual reality. Early studies often focused solely on
+changing the mouth movements, which resulted in outcomes with limited practical
+applications. Recently, researchers have proposed a new approach of
+constructing the entire face, including face pose, neck, and shoulders. To
+achieve this, they need to generate the face through landmarks. However,
+creating stable landmarks that align well with the audio is a challenge. In
+this paper, we propose the KFusion of Dual-Domain model, a robust model that
+generates landmarks from audio. We separate the audio into two distinct domains
+to learn emotional information and facial context, then use a fusion mechanism
+based on the KAN model. Our model demonstrates high efficiency compared to
+recent models. This will lay the groundwork for the development of the
+audio-driven talking face generation problem in the future.
+
+
+
+
+
+ + ☆ Adaptive Offloading and Enhancement for Low-Light Video Analytics on + Mobile Devices + + +
+ In this paper, we explore adaptive offloading and enhancement strategies for
+video analytics tasks on computing-constrained mobile devices in low-light
+conditions. We observe that the accuracy of low-light video analytics varies
+across different enhancement algorithms. The root cause could be the
+disparities in the effectiveness of enhancement algorithms for feature
+extraction in analytic models. Specifically, the difference in class activation
+maps (CAMs) between enhanced and low-light frames demonstrates a positive
+correlation with video analytics accuracy. Motivated by such observations, a
+novel enhancement quality assessment method based on CAMs is proposed to
+evaluate the effectiveness of different enhancement algorithms for low-light
+videos. Then, we design a multi-edge system, which adaptively offloads and
+enhances low-light video analytics tasks from mobile devices. To achieve the
+trade-off between the enhancement quality and the latency for all system-served
+mobile devices, we propose a genetic-based scheduling algorithm, which can find
+a near-optimal solution in a reasonable time to meet the latency requirement.
+Thereby, the offloading strategies and the enhancement algorithms are properly
+selected under the condition of limited end-edge bandwidth and edge computation
+resources. Simulation experiments demonstrate the superiority of the proposed
+system, improving accuracy by up to 20.83% compared to existing benchmarks.
+
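+
+ A minimal sketch of the CAM-difference observation above; the paper's actual
+assessment metric and genetic scheduler are more involved, so the function
+names and the mean-absolute-difference score are assumptions.
+
+    import numpy as np
+
+    def cam_difference_score(cam_enhanced, cam_lowlight):
+        """A larger CAM change between enhanced and low-light frames is used as a
+        proxy for better enhancement quality for the downstream analytic model."""
+        return float(np.abs(np.asarray(cam_enhanced) - np.asarray(cam_lowlight)).mean())
+
+    def select_enhancement(cams_by_algorithm, cam_lowlight):
+        # pick the enhancement algorithm whose CAM departs most from the low-light CAM
+        return max(cams_by_algorithm,
+                   key=lambda name: cam_difference_score(cams_by_algorithm[name], cam_lowlight))
+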
+
+
+
+
+ + ♻ ☆ HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale + Space Using Wearable IMUs and LiDAR + + +
+ We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture
+method, aimed at accurately and efficiently creating a dynamic digital world,
+containing large-scale indoor-outdoor scenes, diverse human motions, rich
+human-human interactions, and human-environment interactions. By utilizing
+body-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human
+motions in unconstrained space without the need for external devices and
+pre-built maps. This affords great flexibility and accessibility for
+human-centered interaction and 4D scene capturing in various environments.
+Taking into account that IMUs can capture spatially unrestricted human poses
+but are prone to drifting over long periods of use, while LiDAR is stable for
+global localization but coarse for local positions and orientations, HiSC4D
+employs a joint optimization method, harmonizing all sensors and utilizing
+environment cues, yielding promising results for long-term capture in large
+scenes. To promote research of egocentric human interaction in large scenes and
+facilitate downstream tasks, we also present a dataset, containing 8 sequences
+in 4 large scenes (200 to 5,000 $m^2$), providing 36k frames of accurate 4D
+human motions with SMPL annotations and dynamic scenes, 31k frames of cropped
+human point clouds, and scene mesh of the environment. A variety of scenarios,
+such as the basketball gym and commercial street, alongside challenging human
+motions, such as daily greeting, one-on-one basketball playing, and tour
+guiding, demonstrate the effectiveness and the generalization ability of
+HiSC4D. The dataset and code will be published at
+www.lidarhumanmotion.net/hisc4d for research purposes.
+
+
+ comment: 17 pages, 10 figures, Journal
+
+
+
+
+
+ + ♻ ☆ Auto-ACD: A Large-scale Dataset for Audio-Language Representation + Learning ACM MM 2024 + + +
+ Recently, the AI community has made significant strides in developing
+powerful foundation models, driven by large-scale multimodal datasets. However,
+for audio representation learning, existing datasets suffer from limitations in
+the following aspects: insufficient volume, simplistic content, and arduous
+collection procedures. To establish an audio dataset with high-quality
+captions, we propose an innovative, automatic approach leveraging multimodal
+inputs, such as video frames and audio streams. Specifically, we construct a
+large-scale, high-quality, audio-language dataset, named Auto-ACD, comprising
+over 1.5M audio-text pairs. We exploit a series of pre-trained models or APIs
+to determine audio-visual synchronisation and to generate image captions,
+object detections, or audio tags for specific videos. Subsequently, we employ
+an LLM to paraphrase a congruent caption for each audio clip, guided by the
+extracted multi-modality clues. To demonstrate the effectiveness of the
+proposed dataset, we train widely used models on our dataset and show
+performance improvement on various downstream tasks, for example,
+audio-language retrieval, audio captioning, and zero-shot classification. In
+addition, we establish a novel benchmark with environmental information for
+audio-text tasks.
+
+
+ comment: Accepted by ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Question-Answering Dense Video Events AAAI 2025 + + +
+ Multimodal Large Language Models (MLLMs) have shown excellent performance in +question-answering of single-event videos. In this paper, we present +question-answering dense video events, a novel task that requires answering and +grounding the dense-event questions in long videos, thus challenging MLLMs to +faithfully comprehend and reason about multiple events occurring over extended +time periods. To facilitate the study, we construct DeVE-QA - a dataset +featuring 78K questions about 26K events on 10.6K long videos. We then +benchmark and show that existing MLLMs excelling at single-event QA struggle to +perform well in DeVE-QA. For improvement, we propose DeVi, a novel +training-free MLLM approach that highlights a hierarchical captioning module, a +temporal event memory module, and a self-consistency checking module to +respectively detect, contextualize and memorize, and ground dense-events in +long videos for question answering. Extensive experiments show that DeVi is +superior at answering dense-event questions and grounding relevant video +moments. Compared with existing MLLMs, it achieves a remarkable increase of 4.1 +percent and 3.7 percent for G(round)QA accuracy on DeVE-QA and NExT-GQA +respectively. + +
+
+ comment: Submitted to AAAI 2025 +
+
+
+
+
+ + ♻ ☆ Concept Conductor: Orchestrating Multiple Personalized Concepts in + Text-to-Image Synthesis + + +
+ The customization of text-to-image models has seen significant advancements, +yet generating multiple personalized concepts remains a challenging task. +Current methods struggle with attribute leakage and layout confusion when +handling multiple concepts, leading to reduced concept fidelity and semantic +consistency. In this work, we introduce a novel training-free framework, +Concept Conductor, designed to ensure visual fidelity and correct layout in +multi-concept customization. Concept Conductor isolates the sampling processes +of multiple custom models to prevent attribute leakage between different +concepts and corrects erroneous layouts through self-attention-based spatial +guidance. Additionally, we present a concept injection technique that employs +shape-aware masks to specify the generation area for each concept. This +technique injects the structure and appearance of personalized concepts through +feature fusion in the attention layers, ensuring harmony in the final image. +Extensive qualitative and quantitative experiments demonstrate that Concept +Conductor can consistently generate composite images with accurate layouts +while preserving the visual details of each concept. Compared to existing +baselines, Concept Conductor shows significant performance improvements. Our +method supports the combination of any number of concepts and maintains high +fidelity even when dealing with visually similar concepts. The code and models +are available at https://github.com/Nihukat/Concept-Conductor. + +
+
+ comment: Github Page: https://github.com/Nihukat/Concept-Conductor +
+
+
+
+
+ + ♻ ☆ 360VFI: A Dataset and Benchmark for Omnidirectional Video Frame + Interpolation + + +
+ Head-mounted 360° displays and portable 360° cameras have significantly
+progressed, providing viewers a realistic and immersive experience. However,
+many omnidirectional videos have low frame rates that can lead to visual
+fatigue, and the prevailing plane frame interpolation methodologies are
+unsuitable for omnidirectional video interpolation because they are designed
+solely for traditional videos. This paper introduces the benchmark dataset,
+360VFI, for Omnidirectional Video Frame Interpolation. We present a practical
+implementation that introduces a distortion prior from omnidirectional video
+into the network to modulate distortions. Specifically, we propose a pyramid
+distortion-sensitive feature extractor that uses the unique characteristics of
+equirectangular projection (ERP) format as prior information. Moreover, we
+devise a decoder that uses an affine transformation to further facilitate the
+synthesis of intermediate frames. 360VFI is the first dataset and benchmark
+that explores the challenge of Omnidirectional Video Frame Interpolation.
+Through our benchmark analysis, we present four different distortion condition
+scenes in the proposed 360VFI dataset to evaluate the challenges triggered by
+distortion during interpolation. Besides, experimental results demonstrate that
+Omnidirectional Video Interpolation can be effectively improved by modeling for
+omnidirectional distortion.
+
+
+ comment: This is a preprint version +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 38 + +
+
+
+ + ☆ Socially Responsible Data for Large Multilingual Language Models + + +
+ Large Language Models (LLMs) have rapidly increased in size and apparent
+capabilities in the last three years, but their training data is largely
+English text. There is growing interest in multilingual LLMs, and various
+efforts are striving for models to accommodate languages of communities outside
+of the Global North, which include many languages that have been historically
+underrepresented in digital realms. These languages have been coined as "low
+resource languages" or "long-tail languages", and LLM performance on these
+languages is generally poor. While expanding the use of LLMs to more languages
+may bring many potential benefits, such as assisting cross-community
+communication and language preservation, great care must be taken to ensure
+that data collection on these languages is not extractive and that it does not
+reproduce exploitative practices of the past. Collecting data from languages
+spoken by previously colonized people, indigenous people, and non-Western
+languages raises many complex sociopolitical and ethical questions, e.g.,
+around consent, cultural safety, and data sovereignty. Furthermore, linguistic
+complexity and cultural nuances are often lost in LLMs. This position paper
+builds on recent scholarship, and our own work, and outlines several relevant
+social, cultural, and ethical considerations and potential ways to mitigate
+them through qualitative research, community partnerships, and participatory
+design approaches. We provide twelve recommendations for consideration when
+collecting language data on underrepresented language communities outside of
+the Global North.
+
+
+
+
+
+ + ☆ Exploring Intrinsic Language-specific Subspaces in Fine-tuning + Multilingual Neural Machine Translation + + +
+ Multilingual neural machine translation models support fine-tuning hundreds
+of languages simultaneously. However, fine-tuning all parameters alone is
+inefficient and can potentially lead to negative interactions among languages.
+In this work, we demonstrate that fine-tuning for a language occurs in its
+intrinsic language-specific subspace with only a tiny fraction of the entire
+parameters. Thus, we propose language-specific LoRA to isolate intrinsic
+language-specific subspaces. Furthermore, we propose architecture learning
+techniques and introduce a gradual pruning schedule during fine-tuning to
+exhaustively explore the optimal setting and the minimal intrinsic subspaces
+for each language, resulting in a lightweight yet effective fine-tuning
+procedure. The experimental results on a 12-language subset and a 30-language
+subset of FLORES-101 show that our methods not only outperform full-parameter
+fine-tuning by up to 2.25 spBLEU scores but also reduce trainable parameters to
+$0.4\%$ for high and medium-resource languages and $1.6\%$ for low-resource
+ones.
+
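+
+ A minimal sketch of the language-specific LoRA idea (rank, initialisation, and
+the gradual pruning schedule are assumptions and omitted): each language
+updates only its own low-rank adapter on top of a frozen base projection.
+
+    import torch
+    import torch.nn as nn
+
+    class LanguageSpecificLoRA(nn.Module):
+        """Frozen base linear layer plus one low-rank (A, B) adapter per language."""
+        def __init__(self, base_linear, languages, rank=8):
+            super().__init__()
+            self.base = base_linear
+            for p in self.base.parameters():
+                p.requires_grad = False
+            d_out, d_in = base_linear.weight.shape
+            self.A = nn.ParameterDict(
+                {l: nn.Parameter(0.01 * torch.randn(rank, d_in)) for l in languages})
+            self.B = nn.ParameterDict(
+                {l: nn.Parameter(torch.zeros(d_out, rank)) for l in languages})
+
+        def forward(self, x, lang):
+            # delta_W = B_lang @ A_lang is applied only for the current language
+            return self.base(x) + x @ self.A[lang].t() @ self.B[lang].t()
+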
+
+
+
+
+ + ☆ Interactive Machine Teaching by Labeling Rules and Instances ACL 2024 + + +
+ Weakly supervised learning aims to reduce the cost of labeling data by using +expert-designed labeling rules. However, existing methods require experts to +design effective rules in a single shot, which is difficult in the absence of +proper guidance and tooling. Therefore, it is still an open question whether +experts should spend their limited time writing rules or instead providing +instance labels via active learning. In this paper, we investigate how to +exploit an expert's limited time to create effective supervision. First, to +develop practical guidelines for rule creation, we conduct an exploratory +analysis of diverse collections of existing expert-designed rules and find that +rule precision is more important than coverage across datasets. Second, we +compare rule creation to individual instance labeling via active learning and +demonstrate the importance of both across 6 datasets. Third, we propose an +interactive learning framework, INTERVAL, that achieves efficiency by +automatically extracting candidate rules based on rich patterns (e.g., by +prompting a language model), and effectiveness by soliciting expert feedback on +both candidate rules and individual instances. Across 6 datasets, INTERVAL +outperforms state-of-the-art weakly supervised approaches by 7% in F1. +Furthermore, it requires as few as 10 queries for expert feedback to reach F1 +values that existing active learning methods cannot match even with 100 +queries. + +
+
+ comment: Accepted to TACL 2024 +
+
+
+
+
+ + ☆ Seemingly Plausible Distractors in Multi-Hop Reasoning: Are Large + Language Models Attentive Readers? + + +
+ State-of-the-art Large Language Models (LLMs) are credited with an increasing
+number of different capabilities, ranging from reading comprehension and
+advanced mathematical and reasoning skills to possessing scientific knowledge.
+In this paper we focus on their multi-hop reasoning capability: the ability to
+identify and integrate information from multiple textual sources.
+ Given the concerns with the presence of simplifying cues in existing
+multi-hop reasoning benchmarks, which allow models to circumvent the reasoning
+requirement, we set out to investigate whether LLMs are prone to exploiting
+such simplifying cues. We find evidence that they indeed circumvent the
+requirement to perform multi-hop reasoning, but they do so in more subtle ways
+than what was reported about their fine-tuned pre-trained language model (PLM)
+predecessors. Motivated by this finding, we propose a challenging multi-hop
+reasoning benchmark by generating seemingly plausible multi-hop reasoning
+chains, which ultimately lead to incorrect answers. We evaluate multiple open
+and proprietary state-of-the-art LLMs, and find that their ability to perform
+multi-hop reasoning is affected, as indicated by a relative decrease of up to
+45% in F1 score when presented with such seemingly plausible alternatives. We
+conduct a deeper analysis and find evidence that while LLMs tend to ignore
+misleading lexical cues, misleading reasoning paths indeed present a
+significant challenge.
+
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ☆ OneGen: Efficient One-Pass Unified Generation and Retrieval for LLMs + + +
+ Despite the recent advancements in Large Language Models (LLMs), which have +significantly enhanced the generative capabilities for various NLP tasks, LLMs +still face limitations in directly handling retrieval tasks. However, many +practical applications demand the seamless integration of both retrieval and +generation. This paper introduces a novel and efficient One-pass Generation and +retrieval framework (OneGen), designed to improve LLMs' performance on tasks +that require both generation and retrieval. The proposed framework bridges the +traditionally separate training approaches for generation and retrieval by +incorporating retrieval tokens generated autoregressively. This enables a +single LLM to handle both tasks simultaneously in a unified forward pass. We +conduct experiments on two distinct types of composite tasks, RAG and Entity +Linking, to validate the pluggability, effectiveness, and efficiency of OneGen +in training and inference. Furthermore, our results show that integrating +generation and retrieval within the same context preserves the generative +capabilities of LLMs while improving retrieval performance. To the best of our +knowledge, OneGen is the first to enable LLMs to conduct vector retrieval +during the generation. + +
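+
+ A minimal sketch of the core mechanism described above: hidden states at
+special retrieval tokens emitted during generation are reused directly as dense
+queries against a document index. Tensor names and the cosine scoring are
+assumptions, not the paper's exact design.
+
+    import torch
+
+    def retrieve_at_special_tokens(hidden_states, token_ids, retrieval_token_id,
+                                   doc_embeddings, k=3):
+        """For each retrieval token in the generated sequence, score documents with
+        the hidden state at that position and return the top-k document indices."""
+        positions = (token_ids == retrieval_token_id).nonzero(as_tuple=True)[0]
+        results = []
+        for pos in positions:
+            query = hidden_states[pos]                       # (hidden_dim,)
+            sims = torch.nn.functional.cosine_similarity(
+                doc_embeddings, query.unsqueeze(0), dim=1)   # (num_docs,)
+            results.append(sims.topk(k).indices)
+        return results
+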
+
+ comment: Work in progress; code is available at + https://github.com/zjunlp/OneGen +
+
+
+
+
+ + ☆ Better Spanish Emotion Recognition In-the-wild: Bringing Attention to + Deep Spectrum Voice Analysis + + +
+ Within the context of creating new Socially Assistive Robots, emotion
+recognition has become a key development factor, as it allows the robot to
+adapt to the user's emotional state in the wild. In this work, we focused on
+the analysis of two voice recording Spanish datasets: ELRA-S0329 and
+EmoMatchSpanishDB. Specifically, we centered our work on paralanguage, e.g.,
+the vocal characteristics that go along with the message and clarify its
+meaning. We proposed the use of the DeepSpectrum method, which consists of
+extracting a visual representation of the audio tracks and feeding them to a
+pretrained CNN model. For the classification task, DeepSpectrum is often paired
+with a Support Vector Classifier --DS-SVC--, or a Fully-Connected deep-learning
+classifier --DS-FC--. We compared the results of the DS-SVC and DS-FC
+architectures with the state-of-the-art (SOTA) for ELRA-S0329 and
+EmoMatchSpanishDB. Moreover, we proposed our own classifier based upon
+Attention Mechanisms, namely DS-AM. We trained all models against both
+datasets, and we found that our DS-AM model outperforms the SOTA models for the
+datasets and the SOTA DeepSpectrum architectures. Finally, we trained our DS-AM
+model on one dataset and tested it on the other, to simulate real-world
+conditions and assess how biased the model is to the dataset.
+
+
+
+
+
+ + ☆ READoc: A Unified Benchmark for Realistic Document Structured Extraction + + +
+ Document Structured Extraction (DSE) aims to extract structured content from +raw documents. Despite the emergence of numerous DSE systems, their unified +evaluation remains inadequate, significantly hindering the field's advancement. +This problem is largely attributed to existing benchmark paradigms, which +exhibit fragmented and localized characteristics. To address these limitations +and offer a thorough evaluation of DSE systems, we introduce a novel benchmark +named READoc, which defines DSE as a realistic task of converting unstructured +PDFs into semantically rich Markdown. The READoc dataset is derived from 2,233 +diverse and real-world documents from arXiv and GitHub. In addition, we develop +a DSE Evaluation S$^3$uite comprising Standardization, Segmentation and Scoring +modules, to conduct a unified evaluation of state-of-the-art DSE approaches. By +evaluating a range of pipeline tools, expert visual models, and general VLMs, +we identify the gap between current work and the unified, realistic DSE +objective for the first time. We aspire that READoc will catalyze future +research in DSE, fostering more comprehensive and practical solutions. + +
+
+
+
+
+ + ☆ MHS-STMA: Multimodal Hate Speech Detection via Scalable + Transformer-Based Multilevel Attention Framework + + +
+ Social media has a significant impact on people's lives. Hate speech on +social media has emerged as one of society's most serious issues recently. Text +and pictures are two forms of multimodal data distributed within articles. +Unimodal analysis has been the primary emphasis of earlier approaches. +Additionally, when doing multimodal analysis, researchers neglect to preserve +the distinctive qualities associated with each modality. The present article +suggests a scalable architecture for multimodal hate content detection called +transformer-based multilevel attention (STMA) to address these shortcomings. +This architecture consists of three main parts: a combined attention-based deep +learning mechanism, a vision attention mechanism encoder, and a caption +attention-mechanism encoder. To identify hate content, each component uses +various attention processes and uniquely handles multimodal data. Several +studies employing multiple assessment criteria on three hate speech datasets: +Hateful memes, MultiOff, and MMHS150K, validate the suggested architecture's +efficacy. The outcomes demonstrate that on all three datasets, the suggested +strategy performs better than the baseline approaches. + +
+
+
+
+
+ + ☆ Hate Content Detection via Novel Pre-Processing Sequencing and Ensemble + Methods + + +
+ Social media, particularly Twitter, has seen a significant increase in +incidents like trolling and hate speech. Thus, identifying hate speech is the +need of the hour. This paper introduces a computational framework to curb the +hate content on the web. Specifically, this study presents an exhaustive study +of pre-processing approaches by studying the impact of changing the sequence of +text pre-processing operations for the identification of hate content. The +best-performing pre-processing sequence, when implemented with popular +classification approaches like Support Vector Machine, Random Forest, Decision +Tree, Logistic Regression and K-Neighbor provides a considerable boost in +performance. Additionally, the best pre-processing sequence is used in +conjunction with different ensemble methods, such as bagging, boosting and +stacking to improve the performance further. Three publicly available benchmark +datasets (WZ-LS, DT, and FOUNTA), were used to evaluate the proposed approach +for hate speech identification. The proposed approach achieves a maximum +accuracy of 95.14% highlighting the effectiveness of the unique pre-processing +approach along with an ensemble classifier. + +
+
+
+
+
+ + ☆ WaterSeeker: Efficient Detection of Watermarked Segments in Large + Documents + + +
+ Watermarking algorithms for large language models (LLMs) have attained high +accuracy in detecting LLM-generated text. However, existing methods primarily +focus on distinguishing fully watermarked text from non-watermarked text, +overlooking real-world scenarios where LLMs generate only small sections within +large documents. In this scenario, balancing time complexity and detection +performance poses significant challenges. This paper presents WaterSeeker, a +novel approach to efficiently detect and locate watermarked segments amid +extensive natural text. It first applies an efficient anomaly extraction method +to preliminarily locate suspicious watermarked regions. Following this, it +conducts a local traversal and performs full-text detection for more precise +verification. Theoretical analysis and experimental results demonstrate that +WaterSeeker achieves a superior balance between detection accuracy and +computational efficiency. Moreover, WaterSeeker's localization ability supports +the development of interpretable AI detection systems. This work pioneers a new +direction in watermarked segment detection, facilitating more reliable +AI-generated content identification. + +
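+
+ A minimal sketch of the coarse-to-fine pattern described above, assuming
+per-token watermark scores are already available; window size, threshold, and
+padding are illustrative, and the precise full-text verification step is left
+to a complete watermark detector.
+
+    import numpy as np
+
+    def locate_candidate_segments(token_scores, window=50, z_threshold=2.0, pad=20):
+        """Flag windows whose average score is anomalously high, then merge and pad
+        them into candidate regions for precise watermark verification."""
+        s = np.asarray(token_scores, dtype=float)
+        mean, std = s.mean(), s.std() + 1e-8
+        window_means = np.convolve(s, np.ones(window) / window, mode="valid")
+        z = (window_means - mean) / (std / np.sqrt(window))
+        regions = []
+        for i in np.flatnonzero(z > z_threshold):
+            lo, hi = max(0, i - pad), min(len(s), i + window + pad)
+            if regions and lo <= regions[-1][1]:
+                regions[-1] = (regions[-1][0], hi)  # merge overlapping candidates
+            else:
+                regions.append((lo, hi))
+        return regions
+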
+
+ comment: 18 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ EdaCSC: Two Easy Data Augmentation Methods for Chinese Spelling + Correction + + +
+ Chinese Spelling Correction (CSC) aims to detect and correct spelling errors
+in Chinese sentences caused by phonetic or visual similarities. While current
+CSC models integrate pinyin or glyph features and have shown significant
+progress, they still face challenges when dealing with sentences containing
+multiple typos and are susceptible to overcorrection in real-world scenarios.
+In contrast to existing model-centric approaches, we propose two data
+augmentation methods to address these limitations. Firstly, we augment the
+dataset by either splitting long sentences into shorter ones or reducing typos
+in sentences with multiple typos. Subsequently, we employ different training
+processes to select the optimal model. Experimental evaluations on the SIGHAN
+benchmarks demonstrate the superiority of our approach over most existing
+models, achieving state-of-the-art performance on the SIGHAN15 test set.
+
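+
+ A minimal sketch of the two augmentations described above for aligned
+(typo, correction) character pairs; the separator, length threshold, and random
+sampling are assumptions about the general recipe rather than the paper's exact
+procedure.
+
+    import random
+
+    def split_long_pair(src, tgt, sep="，", max_len=30):
+        """Split a long aligned pair at punctuation so each piece carries fewer typos."""
+        if len(src) != len(tgt) or len(src) <= max_len:
+            return [(src, tgt)]
+        pieces, start = [], 0
+        for i, ch in enumerate(tgt):
+            if ch == sep or i == len(tgt) - 1:
+                pieces.append((src[start:i + 1], tgt[start:i + 1]))
+                start = i + 1
+        return pieces
+
+    def reduce_typos(src, tgt, keep=1):
+        """Keep at most `keep` typos by copying corrected characters back for the rest."""
+        typo_positions = [i for i, (a, b) in enumerate(zip(src, tgt)) if a != b]
+        drop = set(random.sample(typo_positions, max(0, len(typo_positions) - keep)))
+        new_src = "".join(tgt[i] if i in drop else c for i, c in enumerate(src))
+        return new_src, tgt
+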
+
+ comment: 18 pages, 2 figures +
+
+
+
+
+ + ☆ LLM-based Abstraction and Concretization for GUI Test Migration + + +
+ GUI test migration aims to produce test cases with events and assertions to +test specific functionalities of a target app. Existing migration approaches +typically focus on the widget-mapping paradigm that maps widgets from source +apps to target apps. However, since different apps may implement the same +functionality in different ways, direct mapping may result in incomplete or +buggy test cases, thus significantly impacting the effectiveness of testing +target functionality and the practical applicability. + In this paper, we propose a new migration paradigm (i.e., +abstraction-concretization paradigm) that first abstracts the test logic for +the target functionality and then utilizes this logic to generate the concrete +GUI test case. Furthermore, we introduce MACdroid, the first approach that +migrates GUI test cases based on this paradigm. Specifically, we propose an +abstraction technique that utilizes source test cases from source apps +targeting the same functionality to extract a general test logic for that +functionality. Then, we propose a concretization technique that utilizes the +general test logic to guide an LLM in generating the corresponding GUI test +case (including events and assertions) for the target app. We evaluate MACdroid +on two widely-used datasets (including 31 apps, 34 functionalities, and 123 +test cases). On the FrUITeR dataset, the test cases generated by MACdroid +successfully test 64% of the target functionalities, improving the baselines by +191%. On the Lin dataset, MACdroid successfully tests 75% of the target +functionalities, outperforming the baselines by 42%. These results underscore +the effectiveness of MACdroid in GUI test migration. + +
+
+
+
+
+ + ☆ Vision-fused Attack: Advancing Aggressive and Stealthy Adversarial Text + against Neural Machine Translation IJCAI 2024 + + +
+ While neural machine translation (NMT) models achieve success in our daily +lives, they show vulnerability to adversarial attacks. Despite being harmful, +these attacks also offer benefits for interpreting and enhancing NMT models, +thus drawing increased research attention. However, existing studies on +adversarial attacks are insufficient in both attacking ability and human +imperceptibility due to their sole focus on the scope of language. This paper +proposes a novel vision-fused attack (VFA) framework to acquire powerful +adversarial text, i.e., more aggressive and stealthy. Regarding the attacking +ability, we design the vision-merged solution space enhancement strategy to +enlarge the limited semantic solution space, which enables us to search for +adversarial candidates with higher attacking ability. For human +imperceptibility, we propose the perception-retained adversarial text selection +strategy to align the human text-reading mechanism. Thus, the finally selected +adversarial text could be more deceptive. Extensive experiments on various +models, including large language models (LLMs) like LLaMA and GPT-3.5, strongly +support that VFA outperforms the comparisons by large margins (up to 81%/14% +improvements on ASR/SSIM). + +
+
+ comment: IJCAI 2024 +
+
+
+
+
+ + ☆ Towards Patronizing and Condescending Language in Chinese Videos: A + Multimodal Dataset and Detector ICASSP 2025 + + +
+ Patronizing and Condescending Language (PCL) is a form of discriminatory +toxic speech targeting vulnerable groups, threatening both online and offline +safety. While toxic speech research has mainly focused on overt toxicity, such +as hate speech, microaggressions in the form of PCL remain underexplored. +Additionally, dominant groups' discriminatory facial expressions and attitudes +toward vulnerable communities can be more impactful than verbal cues, yet these +frame features are often overlooked. In this paper, we introduce the PCLMM +dataset, the first Chinese multimodal dataset for PCL, consisting of 715 +annotated videos from Bilibili, with high-quality PCL facial frame spans. We +also propose the MultiPCL detector, featuring a facial expression detection +module for PCL recognition, demonstrating the effectiveness of modality +complementarity in this challenging task. Our work makes an important +contribution to advancing microaggression detection within the domain of toxic +speech. + +
+
+ comment: Under review in ICASSP 2025 +
+
+
+
+
+ + ☆ InstInfer: In-Storage Attention Offloading for Cost-Effective + Long-Context LLM Inference + + +
+ The widespread adoption of Large Language Models (LLMs) marks a significant
+milestone in generative AI. Nevertheless, the increasing context length and
+batch size in offline LLM inference escalate the memory requirement of the
+key-value (KV) cache, which imposes a huge burden on the GPU VRAM, especially
+for resource-constrained scenarios (e.g., edge computing and personal devices).
+Several cost-effective solutions leverage host memory or SSDs to reduce storage
+costs for offline inference scenarios and improve the throughput. Nevertheless,
+they suffer from significant performance penalties imposed by intensive KV
+cache accesses due to limited PCIe bandwidth. To address these issues, we
+propose InstInfer, a novel LLM inference system that offloads the most
+performance-critical computation (i.e., attention in the decoding phase) and
+data (i.e., the KV cache) to Computational Storage Drives (CSDs), which
+minimizes the enormous KV transfer overheads. InstInfer designs a dedicated
+flash-aware in-storage attention engine with KV cache management mechanisms to
+exploit the high internal bandwidths of CSDs instead of being limited by the
+PCIe bandwidth. The optimized P2P transmission between GPU and CSDs further
+reduces data migration overheads. Experimental results demonstrate that for a
+13B model using an NVIDIA A6000 GPU, InstInfer improves throughput for
+long-sequence inference by up to 11.1$\times$, compared to existing SSD-based
+solutions such as FlexGen.
+
+
+
+
+
+ + ☆ Evaluation of Google Translate for Mandarin Chinese translation using + sentiment and semantic analysis + + +
+ Machine translation using large language models (LLMs) is having a
+significant global impact, making communication easier. Mandarin Chinese is the
+official language used for communication by the government, education
+institutes, and media in China. In this study, we provide an automated
+assessment of machine translation models with human experts using sentiment and
+semantic analysis. In order to demonstrate our framework, we select the classic
+early twentieth-century novel 'The True Story of Ah Q' along with selected
+Mandarin Chinese to English translations. We also use Google Translate to
+translate the given text into English and then conduct a chapter-wise sentiment
+analysis and semantic analysis to compare the extracted sentiments across the
+different translations. We utilise LLMs for semantic and sentiment analysis.
+Our results indicate that the precision of Google Translate differs both in
+terms of semantic and sentiment analysis when compared to human expert
+translations. We find that Google Translate is unable to translate some of the
+specific words or phrases in Chinese, such as Chinese traditional allusions.
+The mistranslations are due to its lack of contextual significance and
+historical knowledge of China. Thus, this framework provides new insights into
+machine translation for Mandarin Chinese. Future work can explore other
+languages or types of texts with this framework.
+
+
+
+
+
+ + ♻ ☆ Knowledge-Aware Conversation Derailment Forecasting Using Graph + Convolutional Networks + + +
+ Online conversations are particularly susceptible to derailment, which can
+manifest itself in the form of toxic communication patterns including
+disrespectful comments and abuse. Forecasting conversation derailment predicts
+signs of derailment in advance, enabling proactive moderation of conversations.
+State-of-the-art approaches to conversation derailment forecasting sequentially
+encode conversations and use graph neural networks to model dialogue user
+dynamics. However, existing graph models are not able to capture complex
+conversational characteristics such as context propagation and emotional
+shifts. The use of common sense knowledge enables a model to capture such
+characteristics, thus improving performance. Following this approach, here we
+derive commonsense statements from a knowledge base of dialogue contextual
+information to enrich a graph neural network classification architecture. We
+fuse the multi-source information on each utterance into capsules, which are
+used by a transformer-based forecaster to predict conversation derailment. Our
+model captures conversation dynamics and context propagation, outperforming the
+state-of-the-art models on the CGA and CMV benchmark datasets.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2306.12982; + text overlap with arXiv:2106.01071 by other authors +
+
+
+
+
+ + ♻ ☆ Self-Reflection in LLM Agents: Effects on Problem-Solving Performance + + +
+ In this study, we investigated the effects of self-reflection in large +language models (LLMs) on problem-solving performance. We instructed nine +popular LLMs to answer a series of multiple-choice questions to provide a +performance baseline. For each incorrectly answered question, we instructed +eight types of self-reflecting LLM agents to reflect on their mistakes and +provide themselves with guidance to improve problem-solving. Then, using this +guidance, each self-reflecting agent attempted to re-answer the same questions. +Our results indicate that LLM agents are able to significantly improve their +problem-solving performance through self-reflection ($p < 0.001$). In addition, +we compared the various types of self-reflection to determine their individual +contribution to performance. All code and data are available on GitHub at +https://github.com/matthewrenze/self-reflection + +
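+
+ A minimal sketch of the reflect-then-retry loop described above; ask_llm
+stands for any text-in/text-out model call, and the prompt wording is an
+assumption rather than the study's exact prompts.
+
+    def answer_with_self_reflection(ask_llm, question, choices, correct_answer):
+        """Answer once; if wrong, ask the model to reflect on its mistake and retry
+        using its own guidance."""
+        prompt = f"Question: {question}\nChoices: {choices}\nAnswer with one choice."
+        first = ask_llm(prompt)
+        if first.strip() == correct_answer:
+            return first
+        reflection = ask_llm(
+            f"You answered '{first}' to the question below and it was incorrect.\n"
+            f"{question}\nExplain what went wrong and give yourself guidance to retry."
+        )
+        return ask_llm(f"{prompt}\nGuidance from your reflection:\n{reflection}")
+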
+
+
+
+
+ + ♻ ☆ ProGRes: Prompted Generative Rescoring on ASR n-Best + + +
+ Large Language Models (LLMs) have shown their ability to improve the +performance of speech recognizers by effectively rescoring the n-best +hypotheses generated during the beam search process. However, the best way to +exploit recent generative instruction-tuned LLMs for hypothesis rescoring is +still unclear. This paper proposes a novel method that uses instruction-tuned +LLMs to dynamically expand the n-best speech recognition hypotheses with new +hypotheses generated through appropriately-prompted LLMs. Specifically, we +introduce a new zero-shot method for ASR n-best rescoring, which combines +confidence scores, LLM sequence scoring, and prompt-based hypothesis +generation. We compare Llama-3-Instruct, GPT-3.5 Turbo, and GPT-4 Turbo as +prompt-based generators with Llama-3 as sequence scorer LLM. We evaluated our +approach using different speech recognizers and observed significant relative +improvement in the word error rate (WER) ranging from 5% to 25%. + +
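+
+ A minimal sketch of the overall recipe described above; llm_generate and
+llm_score stand in for the prompt-based generator and the sequence-scoring LLM,
+and the interpolation weight and floor score are assumptions.
+
+    def prompted_generative_rescoring(nbest, asr_scores, llm_score, llm_generate,
+                                      alpha=0.5, k=3):
+        """Expand the ASR n-best with LLM-generated corrections, then rank hypotheses
+        by a combination of ASR confidence and LLM sequence score."""
+        prompt = ("These are speech recognition hypotheses:\n" + "\n".join(nbest)
+                  + "\nPropose corrected transcriptions:")
+        expanded = list(nbest) + list(llm_generate(prompt, num_return=k))
+        floor = min(asr_scores.values())      # generated hypotheses get a floor ASR score
+        combined = {h: alpha * asr_scores.get(h, floor) + (1 - alpha) * llm_score(h)
+                    for h in expanded}
+        return max(combined, key=combined.get)
+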
+
+ comment: IEEE Spoken Language Technology Workshop +
+
+
+
+
+ + ♻ ☆ Using LLMs to Establish Implicit User Sentiment of Software Desirability + + +
+ This study explores the use of LLMs for providing quantitative zero-shot +sentiment analysis of implicit software desirability, addressing a critical +challenge in product evaluation where traditional review scores, though +convenient, fail to capture the richness of qualitative user feedback. +Innovations include establishing a method that 1) works with qualitative user +experience data without the need for explicit review scores, 2) focuses on +implicit user satisfaction, and 3) provides scaled numerical sentiment +analysis, offering a more nuanced understanding of user sentiment, instead of +simply classifying sentiment as positive, neutral, or negative. + Data is collected using the Microsoft Product Desirability Toolkit (PDT), a +well-known qualitative user experience analysis tool. For initial exploration, +the PDT metric was given to users of two software systems. PDT data was fed +through several LLMs (Claude Sonnet 3 and 3.5, GPT4, and GPT4o) and through a +leading transfer learning technique, Twitter-Roberta-Base-Sentiment, and Vader, +a leading sentiment analysis tool. Each system was asked to evaluate the data +in two ways, by looking at the sentiment expressed in the PDT word/explanation +pairs; and by looking at the sentiment expressed by the users in their grouped +selection of five words and explanations, as a whole. Each LLM provided a +sentiment score, its confidence (low, medium, high) in the score, and an +explanation of the score. + All LLMs tested were able to statistically detect user sentiment from the +users' grouped data, whereas TRBS and Vader were not. The confidence and +explanation of confidence provided by the LLMs assisted in understanding user +sentiment. This study adds deeper understanding of evaluating user experiences, +toward the goal of creating a universal tool that quantifies implicit +sentiment. + +
+
+ comment: 6 pages, 2 figures, 2 tables, updated to incorporate feedback +
+
+
+
+
+ + ♻ ☆ Fact-and-Reflection (FaR) Improves Confidence Calibration of Large + Language Models + + +
+ For an LLM to be trustworthy, its confidence level should be well-calibrated +with its actual performance. While it is now common sense that LLM performances +are greatly impacted by prompts, the confidence calibration in prompting LLMs +has yet to be thoroughly explored. In this paper, we explore how different +prompting strategies influence LLM confidence calibration and how it could be +improved. We conduct extensive experiments on six prompting methods in the +question-answering context and we observe that, while these methods help +improve the expected LLM calibration, they also trigger LLMs to be +over-confident when responding to some instances. Inspired by human cognition, +we propose Fact-and-Reflection (FaR) prompting, which improves the LLM +calibration in two steps. First, FaR elicits the known "facts" that are +relevant to the input prompt from the LLM. Then, it asks the model to +"reflect" over them to generate the final answer. Experiments show that FaR +prompting achieves significantly better calibration; it lowers the Expected +Calibration Error by 23.5% on our multi-purpose QA tasks. Notably, FaR +prompting even elicits the capability of verbally expressing concerns in less +confident scenarios, which helps trigger retrieval augmentation for solving +these harder instances. + +
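A rough sketch of the two-step FaR flow (elicit facts, then reflect to answer). The prompt wording and the `ask_llm` helper are assumptions, not the paper's exact prompts.

```python
# Two-step Fact-and-Reflection prompting sketch; `ask_llm(prompt) -> str` is a
# placeholder chat-completion call and the prompt wording is illustrative.
def far_answer(question, ask_llm):
    # Step 1: elicit the facts the model considers relevant to the question.
    facts = ask_llm(f"List the known facts relevant to answering:\n{question}")
    # Step 2: reflect over those facts to produce the final, calibrated answer.
    return ask_llm(
        f"Question: {question}\nRelevant facts:\n{facts}\n"
        "Reflect on these facts step by step, then give a final answer and "
        "state how confident you are."
    )
```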
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ SS-GEN: A Social Story Generation Framework with Large Language Models + + +
+ Children with Autism Spectrum Disorder (ASD) often misunderstand social +situations and struggle to participate in daily routines. Social Stories are +traditionally crafted by psychology experts under strict constraints to address +these challenges but are costly and limited in diversity. As Large Language +Models (LLMs) advance, there's an opportunity to develop more automated, +affordable, and accessible methods to generate Social Stories in real-time with +broad coverage. However, adapting LLMs to meet the unique and strict +constraints of Social Stories is a challenging issue. To this end, we propose +\textbf{SS-GEN}, a \textbf{S}ocial \textbf{S}tory \textbf{GEN}eration framework +with LLMs. Firstly, we develop a constraint-driven sophisticated strategy named +\textbf{\textsc{StarSow}} to hierarchically prompt LLMs to generate Social +Stories at scale, followed by rigorous human filtering to build a high-quality +dataset. Additionally, we introduce \textbf{quality assessment criteria} to +evaluate the effectiveness of these generated stories. Considering that +powerful closed-source large models require very complex instructions and +expensive API fees, we finally fine-tune smaller language models with our +curated high-quality dataset, achieving comparable results at lower costs and +with simpler instructions and deployment. This work marks a significant step in +leveraging AI to personalize Social Stories cost-effectively for autistic +children at scale, which we hope can encourage future research. The prompt, +code and data will be released in the \texttt{Technical Appendix} and \texttt{Code +\& Data Appendix} at \url{https://github.com/MIMIFY/SS-GEN}. + +
+
+
+
+
+ + ♻ ☆ Predictability maximization and the origins of word order harmony + + +
+ We address the linguistic problem of the sequential arrangement of a head and +its dependents from an information theoretic perspective. In particular, we +consider the optimal placement of a head that maximizes the predictability of +the sequence. We assume that dependents are statistically independent given a +head, in line with the open-choice principle and the core assumptions of +dependency grammar. We demonstrate the optimality of harmonic order, i.e., +placing the head last maximizes the predictability of the head whereas placing +the head first maximizes the predictability of dependents. We also show that +postponing the head is the optimal strategy to maximize its predictability +while bringing it forward is the optimal strategy to maximize the +predictability of dependents. We unravel the advantages of the strategy of +maximizing the predictability of the head over maximizing the predictability of +dependents. Our findings shed light on the placements of the head adopted by +real languages or emerging in different kinds of experiments. + +
+
+ comment: Local reorganization of the text; many typos corrected +
+
+
+
+
+ + ♻ ☆ Data Alignment for Zero-Shot Concept Generation in Dermatology AI ICLR 2024 + + +
+ AI in dermatology is evolving at a rapid pace, but the major limitation to +training trustworthy classifiers is the scarcity of data with ground-truth +concept level labels, which are meta-labels semantically meaningful to humans. +Foundation models like CLIP, which provide zero-shot capabilities, can help alleviate +this challenge by leveraging vast amounts of image-caption pairs available on +the internet. CLIP can be fine-tuned using domain-specific image-caption pairs +to improve classification performance. However, CLIP's pre-training data is not +well-aligned with the medical jargon that clinicians use to perform diagnoses. +The development of large language models (LLMs) in recent years has led to the +possibility of leveraging the expressive nature of these models to generate +rich text. Our goal is to use these models to generate caption text that aligns +well with both the clinical lexicon and with the natural human language used in +CLIP's pre-training data. Starting with captions used for images in PubMed +articles, we extend them by passing the raw captions through an LLM fine-tuned +on several of the field's textbooks. We find that using captions generated by an +expressive fine-tuned LLM like GPT-3.5 improves downstream zero-shot concept +classification performance. + +
+
+ comment: Accepted as a workshop paper to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Distributed Speculative Inference of Large Language Models is Provably + Faster + + +
+ Accelerating the inference of large language models (LLMs) is an important +challenge in artificial intelligence. This paper introduces Distributed +Speculative Inference (DSI), a novel distributed inference algorithm that is +provably faster than speculative inference (SI) +[leviathan2023fast,chen2023accelerating,miao2023specinfer] and traditional +autoregressive inference (non-SI). Like other SI algorithms, DSI works on +frozen LLMs, requiring no training or architectural modifications, and it +preserves the target distribution. Prior studies on SI have demonstrated +empirical speedups (compared to non-SI) but require fast and accurate drafters, +which are often unavailable in practice. We identify a gap where SI can be +slower than non-SI given slower or less accurate drafters. We close this gap by +proving that DSI is faster than both SI and non-SI--given any drafters. DSI +introduces a novel type of task parallelism called Speculation Parallelism +(SP), which orchestrates target and drafter instances to overlap in time, +creating a new foundational tradeoff between computational resources and +latency. DSI is not only faster than SI but also supports LLMs that cannot be +accelerated with SI. Our simulations show speedups of off-the-shelf LLMs in +realistic single-node settings where DSI is 1.29-1.92x faster than SI. + +
+
+
+
+
+ + ♻ ☆ GEGA: Graph Convolutional Networks and Evidence Retrieval Guided + Attention for Enhanced Document-level Relation Extraction + + +
+ Document-level relation extraction (DocRE) aims to extract relations between +entities from unstructured document text. Compared to sentence-level relation +extraction, it requires more complex semantic understanding from a broader text +context. Currently, some studies are utilizing logical rules within evidence +sentences to enhance the performance of DocRE. However, in the data without +provided evidence sentences, researchers often obtain a list of evidence +sentences for the entire document through evidence retrieval (ER). Therefore, +DocRE suffers from two challenges: firstly, the relevance between evidence and +entity pairs is weak; secondly, there is insufficient extraction of complex +cross-relations between long-distance multi-entities. To overcome these +challenges, we propose GEGA, a novel model for DocRE. The model leverages graph +neural networks to construct multiple weight matrices, guiding attention +allocation to evidence sentences. It also employs multi-scale representation +aggregation to enhance ER. Subsequently, we integrate the most efficient +evidence information to implement both fully supervised and weakly supervised +training processes for the model. We evaluate the GEGA model on three widely +used benchmark datasets: DocRED, Re-DocRED, and Revisit-DocRED. The +experimental results indicate that our model has achieved comprehensive +improvements compared to the existing SOTA model. + +
+
+
+
+
+ + ♻ ☆ T2VSafetyBench: Evaluating the Safety of Text-to-Video Generative Models + + +
+ The recent development of Sora leads to a new era in text-to-video (T2V) +generation. Along with this comes the rising concern about its security risks. +The generated videos may contain illegal or unethical content, and there is a +lack of comprehensive quantitative understanding of their safety, posing a +challenge to their reliability and practical deployment. Previous evaluations +primarily focus on the quality of video generation. While some evaluations of +text-to-image models have considered safety, they cover fewer aspects and do +not address the unique temporal risk inherent in video generation. To bridge +this research gap, we introduce T2VSafetyBench, a new benchmark designed for +conducting safety-critical assessments of text-to-video models. We define 12 +critical aspects of video generation safety and construct a malicious prompt +dataset including real-world prompts, LLM-generated prompts and jailbreak +attack-based prompts. Based on our evaluation results, we draw several +important findings, including: 1) no single model excels in all aspects, with +different models showing various strengths; 2) the correlation between GPT-4 +assessments and manual reviews is generally high; 3) there is a trade-off +between the usability and safety of text-to-video generative models. This +indicates that as the field of video generation rapidly advances, safety risks +are set to surge, highlighting the urgency of prioritizing video safety. We +hope that T2VSafetyBench can provide insights for better understanding the +safety of video generation in the era of generative AI. + +
+
+
+
+
+ + ♻ ☆ No Train but Gain: Language Arithmetic for training-free Language + Adapters enhancement + + +
+ Modular deep learning is the state-of-the-art solution for lifting the curse +of multilinguality, preventing the impact of negative interference and enabling +cross-lingual performance in Multilingual Pre-trained Language Models. However, +a trade-off of this approach is the reduction in positive transfer learning +from closely related languages. In response, we introduce a novel method called +language arithmetic, which enables training-free post-processing to address +this limitation. Extending the task arithmetic framework, we apply learning via +addition to the language adapters, transitioning the framework from a +multi-task to a multilingual setup. The effectiveness of the proposed solution +is demonstrated on three downstream tasks in a MAD-X-based set of cross-lingual +schemes, acting as a post-processing procedure. Language arithmetic +consistently improves the baselines with significant gains, especially in the +most challenging case of zero-shot application. Our code and models are +available at https://github.com/mklimasz/language-arithmetic . + +
+
+
+
+
+ + ♻ ☆ DSLR: Document Refinement with Sentence-Level Re-ranking and + Reconstruction to Enhance Retrieval-Augmented Generation + + +
+ Recent advancements in Large Language Models (LLMs) have significantly +improved their performance across various Natural Language Processing (NLP) +tasks. However, LLMs still struggle with generating non-factual responses due +to limitations in their parametric memory. Retrieval-Augmented Generation (RAG) +systems address this issue by incorporating external knowledge with a retrieval +module. Despite their successes, however, current RAG systems face challenges +with retrieval failures and the limited ability of LLMs to filter out +irrelevant information. Therefore, in this work, we propose DSLR (Document +Refinement with Sentence-Level Re-ranking and Reconstruction), an unsupervised +framework that decomposes retrieved documents into sentences, filters out +irrelevant sentences, and reconstructs them again into coherent passages. We +experimentally validate DSLR on multiple open-domain QA datasets and the +results demonstrate that DSLR significantly enhances the RAG performance over +conventional fixed-size passage. Furthermore, our DSLR enhances performance in +specific, yet realistic scenarios without the need for additional training, +providing an effective and efficient solution for refining retrieved documents +in RAG systems. + +
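A sentence-level refine step in the spirit of DSLR could look roughly like the sketch below. The cross-encoder reranker from `sentence-transformers`, the naive sentence splitter, and the score threshold are illustrative assumptions, not the authors' configuration.

```python
# Sketch: decompose retrieved passages into sentences, keep only sentences
# relevant to the query, and reconstruct a refined passage. The reranker model,
# the naive sentence splitter, and the threshold are illustrative choices.
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def refine(query, passages, threshold=0.0):
    sentences = [s.strip() for p in passages for s in p.split(". ") if s.strip()]
    scores = reranker.predict([(query, s) for s in sentences])
    kept = [s for s, sc in zip(sentences, scores) if sc > threshold]
    return " ".join(kept)  # reconstructed, query-focused passage
```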
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Leveraging Large Language Models for Wireless Symbol Detection via + In-Context Learning + + +
+ Deep neural networks (DNNs) have made significant strides in tackling +challenging tasks in wireless systems, especially when an accurate wireless +model is not available. However, when available data is limited, traditional +DNNs often yield subpar results due to underfitting. At the same time, large +language models (LLMs) exemplified by GPT-3, have remarkably showcased their +capabilities across a broad range of natural language processing tasks. But +whether and how LLMs can benefit challenging non-language tasks in wireless +systems is unexplored. In this work, we propose to leverage the in-context +learning ability (a.k.a. prompting) of LLMs to solve wireless tasks in the low +data regime without any training or fine-tuning, unlike DNNs which require +training. We further demonstrate that the performance of LLMs varies +significantly when employed with different prompt templates. To solve this +issue, we employ the latest LLM calibration methods. Our results reveal that +using LLMs via ICL methods generally outperforms traditional DNNs on the symbol +demodulation task and yields highly confident predictions when coupled with +calibration techniques. + +
+
+ comment: Accepted at IEEE GLOBECOM 2024 +
+
+
+
+
+ + ♻ ☆ Tasks People Prompt: A Taxonomy of LLM Downstream Tasks in Software + Verification and Falsification Approaches + + +
+ Prompting has become one of the main approaches to leverage emergent +capabilities of Large Language Models [Brown et al. NeurIPS 2020, Wei et al. +TMLR 2022, Wei et al. NeurIPS 2022]. Recently, researchers and practitioners +have been "playing" with prompts (e.g., In-Context Learning) to see how to make +the most of pre-trained Language Models. By homogeneously dissecting more than +a hundred articles, we investigate how software testing and verification +research communities have leveraged LLMs capabilities. First, we validate that +downstream tasks are adequate to convey a nontrivial modular blueprint of +prompt-based proposals in scope. Moreover, we name and classify the concrete +downstream tasks we recover in both validation research papers and solution +proposals. In order to perform classification, mapping, and analysis, we also +develop a novel downstream-task taxonomy. The main taxonomy requirement is to +highlight commonalities while exhibiting variation points of task types that +enable pinpointing emerging patterns in a varied spectrum of Software +Engineering problems that encompasses testing, fuzzing, fault localization, +vulnerability detection, static analysis, and program verification approaches. +Avenues for future research are also discussed based on conceptual clusters +induced by the taxonomy. + +
+
+
+
+
+ + ♻ ☆ GuideWalk: A Novel Graph-Based Word Embedding for Enhanced Text + Classification + + +
+ One of the prime problems of computer science and machine learning is to +extract information efficiently from large-scale, heterogeneous data. Text +data, with its syntax, semantics, and even hidden information content, +possesses an exceptional place among the data types in concern. The processing +of the text data requires embedding, a method of translating the content of the +text to numeric vectors. A correct embedding algorithm is the starting point +for obtaining the full information content of the text data. In this work, a +new text embedding approach, namely the Guided Transition Probability Matrix +(GTPM) model is proposed. The model uses the graph structure of sentences to +capture different types of information from text data, such as syntactic, +semantic, and hidden content. Using random walks on a weighted word graph, GTPM +calculates transition probabilities to derive text embedding vectors. The +proposed method is tested with real-world data sets and eight well-known and +successful embedding algorithms. GTPM shows significantly better classification +performance for binary and multi-class datasets than well-known algorithms. +Additionally, the proposed method demonstrates superior robustness, maintaining +performance with limited (only $10\%$) training data, showing an $8\%$ decline +compared to $15-20\%$ for baseline methods. + +
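The random-walk ingredient can be illustrated with a toy sketch that builds a weighted word co-occurrence graph and row-normalizes it into a transition matrix; GTPM's guided weighting and the walk-to-embedding step are omitted, so this is only a simplified reading of the abstract, not the authors' implementation.

```python
# Toy sketch: weighted word graph from adjacent-word co-occurrence, row-normalized
# into a random-walk transition matrix. GTPM's guided weighting is omitted.
import numpy as np

def transition_matrix(sentences):
    vocab = sorted({w for s in sentences for w in s})
    idx = {w: i for i, w in enumerate(vocab)}
    W = np.zeros((len(vocab), len(vocab)))
    for s in sentences:
        for a, b in zip(s, s[1:]):        # adjacent words share a weighted edge
            W[idx[a], idx[b]] += 1.0
            W[idx[b], idx[a]] += 1.0
    rows = W.sum(axis=1, keepdims=True)
    P = np.divide(W, rows, out=np.zeros_like(W), where=rows > 0)
    return vocab, P                        # P[i, j]: probability of stepping i -> j
```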
+
+
+
+
+ + ♻ ☆ A Generative Marker Enhanced End-to-End Framework for Argument Mining + + +
+ Argument Mining (AM) involves identifying and extracting Argumentative +Components (ACs) and their corresponding Argumentative Relations (ARs). Most of +the prior works have broken down these tasks into multiple sub-tasks. Existing +end-to-end setups primarily use the dependency parsing approach. This work +introduces a generative paradigm-based end-to-end framework argTANL. argTANL +frames the argumentative structures into label-augmented text, called Augmented +Natural Language (ANL). This framework jointly extracts both ACs and ARs from a +given argumentative text. Additionally, this study explores the impact of +Argumentative and Discourse markers on enhancing the model's performance within +the proposed framework. Two distinct frameworks, Marker-Enhanced argTANL +(ME-argTANL) and argTANL with specialized Marker-Based Fine-Tuning, are +proposed to achieve this. Extensive experiments are conducted on three standard +AM benchmarks to demonstrate the superior performance of the ME-argTANL. + +
+
+
+
+
+ + ♻ ☆ Toward Understanding BERT-Like Pre-Training for DNA Foundation Models + + +
+ With the success of large-scale pre-training in language tasks, there is an +increasing trend of applying it to the domain of life sciences. In particular, +pre-training methods based on DNA sequences have received increasing attention +because of their potential to capture general information about genes. However, +existing pre-training methods for DNA sequences largely rely on direct +adoptions of BERT pre-training from NLP, lacking a comprehensive understanding +and a specifically tailored approach. To address this research gap, we provide +the first empirical study with three insightful observations. Based on the +empirical study, we notice that overlapping tokenizer can benefit the +fine-tuning of downstream tasks but leads to inadequate pre-training with fast +convergence. To unleash the pre-training potential, we introduce a novel +approach called RandomMask, which gradually increases the task difficulty of +BERT-like pre-training by continuously expanding its mask boundary, forcing the +model to learn more knowledge. RandomMask is simple but effective, achieving +state-of-the-art performance across 6 downstream tasks. RandomMask achieves a +staggering 68.16\% in Matthew's correlation coefficient for Epigenetic Mark +Prediction, a groundbreaking increase of 19.85\% over the baseline and a +remarkable 3.69\% improvement over the previous state-of-the-art result. + +
+
+
+
+
+ + ♻ ☆ CodeIP: A Grammar-Guided Multi-Bit Watermark for Large Language Models + of Code + + +
+ Large Language Models (LLMs) have achieved remarkable progress in code +generation. It now becomes crucial to identify whether the code is AI-generated +and to determine the specific model used, particularly for purposes such as +protecting Intellectual Property (IP) in industry and preventing cheating in +programming exercises. To this end, several attempts have been made to insert +watermarks into machine-generated code. However, existing approaches are +limited to inserting only a single bit of information or overly depending on +particular code patterns. In this paper, we introduce CodeIP, a novel multi-bit +watermarking technique that embeds additional information to preserve crucial +provenance details, such as the vendor ID of an LLM, thereby safeguarding the +IPs of LLMs in code generation. Furthermore, to ensure the syntactical +correctness of the generated code, we propose constraining the sampling process +for predicting the next token by training a type predictor. Experiments +conducted on a real-world dataset across five programming languages demonstrate +the effectiveness of CodeIP in watermarking LLMs for code generation while +maintaining the syntactical correctness of code. + +
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ TF-Attack: Transferable and Fast Adversarial Attacks on Large Language + Models + + +
+ With the great advancements in large language models (LLMs), adversarial +attacks against LLMs have recently attracted increasing attention. We found +that pre-existing adversarial attack methodologies exhibit limited +transferability and are notably inefficient, particularly when applied to LLMs. +In this paper, we analyze the core mechanisms of previous predominant +adversarial attack methods, revealing that 1) the distributions of importance +scores differ markedly among victim models, restricting transferability; 2) +the sequential attack process induces substantial time overheads. Based on +the above two insights, we introduce a new scheme, named TF-Attack, for +Transferable and Fast adversarial attacks on LLMs. TF-Attack employs an +external LLM as a third-party overseer rather than the victim model to identify +critical units within sentences. Moreover, TF-Attack introduces the concept of +Importance Level, which allows for parallel substitutions of attacks. We +conduct extensive experiments on 6 widely adopted benchmarks, evaluating the +proposed method through both automatic and human metrics. Results show that our +method consistently surpasses previous methods in transferability and delivers +significant speed improvements, up to 20 times faster than earlier attack +strategies. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Improving Retrieval-Augmented Generation in Medicine with Iterative + Follow-up Questions + + +
+ The emergent abilities of large language models (LLMs) have demonstrated +great potential in solving medical questions. They can possess considerable +medical knowledge, but may still hallucinate and are inflexible in the +knowledge updates. While Retrieval-Augmented Generation (RAG) has been proposed +to enhance the medical question-answering capabilities of LLMs with external +knowledge bases, it may still fail in complex cases where multiple rounds of +information-seeking are required. To address such an issue, we propose +iterative RAG for medicine (i-MedRAG), where LLMs can iteratively ask follow-up +queries based on previous information-seeking attempts. In each iteration of +i-MedRAG, the follow-up queries will be answered by a conventional RAG system +and they will be further used to guide the query generation in the next +iteration. Our experiments show the improved performance of various LLMs +brought by i-MedRAG compared with conventional RAG on complex questions from +clinical vignettes in the United States Medical Licensing Examination (USMLE), +as well as various knowledge tests in the Massive Multitask Language +Understanding (MMLU) dataset. Notably, our zero-shot i-MedRAG outperforms all +existing prompt engineering and fine-tuning methods on GPT-3.5, achieving an +accuracy of 69.68\% on the MedQA dataset. In addition, we characterize the +scaling properties of i-MedRAG with different iterations of follow-up queries +and different numbers of queries per iteration. Our case studies show that +i-MedRAG can flexibly ask follow-up queries to form reasoning chains, providing +an in-depth analysis of medical questions. To the best of our knowledge, this +is the first-of-its-kind study on incorporating follow-up queries into medical +RAG. + +
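The iterative follow-up loop can be sketched as below; `ask_llm` and `rag_answer` are placeholders for a chat-completion call and a conventional RAG system, and the prompt wording and iteration counts are illustrative rather than the paper's settings.

```python
# Sketch of an iterative follow-up-query loop; `ask_llm` and `rag_answer` are
# placeholders for a chat-completion call and a conventional RAG system.
def iterative_rag(question, ask_llm, rag_answer, n_iters=3, n_queries=2):
    notes = []  # accumulated (follow-up query, RAG answer) pairs
    for _ in range(n_iters):
        followups = ask_llm(
            f"Main question: {question}\nKnown so far: {notes}\n"
            f"Write {n_queries} follow-up queries, one per line, that would help."
        ).splitlines()[:n_queries]
        notes += [(q, rag_answer(q)) for q in followups if q.strip()]
    return ask_llm(
        f"Main question: {question}\nEvidence gathered: {notes}\nGive the final answer."
    )
```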
+
+
+
+
+ + ♻ ☆ Generalization Measures for Zero-Shot Cross-Lingual Transfer + + +
+ A model's capacity to generalize its knowledge to interpret unseen inputs +with different characteristics is crucial to build robust and reliable machine +learning systems. Language model evaluation tasks lack information metrics +about model generalization and their applicability in a new setting is measured +using task and language-specific downstream performance, which is often lacking +in many languages and tasks. In this paper, we explore a set of efficient and +reliable measures that could aid in computing more information related to the +generalization capability of language models in cross-lingual zero-shot +settings. In addition to traditional measures such as variance in parameters +after training and distance from initialization, we also measure the +effectiveness of sharpness in loss landscape in capturing the success in +cross-lingual transfer and propose a novel and stable algorithm to reliably +compute the sharpness of a model optimum that correlates to generalization. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 5 + +
+
+
+ + ☆ Mamba-Enhanced Text-Audio-Video Alignment Network for Emotion + Recognition in Conversations + + +
+ Emotion Recognition in Conversations (ERC) is a vital area within multimodal +interaction research, dedicated to accurately identifying and classifying the +emotions expressed by speakers throughout a conversation. Traditional ERC +approaches predominantly rely on unimodal cues, such as text, audio, or visual +data, leading to limitations in their effectiveness. These methods encounter +two significant challenges: 1) Consistency in multimodal information. Before +integrating various modalities, it is crucial to ensure that the data from +different sources is aligned and coherent. 2) Contextual information capture. +Successfully fusing multimodal features requires a keen understanding of the +evolving emotional tone, especially in lengthy dialogues where emotions may +shift and develop over time. To address these limitations, we propose a novel +Mamba-enhanced Text-Audio-Video alignment network (MaTAV) for the ERC task. +MaTAV has the advantages of aligning unimodal features to ensure +consistency across different modalities and handling long input sequences to +better capture contextual multimodal information. The extensive experiments on +the MELD and IEMOCAP datasets demonstrate that MaTAV significantly outperforms +existing state-of-the-art methods on the ERC task by a large margin. + +
+
+
+
+
+ + ☆ A Low-Computational Video Synopsis Framework with a Standard Dataset + + +
+ Video synopsis is an efficient method for condensing surveillance videos. +This technique begins with the detection and tracking of objects, followed by +the creation of object tubes. These tubes consist of sequences, each containing +chronologically ordered bounding boxes of a unique object. To generate a +condensed video, the first step involves rearranging the object tubes to +maximize the number of non-overlapping objects in each frame. Then, these tubes +are stitched to a background image extracted from the source video. The lack of +a standard dataset for the video synopsis task hinders the comparison of +different video synopsis models. This paper addresses this issue by introducing +a standard dataset, called SynoClip, designed specifically for the video +synopsis task. SynoClip includes all the necessary features needed to evaluate +various models directly and effectively. Additionally, this work introduces a +video synopsis model, called FGS, with low computational cost. The model +includes an empty-frame object detector to identify frames empty of any +objects, facilitating efficient utilization of the deep object detector. +Moreover, a tube grouping algorithm is proposed to maintain relationships among +tubes in the synthesized video. This is followed by a greedy tube rearrangement +algorithm, which efficiently determines the start time of each tube. Finally, +the proposed model is evaluated using the proposed dataset. The source code, +fine-tuned object detection model, and tutorials are available at +https://github.com/Ramtin-ma/VideoSynopsis-FGS. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Comparison of Two Augmentation Methods in Improving Detection Accuracy + of Hemarthrosis + + +
+ With the increase of computing power, machine learning models in medical +imaging have been introduced to help in rendering medical diagnosis and +inspection for conditions such as hemophilia, a rare disorder in which blood cannot clot +normally. Often, one of the bottlenecks in detecting hemophilia is the lack of +data available to train the algorithm to increase its accuracy. As a possible +solution, this research investigated whether introducing augmented data by data +synthesis or traditional augmentation techniques can improve model accuracy, +helping to diagnose the disease. To tackle this research question, features of +ultrasound images were extracted by the pre-trained VGG-16, and similarities +were compared using a cosine similarity measure based on the extracted features across +different distributions of real images, synthetic images, and augmented +images (Real vs. Real, Syn vs. Syn, Real vs. Different Batches of Syn, Real vs. +Augmentation Techniques). Model testing performance was investigated using +EfficientNet-B4 to recognize "blood" images with two augmentation methods. In +addition, a gradient-weighted class activation mapping (Grad-CAM) visualization +was used to interpret unexpected results such as a loss of accuracy. Synthetic +and real images do not show high similarity, with a mean similarity score of +0.4737. The synthetic batch 1 dataset and horizontally flipped images are more +similar to the original images. Classic augmentation techniques and data +synthesis can improve model accuracy, and data from traditional augmentation +techniques perform better than synthetic data. In addition, the +Grad-CAM heatmap showed that the loss of accuracy is due to a domain shift. Overall, this research found that the two augmentation methods, data +synthesis and traditional augmentation techniques, can both improve accuracy to +a certain extent, helping to diagnose rare diseases. + +
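The feature-similarity step (cosine similarity between VGG-16 features of real, synthetic, and augmented images) can be sketched as follows; the Keras VGG-16 extractor matches the abstract, while the average pooling and the mean-pairwise comparison are assumptions.

```python
# Sketch: extract VGG-16 features and compare two image sets by average pairwise
# cosine similarity. The "avg" pooling and the comparison choice are assumptions.
import numpy as np
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input

extractor = VGG16(weights="imagenet", include_top=False, pooling="avg")  # (N, 512)

def features(images):
    # images: float array of shape (N, 224, 224, 3), RGB, 0-255 range
    return extractor.predict(preprocess_input(images.astype("float32")), verbose=0)

def mean_cosine(a, b):
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return float((a @ b.T).mean())  # e.g. mean_cosine(features(real), features(syn))
```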
+
+
+
+
+ + ☆ A Survey on Mixup Augmentations and Beyond + + +
+ As Deep Neural Networks have achieved thrilling breakthroughs in the past +decade, data augmentations have garnered increasing attention as regularization +techniques when massive labeled data are unavailable. Among existing +augmentations, Mixup and relevant data-mixing methods that convexly combine +selected samples and the corresponding labels are widely adopted because they +yield high performances by generating data-dependent virtual data while easily +migrating to various domains. This survey presents a comprehensive review of +foundational mixup methods and their applications. We first elaborate on the +training pipeline with mixup augmentations as a unified framework containing +modules. A reformulated framework could contain various mixup methods and give +intuitive operational procedures. Then, we systematically investigate the +applications of mixup augmentations on vision downstream tasks, various data +modalities, and some analysis \& theorems of mixup. Meanwhile, we conclude the +current status and limitations of mixup research and point out further work for +effective and efficient mixup augmentations. This survey can provide +researchers with the current state of the art in mixup methods and provide some +insights and guidance roles in the mixup arena. An online project with this +survey is available at \url{https://github.com/Westlake-AI/Awesome-Mixup}. + +
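For readers unfamiliar with the base operation the survey builds on, vanilla mixup draws $\lambda \sim \mathrm{Beta}(\alpha, \alpha)$ and convexly combines two samples and their labels. A minimal NumPy sketch (illustrative background, not code from the survey's project) follows.

```python
# Vanilla mixup on a batch: x_mix = lam * x + (1 - lam) * x[perm], with the same
# convex combination applied to the one-hot labels.
import numpy as np

def mixup_batch(x, y_onehot, alpha=0.2, rng=None):
    rng = rng or np.random.default_rng()
    lam = rng.beta(alpha, alpha)
    perm = rng.permutation(len(x))
    x_mix = lam * x + (1.0 - lam) * x[perm]
    y_mix = lam * y_onehot + (1.0 - lam) * y_onehot[perm]
    return x_mix, y_mix, lam
```

The returned `lam` can also be used to mix the two per-sample losses instead of the labels, which is equivalent for cross-entropy.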
+
+ comment: Preprint V1 with 27 pages main text. Online project at + https://github.com/Westlake-AI/Awesome-Mixup +
+
+
+
+
+ + ☆ Lung-DETR: Deformable Detection Transformer for Sparse Lung Nodule + Anomaly Detection + + +
+ Accurate lung nodule detection for computed tomography (CT) scan imagery is +challenging in real-world settings due to the sparse occurrence of nodules and +similarity to other anatomical structures. In a typical positive case, nodules +may appear in as few as 3% of CT slices, complicating detection. To address +this, we reframe the problem as an anomaly detection task, targeting rare +nodule occurrences in a predominantly normal dataset. We introduce a novel +solution leveraging custom data preprocessing and Deformable Detection +Transformer (Deformable-DETR). A 7.5mm Maximum Intensity Projection (MIP) is +utilized to combine adjacent lung slices into single images, reducing the slice +count and decreasing nodule sparsity. This enhances spatial context, allowing +for better differentiation between nodules and other structures such as complex +vascular structures and bronchioles. Deformable-DETR is employed to detect +nodules, with a custom focal loss function to better handle the imbalanced +dataset. Our model achieves state-of-the-art performance on the LUNA16 dataset +with an F1 score of 94.2% (95.2% recall, 93.3% precision) on a dataset sparsely +populated with lung nodules that is reflective of real-world clinical data. + +
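The 7.5mm MIP preprocessing amounts to a max-projection over a window of adjacent axial slices. A small NumPy sketch is given below, assuming a `(num_slices, H, W)` volume and a known slice spacing; it illustrates the idea rather than reproducing the authors' pipeline.

```python
# Maximum Intensity Projection over windows of adjacent CT slices.
# volume: array of shape (num_slices, H, W); slice_spacing_mm: distance between slices.
import numpy as np

def mip(volume, slice_spacing_mm, window_mm=7.5):
    k = max(1, int(round(window_mm / slice_spacing_mm)))  # slices per MIP window
    projections = [volume[i:i + k].max(axis=0)
                   for i in range(0, len(volume) - k + 1, k)]
    return np.stack(projections)  # fewer, denser images with reduced nodule sparsity
```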
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ OneGen: Efficient One-Pass Unified Generation and Retrieval for LLMs + + +
+ Despite the recent advancements in Large Language Models (LLMs), which have +significantly enhanced the generative capabilities for various NLP tasks, LLMs +still face limitations in directly handling retrieval tasks. However, many +practical applications demand the seamless integration of both retrieval and +generation. This paper introduces a novel and efficient One-pass Generation and +retrieval framework (OneGen), designed to improve LLMs' performance on tasks +that require both generation and retrieval. The proposed framework bridges the +traditionally separate training approaches for generation and retrieval by +incorporating retrieval tokens generated autoregressively. This enables a +single LLM to handle both tasks simultaneously in a unified forward pass. We +conduct experiments on two distinct types of composite tasks, RAG and Entity +Linking, to validate the pluggability, effectiveness, and efficiency of OneGen +in training and inference. Furthermore, our results show that integrating +generation and retrieval within the same context preserves the generative +capabilities of LLMs while improving retrieval performance. To the best of our +knowledge, OneGen is the first to enable LLMs to conduct vector retrieval +during the generation. + +
+
+ comment: Work in progress; code is available at + https://github.com/zjunlp/OneGen +
+
+
+
+
+ + ☆ A Survey on Diffusion Models for Recommender Systems + + +
+ While traditional recommendation techniques have made significant strides in +the past decades, they still suffer from limited generalization performance +caused by factors like inadequate collaborative signals, weak latent +representations, and noisy data. In response, diffusion models (DMs) have +emerged as promising solutions for recommender systems due to their robust +generative capabilities, solid theoretical foundations, and improved training +stability. To this end, in this paper, we present the first comprehensive +survey on diffusion models for recommendation, and draw a bird's-eye view from +the perspective of the whole pipeline in real-world recommender systems. We +systematically categorize existing research works into three primary domains: +(1) diffusion for data engineering & encoding, focusing on data augmentation +and representation enhancement; (2) diffusion as recommender models, employing +diffusion models to directly estimate user preferences and rank items; and (3) +diffusion for content presentation, utilizing diffusion models to generate +personalized content such as fashion and advertisement creatives. Our taxonomy +highlights the unique strengths of diffusion models in capturing complex data +distributions and generating high-quality, diverse samples that closely align +with user preferences. We also summarize the core characteristics of the +adapting diffusion models for recommendation, and further identify key areas +for future exploration, which helps establish a roadmap for researchers and +practitioners seeking to advance recommender systems through the innovative +application of diffusion models. To further facilitate the research community +of recommender systems based on diffusion models, we actively maintain a GitHub +repository for papers and other related resources in this rising direction +https://github.com/CHIANGEL/Awesome-Diffusion-for-RecSys. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Sequential Recommendation via Adaptive Robust Attention with + Multi-dimensional Embeddings + + +
+ Sequential recommendation models have achieved state-of-the-art performance +using self-attention mechanism. It has since been found that moving beyond only +using item ID and positional embeddings leads to a significant accuracy boost +when predicting the next item. In recent literature, it was reported that a +multi-dimensional kernel embedding with temporal contextual kernels to capture +users' diverse behavioral patterns results in a substantial performance +improvement. In this study, we further improve the sequential recommender +model's robustness and generalization by introducing a mix-attention mechanism +with a layer-wise noise injection (LNI) regularization. We refer to our +proposed model as adaptive robust sequential recommendation framework (ADRRec), +and demonstrate through extensive experiments that our model outperforms +existing self-attention architectures. + +
+
+
+
+
+ + ♻ ☆ Keyword-driven Retrieval-Augmented Large Language Models for Cold-start + User Recommendations + + +
+ Recent advancements in Large Language Models (LLMs) have shown significant +potential in enhancing recommender systems. However, addressing the cold-start +recommendation problem, where users lack historical data, remains a +considerable challenge. In this paper, we introduce KALM4Rec (Keyword-driven +Retrieval-Augmented Large Language Models for Cold-start User Recommendations), +a novel framework specifically designed to tackle this problem by requiring +only a few input keywords from users in a practical scenario of cold-start user +restaurant recommendations. KALM4Rec operates in two main stages: candidates +retrieval and LLM-based candidates re-ranking. In the first stage, +keyword-driven retrieval models are used to identify potential candidates, +addressing LLMs' limitations in processing extensive tokens and reducing the +risk of generating misleading information. In the second stage, we employ LLMs +with various prompting strategies, including zero-shot and few-shot techniques, +to re-rank these candidates by integrating multiple examples directly into the +LLM prompts. Our evaluation, using a Yelp restaurant dataset with user reviews +from three English-speaking cities, shows that our proposed framework +significantly improves recommendation quality. Specifically, the integration of +in-context instructions with LLMs for re-ranking markedly enhances the +performance of the cold-start user recommender system. + +
+
+ comment: 10 pages, 10 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ DREAM: A Dual Representation Learning Model for Multimodal + Recommendation + + +
+ Multimodal recommendation focuses primarily on effectively exploiting both +behavioral and multimodal information for the recommendation task. However, +most existing models suffer from the following issues when fusing information +from two different domains: (1) Previous works do not pay attention to the +sufficient utilization of modal information by only using direct concatenation, +addition, or simple linear layers for modal information extraction. (2) +Previous works treat modal features as learnable embeddings, which causes the +modal embeddings to gradually deviate from the original modal features during +learning. We refer to this issue as Modal Information Forgetting. (3) Previous +approaches fail to account for the significant differences in the distribution +between behavior and modality, leading to the issue of representation +misalignment. To address these challenges, this paper proposes a novel Dual +REpresentAtion learning model for Multimodal Recommendation called DREAM. For +sufficient information extraction, we introduce separate dual lines, including +Behavior Line and Modal Line, in which the Modal-specific Encoder is applied to +empower modal representations. To address the issue of Modal Information +Forgetting, we introduce the Similarity Supervised Signal to constrain the +modal representations. Additionally, we design a Behavior-Modal Alignment +module to fuse the dual representations through Intra-Alignment and +Inter-Alignment. Extensive experiments on three public datasets demonstrate +that the proposed DREAM method achieves state-of-the-art (SOTA) results. The +source code will be available upon acceptance. + +
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+
+
+
+ + Machine Learning 21 + +
+
+
+ + ☆ Empowering Bayesian Neural Networks with Functional Priors through + Anchored Ensembling for Mechanics Surrogate Modeling Applications + + +
+ In recent years, neural networks (NNs) have become increasingly popular for +surrogate modeling tasks in mechanics and materials modeling applications. +While traditional NNs are deterministic functions that rely solely on data to +learn the input-output mapping, casting NN training within a Bayesian +framework allows one to quantify uncertainties, in particular epistemic +uncertainties that arise from lack of training data, and to integrate a priori +knowledge via the Bayesian prior. However, the high dimensionality and +non-physicality of the NN parameter space, and the complex relationship between +parameters (NN weights) and predicted outputs, render both prior design and +posterior inference challenging. In this work we present a novel BNN training +scheme based on anchored ensembling that can integrate a priori information +available in the function space, from e.g. low-fidelity models. The anchoring +scheme makes use of low-rank correlations between NN parameters, learnt from +pre-training to realizations of the functional prior. We also perform a study +to demonstrate how correlations between NN weights, which are often neglected +in existing BNN implementations, are critical to appropriately transfer +knowledge between the function-space and parameter-space priors. The performance of +our novel BNN algorithm is first studied on a small 1D example to illustrate +the algorithm's behavior in both interpolation and extrapolation settings. +Then, a thorough assessment is performed on a multi-input-output materials +surrogate modeling example, where we demonstrate the algorithm's capabilities +both in terms of accuracy and quality of the uncertainty estimation, for both +in-distribution and out-of-distribution data. + +
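For context, the baseline anchored-ensembling objective regularizes each ensemble member toward its own anchor drawn from a parameter-space prior. The PyTorch-style sketch below shows only that baseline form; the functional-prior and low-rank-correlation extensions proposed in the paper are not reflected here, and the exact regularizer scaling depends on the assumed noise and prior variances.

```python
# Baseline anchored-ensemble regularizer: ensemble member k keeps a fixed anchor
# theta_k drawn from the prior and is pulled back toward it during training.
# The exact scaling depends on the assumed noise and prior variances.
import torch

def anchored_loss(model, anchors, data_loss, n_data, prior_var=1.0):
    # anchors: dict mapping parameter name -> fixed tensor sampled from the prior
    reg = sum(((p - anchors[name]) ** 2).sum() for name, p in model.named_parameters())
    return data_loss + reg / (2.0 * prior_var * n_data)
```

Training each member against its own anchor, rather than toward zero as in weight decay, is what gives the ensemble an approximate posterior interpretation.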
+
+ comment: 24 pages, 14 figures +
+
+
+
+
+ + ☆ BBS: Bi-directional Bit-level Sparsity for Deep Learning Acceleration MICRO 2024 + + +
+ Bit-level sparsity methods skip ineffectual zero-bit operations and are +typically applicable within bit-serial deep learning accelerators. This type of +sparsity at the bit-level is especially interesting because it is both +orthogonal and compatible with other deep neural network (DNN) efficiency +methods such as quantization and pruning. In this work, we improve the +practicality and efficiency of bit-level sparsity through a novel algorithmic +bit-pruning, averaging, and compression method, and a co-designed efficient +bit-serial hardware accelerator. On the algorithmic side, we introduce +bidirectional bit sparsity (BBS). The key insight of BBS is that we can +leverage bit sparsity in a symmetrical way to prune either zero-bits or +one-bits. This significantly improves the load balance of bit-serial computing +and guarantees the level of sparsity to be more than 50%. On top of BBS, we +further propose two bit-level binary pruning methods that require no +retraining, and can be seamlessly applied to quantized DNNs. Combining binary +pruning with a new tensor encoding scheme, BBS can both skip computation and +reduce the memory footprint associated with bi-directional sparse bit columns. +On the hardware side, we demonstrate the potential of BBS through BitVert, a +bit-serial architecture with an efficient PE design to accelerate DNNs with low +overhead, exploiting our proposed binary pruning. Evaluation on seven +representative DNN models shows that our approach achieves: (1) on average +1.66$\times$ reduction in model size with negligible accuracy loss of < 0.5%; +(2) up to 3.03$\times$ speedup and 2.44$\times$ energy saving compared to prior +DNN accelerators. + +
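One way to read the "prune either zero-bits or one-bits" insight is that a bit-serial unit can process whichever of a value's direct or complemented bit pattern has fewer set bits, so at most half the bit positions ever need work. The toy sketch below illustrates that reading for unsigned b-bit values; it is a simplification, not the accelerator's actual encoding.

```python
# Toy reading of bi-directional bit sparsity for unsigned b-bit weights: process
# whichever of the direct or complemented bit pattern has fewer set bits, so the
# serialized operand always has at most b/2 one-bits to work through.
def bbs_encode(w, bits=8):
    ones = bin(w).count("1")
    if ones <= bits - ones:
        return w, False                    # process the one-bits directly
    return (1 << bits) - 1 - w, True       # process the complement's one-bits

def bbs_mac(w, a, bits=8):
    enc, complemented = bbs_encode(w, bits)
    partial = sum(a << k for k in range(bits) if (enc >> k) & 1)  # bit-serial adds
    return ((1 << bits) - 1) * a - partial if complemented else partial
```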
+
+ comment: Accepted by IEEE/ACM MICRO 2024 +
+
+
+
+
+ + ☆ Synthetic Tabular Data Generation for Class Imbalance and Fairness: A + Comparative Study ECML + + +
+ Due to their data-driven nature, Machine Learning (ML) models are susceptible +to bias inherited from data, especially in classification problems where class +and group imbalances are prevalent. Class imbalance (in the classification +target) and group imbalance (in protected attributes like sex or race) can +undermine both ML utility and fairness. Although class and group imbalances +commonly coincide in real-world tabular datasets, limited methods address this +scenario. While most methods use oversampling techniques, like interpolation, +to mitigate imbalances, recent advancements in synthetic tabular data +generation offer promise but have not been adequately explored for this +purpose. To this end, this paper conducts a comparative analysis to address +class and group imbalances using state-of-the-art models for synthetic tabular +data generation and various sampling strategies. Experimental results on four +datasets, demonstrate the effectiveness of generative models for bias +mitigation, creating opportunities for further exploration in this direction. + +
+
+ comment: Accepted at the ECML PKDD 2024, 4th Workshop on Bias and Fairness in + AI +
+
+
+
+
+ + ☆ ICML Topological Deep Learning Challenge 2024: Beyond the Graph Domain ICML 2024 + + +
+ This paper describes the 2nd edition of the ICML Topological Deep Learning +Challenge that was hosted within the ICML 2024 ELLIS Workshop on +Geometry-grounded Representation Learning and Generative Modeling (GRaM). The +challenge focused on the problem of representing data in different discrete +topological domains in order to bridge the gap between Topological Deep +Learning (TDL) and other types of structured datasets (e.g. point clouds, +graphs). Specifically, participants were asked to design and implement +topological liftings, i.e. mappings between different data structures and +topological domains --like hypergraphs, or simplicial/cell/combinatorial +complexes. The challenge received 52 submissions satisfying all the +requirements. This paper introduces the main scope of the challenge, and +summarizes the main results and findings. + +
+
+ comment: Proceedings of the Geometry-grounded Representation Learning and + Generative Modeling Workshop (GRaM) at ICML 2024 +
+
+
+
+
+ + ☆ Influence-based Attributions can be Manipulated + + +
+ Influence Functions are a standard tool for attributing predictions to +training data in a principled manner and are widely used in applications such +as data valuation and fairness. In this work, we present realistic incentives +to manipulate influence-based attributions and investigate whether these +attributions can be systematically tampered with by an adversary. We show that this +is indeed possible and provide efficient attacks with backward-friendly +implementations. Our work raises questions on the reliability of +influence-based attributions under adversarial circumstances. + +
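As background (standard in the influence-functions literature rather than stated in this abstract), the attribution typically being manipulated is the influence of a training point $z$ on the loss at a test point $z_{\mathrm{test}}$:

```latex
% Standard influence-function attribution (background definition, not from this
% abstract): influence of training point z on the loss at z_test, at the
% empirical risk minimizer \hat{\theta} with Hessian H.
\mathcal{I}(z, z_{\mathrm{test}})
  = -\,\nabla_{\theta} L(z_{\mathrm{test}}, \hat{\theta})^{\top}
      H_{\hat{\theta}}^{-1}\,
      \nabla_{\theta} L(z, \hat{\theta}),
\qquad
H_{\hat{\theta}} = \frac{1}{n}\sum_{i=1}^{n} \nabla_{\theta}^{2} L(z_i, \hat{\theta}).
```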
+
+
+
+
+ + ☆ Low Latency Transformer Inference on FPGAs for Physics Applications with + hls4ml + + +
+ This study presents an efficient implementation of transformer architectures +in Field-Programmable Gate Arrays (FPGAs) using hls4ml. We demonstrate the +strategy for implementing the multi-head attention, softmax, and normalization +layers and evaluate three distinct models. Their deployment on a VU13P FPGA chip +achieved a latency of less than 2 us, demonstrating the potential for real-time +applications. hls4ml's compatibility with any TensorFlow-built transformer model +further enhances the scalability and applicability of this work. Index Terms: +FPGAs, machine learning, transformers, high energy physics, LIGO + +
+
+
+
+
+ + ☆ SEF: A Method for Computing Prediction Intervals by Shifting the Error + Function in Neural Networks + + +
+ In today's era, Neural Networks (NN) are applied in various scientific fields +such as robotics, medicine, engineering, etc. However, the predictions of +neural networks themselves contain a degree of uncertainty that must always be +taken into account before any decision is made. This is why many researchers +have focused on developing different ways to quantify the uncertainty of neural +network predictions. Some of these methods are based on generating prediction +intervals (PI) via neural networks for the requested target values. The SEF +(Shifting the Error Function) method presented in this paper is a new method +that belongs to this category of methods. The proposed approach involves +training a single neural network three times, thus generating an estimate along +with the corresponding upper and lower bounds for a given problem. A pivotal +aspect of the method is the calculation of a parameter from the initial +network's estimates, which is then integrated into the loss functions of the +other two networks. This innovative process effectively produces PIs, resulting +in a robust and efficient technique for uncertainty quantification. To evaluate +the effectiveness of our method, a comparison in terms of successful PI +generation between the SEF, PI3NN and PIVEN methods was made using two +synthetic datasets. + +
+
+ comment: The paper has been accepted at the 2024 International Conference on + Computer and Applications (ICCA24), Cairo, Egypt, December 17-19, 2024. + https://icca-conf.info/icca-2024 +
+
+
+
+
+ + ☆ A Survey on Mixup Augmentations and Beyond + + +
+ As Deep Neural Networks have achieved thrilling breakthroughs in the past +decade, data augmentations have garnered increasing attention as regularization +techniques when massive labeled data are unavailable. Among existing +augmentations, Mixup and relevant data-mixing methods that convexly combine +selected samples and the corresponding labels are widely adopted because they +yield high performances by generating data-dependent virtual data while easily +migrating to various domains. This survey presents a comprehensive review of +foundational mixup methods and their applications. We first elaborate on the +training pipeline with mixup augmentations as a unified framework containing +modules. A reformulated framework could contain various mixup methods and give +intuitive operational procedures. Then, we systematically investigate the +applications of mixup augmentations on vision downstream tasks, various data +modalities, and some analysis \& theorems of mixup. Meanwhile, we conclude the +current status and limitations of mixup research and point out further work for +effective and efficient mixup augmentations. This survey can provide +researchers with the current state of the art in mixup methods and provide some +insights and guidance roles in the mixup arena. An online project with this +survey is available at \url{https://github.com/Westlake-AI/Awesome-Mixup}. + +
+
+ comment: Preprint V1 with 27 pages main text. Online project at + https://github.com/Westlake-AI/Awesome-Mixup +
+
+
+
+
+ + ☆ Lung-DETR: Deformable Detection Transformer for Sparse Lung Nodule + Anomaly Detection + + +
+ Accurate lung nodule detection for computed tomography (CT) scan imagery is +challenging in real-world settings due to the sparse occurrence of nodules and +similarity to other anatomical structures. In a typical positive case, nodules +may appear in as few as 3% of CT slices, complicating detection. To address +this, we reframe the problem as an anomaly detection task, targeting rare +nodule occurrences in a predominantly normal dataset. We introduce a novel +solution leveraging custom data preprocessing and Deformable Detection +Transformer (Deformable-DETR). A 7.5mm Maximum Intensity Projection (MIP) is +utilized to combine adjacent lung slices into single images, reducing the slice +count and decreasing nodule sparsity. This enhances spatial context, allowing +for better differentiation between nodules and other structures such as complex +vascular structures and bronchioles. Deformable-DETR is employed to detect +nodules, with a custom focal loss function to better handle the imbalanced +dataset. Our model achieves state-of-the-art performance on the LUNA16 dataset +with an F1 score of 94.2% (95.2% recall, 93.3% precision) on a dataset sparsely +populated with lung nodules that is reflective of real-world clinical data. + +
+
+
+
+
+ + ☆ Bellwether Trades: Characteristics of Trades influential in Predicting + Future Price Movements in Markets + + +
+ In this study, we leverage powerful non-linear machine learning methods to +identify the characteristics of trades that contain valuable information. +First, we demonstrate the effectiveness of our optimized neural network +predictor in accurately predicting future market movements. Then, we utilize +the information from this successful neural network predictor to pinpoint the +individual trades within each data point (trading window) that had the most +impact on the optimized neural network's prediction of future price movements. +This approach helps us uncover important insights about the heterogeneity in +information content provided by trades of different sizes, venues, trading +contexts, and over time. + +
+
+ comment: 49 Pages +
+
+
+
+
+ + ☆ Generalization of Geometric Graph Neural Networks + + +
+ In this paper, we study the generalization capabilities of geometric graph neural networks (GNNs). We consider GNNs over a geometric graph constructed from a finite set of points randomly sampled from an embedded manifold, so that topological information is captured. We prove a generalization gap between the optimal empirical risk and the optimal statistical risk of this GNN, which decreases with the number of points sampled from the manifold and increases with the dimension of the underlying manifold. This generalization gap ensures that a GNN trained on a graph over a set of sampled points can be used to process other unseen graphs constructed from the same underlying manifold. The most important observation is that this generalization capability can be realized with a single large graph, rather than being limited by the size of the graph as in previous results. The generalization gap is derived from a non-asymptotic convergence result relating a GNN on the sampled graph to the underlying manifold neural network (MNN). We verify this theoretical result with experiments on both the Arxiv and Cora datasets. + +
+
+ comment: 12 pages, 4 figures. arXiv admin note: text overlap with + arXiv:2406.05225 +
+
+
+
+
+ + ☆ Learning to Classify Quantum Phases of Matter with a Few Measurements + + +
+ We study the identification of quantum phases of matter, at zero temperature, +when only part of the phase diagram is known in advance. Following a supervised +learning approach, we show how to use our previous knowledge to construct an +observable capable of classifying the phase even in the unknown region. By +using a combination of classical and quantum techniques, such as tensor +networks, kernel methods, generalization bounds, quantum algorithms, and shadow +estimators, we show that, in some cases, the certification of new ground states +can be obtained with a polynomial number of measurements. An important +application of our findings is the classification of the phases of matter +obtained in quantum simulators, e.g., cold atom experiments, capable of +efficiently preparing ground states of complex many-particle systems and +applying simple measurements, e.g., single qubit measurements, but unable to +perform a universal set of gates. + +
+
+
+
+
+ + ☆ Sliding-Window Thompson Sampling for Non-Stationary Settings + + +
+ $\textit{Restless Bandits}$ describe sequential decision-making problems in which the rewards evolve over time independently of the actions taken by the policy-maker. It has been shown that classical bandit algorithms fail when the underlying environment is changing, making it clear that specifically crafted algorithms are needed to tackle these more challenging scenarios. In this paper, extending and correcting the work by \cite{trovo2020sliding}, we analyze two Thompson-Sampling-inspired algorithms, namely $\texttt{BETA-SWTS}$ and $\texttt{$\gamma$-SWGTS}$, introduced to handle the additional complexity arising from the non-stationary nature of the setting. In particular, we derive a general formulation of the regret in $\textit{any}$ arbitrary restless environment for both Bernoulli and Subgaussian rewards, and, through the introduction of new quantities, we investigate which contributions lie at the deeper foundations of the error made by the algorithms. Finally, from the general formulation we infer the regret for two of the most common non-stationary settings: the $\textit{Abruptly Changing}$ and the $\textit{Smoothly Changing}$ environments. + +
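As a rough illustration of the sliding-window idea for Bernoulli rewards, the sketch below keeps only the last `window` observations of each arm when forming the Beta posterior. It is a generic sliding-window Thompson Sampling baseline, not necessarily the exact BETA-SWTS algorithm analyzed in the paper:

```python
from collections import deque
import numpy as np

class SlidingWindowTS:
    """Thompson Sampling for Bernoulli bandits using only recent observations."""

    def __init__(self, n_arms, window=200, rng=None):
        self.rng = rng or np.random.default_rng()
        # One bounded history per arm; old rewards fall out automatically.
        self.history = [deque(maxlen=window) for _ in range(n_arms)]

    def select_arm(self):
        samples = []
        for rewards in self.history:
            successes = sum(rewards)
            failures = len(rewards) - successes
            # Beta(1 + successes, 1 + failures) posterior over the recent window.
            samples.append(self.rng.beta(1 + successes, 1 + failures))
        return int(np.argmax(samples))

    def update(self, arm, reward):
        self.history[arm].append(reward)

# Example: arm 1 becomes the better arm halfway through the horizon.
rng = np.random.default_rng(0)
agent = SlidingWindowTS(n_arms=2, window=200, rng=rng)
for t in range(2000):
    probs = [0.6, 0.3] if t < 1000 else [0.3, 0.7]
    arm = agent.select_arm()
    agent.update(arm, int(rng.random() < probs[arm]))
```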
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ SpinMultiNet: Neural Network Potential Incorporating Spin Degrees of + Freedom with Multi-Task Learning + + +
+ Neural Network Potentials (NNPs) have attracted significant attention as a +method for accelerating density functional theory (DFT) calculations. However, +conventional NNP models typically do not incorporate spin degrees of freedom, +limiting their applicability to systems where spin states critically influence +material properties, such as transition metal oxides. This study introduces +SpinMultiNet, a novel NNP model that integrates spin degrees of freedom through +multi-task learning. SpinMultiNet achieves accurate predictions without relying +on correct spin values obtained from DFT calculations. Instead, it utilizes +initial spin estimates as input and leverages multi-task learning to optimize +the spin latent representation while maintaining both $E(3)$ and time-reversal +equivariance. Validation on a dataset of transition metal oxides demonstrates +the high predictive accuracy of SpinMultiNet. The model successfully reproduces +the energy ordering of stable spin configurations originating from +superexchange interactions and accurately captures the rhombohedral distortion +of the rocksalt structure. These results pave the way for new possibilities in +materials simulations that consider spin degrees of freedom, promising future +applications in large-scale simulations of various material systems, including +magnetic materials. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Balancing Immediate Revenue and Future Off-Policy Evaluation in Coupon + Allocation + + +
+ Coupon allocation drives customer purchases and boosts revenue. However, it +presents a fundamental trade-off between exploiting the current optimal policy +to maximize immediate revenue and exploring alternative policies to collect +data for future policy improvement via off-policy evaluation (OPE). To balance +this trade-off, we propose a novel approach that combines a model-based revenue +maximization policy and a randomized exploration policy for data collection. +Our framework enables flexible adjustment of the mixture ratio between these +two policies to optimize the balance between short-term revenue and future +policy improvement. We formulate the problem of determining the optimal mixture +ratio as multi-objective optimization, enabling quantitative evaluation of this +trade-off. We empirically verified the effectiveness of the proposed mixed +policy using synthetic data. Our main contributions are: (1) Demonstrating a +mixed policy combining deterministic and probabilistic policies, flexibly +adjusting the data collection vs. revenue trade-off. (2) Formulating the +optimal mixture ratio problem as multi-objective optimization, enabling +quantitative evaluation of this trade-off. + +
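One simple way to read the mixture described above is as a per-customer coin flip between the deterministic revenue-maximizing policy and a randomized exploration policy, with the logged propensity kept for later off-policy evaluation. The sketch below is an illustrative rendering with made-up function names, not the authors' implementation:

```python
import numpy as np

def mixed_coupon_policy(customer_features, exploit_policy, n_coupons,
                        mixture_ratio=0.1, rng=np.random.default_rng()):
    """Allocate a coupon by mixing exploitation with uniform exploration.

    exploit_policy: callable features -> coupon index (model-based revenue maximizer)
    mixture_ratio:  probability of using the randomized exploration policy,
                    which controls how much data is useful for future OPE.
    Returns (coupon, propensity); logging propensities enables off-policy evaluation.
    """
    exploit_choice = exploit_policy(customer_features)
    if rng.random() < mixture_ratio:
        coupon = int(rng.integers(n_coupons))   # exploration branch
    else:
        coupon = exploit_choice                 # exploitation branch
    # Overall probability of the realized coupon under the mixed policy.
    propensity = mixture_ratio / n_coupons + (1 - mixture_ratio) * (coupon == exploit_choice)
    return coupon, propensity

# Example with a trivial "model-based" policy that always picks coupon 0.
coupon, p = mixed_coupon_policy(np.zeros(5), lambda f: 0, n_coupons=4)
print(coupon, p)
```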
+
+
+
+
+ + ♻ ☆ Evaluating the design space of diffusion-based generative models + + +
+ Most existing theoretical investigations of the accuracy of diffusion models, +albeit significant, assume the score function has been approximated to a +certain accuracy, and then use this a priori bound to control the error of +generation. This article instead provides a first quantitative understanding of +the whole generation process, i.e., both training and sampling. More precisely, +it conducts a non-asymptotic convergence analysis of denoising score matching +under gradient descent. In addition, a refined sampling error analysis for +variance exploding models is also provided. The combination of these two +results yields a full error analysis, which elucidates (again, but this time +theoretically) how to design the training and sampling processes for effective +generation. For instance, our theory implies a preference toward noise +distribution and loss weighting in training that qualitatively agree with the +ones used in [Karras et al. 2022]. It also provides perspectives on the choices +of time and variance schedules in sampling: when the score is well trained, the +design in [Song et al. 2020] is more preferable, but when it is less trained, +the design in [Karras et al. 2022] becomes more preferable. + +
+
+ comment: Comments are welcome. Out of admiration we titled our paper after + EDM, and hoped theorists' humor is not too corny +
+
+
+
+
+ + ♻ ☆ Actively Learning Reinforcement Learning: A Stochastic Optimal Control + Approach + + +
+ In this paper we propose a framework for achieving two intertwined objectives: (i) equipping reinforcement learning with active exploration and deliberate information gathering, such that it regulates state and parameter uncertainties resulting from modeling mismatches and noisy sensing; and (ii) overcoming the computational intractability of stochastic optimal control. We approach both objectives by using reinforcement learning to compute the stochastic optimal control law. On one hand, we avoid the curse of dimensionality that prohibits the direct solution of the stochastic dynamic programming equation. On the other hand, the resulting stochastic-optimal-control reinforcement learning agent admits caution and probing, that is, optimal online exploration and exploitation. Unlike a fixed exploration-exploitation balance, caution and probing are employed automatically by the controller in real time, even after the learning process has terminated. We conclude the paper with a numerical simulation illustrating how a Linear Quadratic Regulator with the certainty equivalence assumption may lead to poor performance and filter divergence, while our proposed approach is stabilizing, achieves acceptable performance, and is computationally convenient. + +
+
+
+
+
+ + ♻ ☆ Knowledge-Aware Conversation Derailment Forecasting Using Graph + Convolutional Networks + + +
+ Online conversations are particularly susceptible to derailment, which can manifest itself in the form of toxic communication patterns, including disrespectful comments and abuse. Forecasting conversation derailment predicts signs of derailment in advance, enabling proactive moderation of conversations. State-of-the-art approaches to conversation derailment forecasting sequentially encode conversations and use graph neural networks to model dialogue user dynamics. However, existing graph models are not able to capture complex conversational characteristics such as context propagation and emotional shifts. The use of common sense knowledge enables a model to capture such characteristics, thus improving performance. Following this approach, we derive commonsense statements from a knowledge base of dialogue contextual information to enrich a graph neural network classification architecture. We fuse the multi-source information on each utterance into capsules, which are used by a transformer-based forecaster to predict conversation derailment. Our model captures conversation dynamics and context propagation, outperforming the state-of-the-art models on the CGA and CMV benchmark datasets. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2306.12982; + text overlap with arXiv:2106.01071 by other authors +
+
+
+
+
+ + ♻ ☆ Using LLMs to Establish Implicit User Sentiment of Software Desirability + + +
+ This study explores the use of LLMs for providing quantitative zero-shot +sentiment analysis of implicit software desirability, addressing a critical +challenge in product evaluation where traditional review scores, though +convenient, fail to capture the richness of qualitative user feedback. +Innovations include establishing a method that 1) works with qualitative user +experience data without the need for explicit review scores, 2) focuses on +implicit user satisfaction, and 3) provides scaled numerical sentiment +analysis, offering a more nuanced understanding of user sentiment, instead of +simply classifying sentiment as positive, neutral, or negative. + Data is collected using the Microsoft Product Desirability Toolkit (PDT), a +well-known qualitative user experience analysis tool. For initial exploration, +the PDT metric was given to users of two software systems. PDT data was fed +through several LLMs (Claude Sonnet 3 and 3.5, GPT4, and GPT4o) and through a +leading transfer learning technique, Twitter-Roberta-Base-Sentiment, and Vader, +a leading sentiment analysis tool. Each system was asked to evaluate the data +in two ways, by looking at the sentiment expressed in the PDT word/explanation +pairs; and by looking at the sentiment expressed by the users in their grouped +selection of five words and explanations, as a whole. Each LLM provided a +sentiment score, its confidence (low, medium, high) in the score, and an +explanation of the score. + All LLMs tested were able to statistically detect user sentiment from the +users' grouped data, whereas TRBS and Vader were not. The confidence and +explanation of confidence provided by the LLMs assisted in understanding user +sentiment. This study adds deeper understanding of evaluating user experiences, +toward the goal of creating a universal tool that quantifies implicit +sentiment. + +
+
+ comment: 6 pages, 2 figures, 2 tables, updated to incorporate feedback +
+
+
+
+
+ + ♻ ☆ Review of Interpretable Machine Learning Models for Disease Prognosis + + +
+ In response to the COVID-19 pandemic, the integration of interpretable +machine learning techniques has garnered significant attention, offering +transparent and understandable insights crucial for informed clinical decision +making. This literature review delves into the applications of interpretable +machine learning in predicting the prognosis of respiratory diseases, +particularly focusing on COVID-19 and its implications for future research and +clinical practice. We reviewed various machine learning models that are not +only capable of incorporating existing clinical domain knowledge but also have +the learning capability to explore new information from the data. These models +and experiences not only aid in managing the current crisis but also hold +promise for addressing future disease outbreaks. By harnessing interpretable +machine learning, healthcare systems can enhance their preparedness and +response capabilities, thereby improving patient outcomes and mitigating the +impact of respiratory diseases in the years to come. + +
+
+
+
+
+ + ♻ ☆ TimelyGPT: Extrapolatable Transformer Pre-training for Long-term + Time-Series Forecasting in Healthcare + + +
+ Large-scale pre-trained models (PTMs) such as BERT and GPT have recently achieved great success in Natural Language Processing and Computer Vision domains. However, the development of PTMs on healthcare time-series data is lagging behind. This underscores the limitations of the existing transformer-based architectures, particularly their scalability to handle large-scale time series and ability to capture long-term temporal dependencies. In this study, we present Timely Generative Pre-trained Transformer (TimelyGPT). TimelyGPT employs an extrapolatable position (xPos) embedding to encode trend and periodic patterns into time-series representations. It also integrates recurrent attention and temporal convolution modules to effectively capture global-local temporal dependencies. We evaluated TimelyGPT on two large-scale healthcare time series datasets corresponding to continuous biosignals and irregularly-sampled time series, respectively. Our experiments show that during pre-training, TimelyGPT excels in learning time-series representations from continuously monitored biosignals and irregularly-sampled time series data commonly observed in longitudinal electronic health records (EHRs). In forecasting continuous biosignals, TimelyGPT achieves accurate extrapolation up to 6,000 timesteps of body temperature during the sleep stage transition, given a short look-up window (i.e., prompt) containing only 2,000 timesteps. For irregularly-sampled time series, TimelyGPT with a proposed time-specific inference demonstrates high top recall scores in predicting future diagnoses using early diagnostic records, effectively handling irregular intervals between clinical records. Together, we envision TimelyGPT to be useful in a broad spectrum of health domains, including long-term patient health state forecasting and patient risk trajectory prediction. + +
+
+ comment: 17 pages +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Visual Grounding with Multi-modal Conditional Adaptation ACM MM 2024 + + +
+ Visual grounding is the task of locating objects specified by natural language expressions. Existing methods extend generic object detection frameworks to tackle this task. They typically extract visual and textual features separately using independent visual and textual encoders, then fuse these features in a multi-modal decoder for final prediction. However, visual grounding presents unique challenges. It often involves locating objects with different text descriptions within the same image. Existing methods struggle with this task because the independent visual encoder produces identical visual features for the same image, limiting detection performance. Some recent approaches propose various language-guided visual encoders to address this issue, but they mostly rely solely on textual information and require sophisticated designs. In this paper, we introduce Multi-modal Conditional Adaptation (MMCA), which enables the visual encoder to adaptively update its weights, directing its focus towards text-relevant regions. Specifically, we first integrate information from different modalities to obtain multi-modal embeddings. Then we utilize a set of weighting coefficients, generated from the multi-modal embeddings, to reorganize the weight update matrices and apply them to the visual encoder of the visual grounding model. Extensive experiments on four widely used datasets demonstrate that MMCA achieves significant improvements and state-of-the-art results. Ablation experiments further demonstrate that our method is lightweight and efficient. Our source code is available at: https://github.com/Mr-Bigworth/MMCA. + +
+
+ comment: Accepted by ACM MM 2024 [Oral] +
+
+
+
+
+ + ♻ ☆ Estimating Indoor Scene Depth Maps from Ultrasonic Echoes ICIP 2024 + + +
+ Measuring 3D geometric structures of indoor scenes requires dedicated depth +sensors, which are not always available. Echo-based depth estimation has +recently been studied as a promising alternative solution. All previous studies +have assumed the use of echoes in the audible range. However, one major problem +is that audible echoes cannot be used in quiet spaces or other situations where +producing audible sounds is prohibited. In this paper, we consider echo-based +depth estimation using inaudible ultrasonic echoes. While ultrasonic waves +provide high measurement accuracy in theory, the actual depth estimation +accuracy when ultrasonic echoes are used has remained unclear, due to its +disadvantage of being sensitive to noise and susceptible to attenuation. We +first investigate the depth estimation accuracy when the frequency of the sound +source is restricted to the high-frequency band, and found that the accuracy +decreased when the frequency was limited to ultrasonic ranges. Based on this +observation, we propose a novel deep learning method to improve the accuracy of +ultrasonic echo-based depth estimation by using audible echoes as auxiliary +data only during training. Experimental results with a public dataset +demonstrate that our method improves the estimation accuracy. + +
+
+ comment: ICIP 2024 +
+
+
+
+
+ + ♻ ☆ FakeBench: Probing Explainable Fake Image Detection via Large Multimodal + Models + + +
+ The ability to distinguish whether an image is generated by artificial +intelligence (AI) is a crucial ingredient in human intelligence, usually +accompanied by a complex and dialectical forensic and reasoning process. +However, current fake image detection models and databases focus on binary +classification without understandable explanations for the general populace. +This weakens the credibility of authenticity judgment and may conceal potential +model biases. Meanwhile, large multimodal models (LMMs) have exhibited immense +visual-text capabilities on various tasks, bringing the potential for +explainable fake image detection. Therefore, we pioneer the probe of LMMs for +explainable fake image detection by presenting a multimodal database +encompassing textual authenticity descriptions, the FakeBench. For +construction, we first introduce a fine-grained taxonomy of generative visual +forgery concerning human perception, based on which we collect forgery +descriptions in human natural language with a human-in-the-loop strategy. +FakeBench examines LMMs with four evaluation criteria: detection, reasoning, +interpretation and fine-grained forgery analysis, to obtain deeper insights +into image authenticity-relevant capabilities. Experiments on various LMMs +confirm their merits and demerits in different aspects of fake image detection +tasks. This research presents a paradigm shift towards transparency for the +fake image detection area and reveals the need for greater emphasis on forensic +elements in visual-language research and AI risk control. FakeBench will be +available at https://github.com/Yixuan423/FakeBench. + +
+
+
+
+
+ + ♻ ☆ DREAM: A Dual Representation Learning Model for Multimodal + Recommendation + + +
+ Multimodal recommendation focuses primarily on effectively exploiting both +behavioral and multimodal information for the recommendation task. However, +most existing models suffer from the following issues when fusing information +from two different domains: (1) Previous works do not pay attention to the +sufficient utilization of modal information by only using direct concatenation, +addition, or simple linear layers for modal information extraction. (2) +Previous works treat modal features as learnable embeddings, which causes the +modal embeddings to gradually deviate from the original modal features during +learning. We refer to this issue as Modal Information Forgetting. (3) Previous +approaches fail to account for the significant differences in the distribution +between behavior and modality, leading to the issue of representation +misalignment. To address these challenges, this paper proposes a novel Dual +REpresentAtion learning model for Multimodal Recommendation called DREAM. For +sufficient information extraction, we introduce separate dual lines, including +Behavior Line and Modal Line, in which the Modal-specific Encoder is applied to +empower modal representations. To address the issue of Modal Information +Forgetting, we introduce the Similarity Supervised Signal to constrain the +modal representations. Additionally, we design a Behavior-Modal Alignment +module to fuse the dual representations through Intra-Alignment and +Inter-Alignment. Extensive experiments on three public datasets demonstrate +that the proposed DREAM method achieves state-of-the-art (SOTA) results. The +source code will be available upon acceptance. + +
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 28 + +
+
+
+ + ☆ Maximizing Relation Extraction Potential: A Data-Centric Study to Unveil + Challenges and Opportunities + + +
+ Relation extraction is a Natural Language Processing task aiming to extract +relationships from textual data. It is a critical step for information +extraction. Due to its wide-scale applicability, research in relation +extraction has rapidly scaled to using highly advanced neural networks. Despite +their computational superiority, modern relation extractors fail to handle +complicated extraction scenarios. However, a comprehensive performance analysis +of the state-of-the-art relation extractors that compile these challenges has +been missing from the literature, and this paper aims to bridge this gap. The +goal has been to investigate the possible data-centric characteristics that +impede neural relation extraction. Based on extensive experiments conducted +using 15 state-of-the-art relation extraction algorithms ranging from recurrent +architectures to large language models and seven large-scale datasets, this +research suggests that modern relation extractors are not robust to complex +data and relation characteristics. It emphasizes pivotal issues, such as +contextual ambiguity, correlating relations, long-tail data, and fine-grained +relation distributions. In addition, it sets a marker for future directions to +alleviate these issues, thereby proving to be a critical resource for novice +and advanced researchers. Efficient handling of the challenges described can +have significant implications for the field of information extraction, which is +a critical part of popular systems such as search engines and chatbots. Data +and relevant code can be found at https://github.com/anushkasw/MaxRE. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Just ASR + LLM? A Study on Speech Large Language Models' Ability to + Identify and Understand Speaker in Spoken Dialogue + + +
+ In recent years, we have observed a rapid advancement in speech language +models (SpeechLLMs), catching up with humans' listening and reasoning +abilities. Remarkably, SpeechLLMs have demonstrated impressive spoken dialogue +question-answering (SQA) performance in benchmarks like Gaokao, the English +listening test of the college entrance exam in China, which seemingly requires +understanding both the spoken content and voice characteristics of speakers in +a conversation. However, after carefully examining Gaokao's questions, we find +the correct answers to many questions can be inferred from the conversation +context alone without identifying the speaker asked in the question. Our +evaluation of state-of-the-art models Qwen-Audio and WavLLM in both Gaokao and +our proposed "What Do You Like?" dataset shows a significantly higher accuracy +in these context-based questions than in identity-critical questions, which can +only be answered correctly with correct speaker identification. Our results and +analysis suggest that when solving SQA, the current SpeechLLMs exhibit limited +speaker awareness from the audio and behave similarly to an LLM reasoning from +the conversation transcription without sound. We propose that our definitions +and automated classification of context-based and identity-critical questions +could offer a more accurate evaluation framework of SpeechLLMs in SQA tasks. + +
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ☆ Achieving Peak Performance for Large Language Models: A Systematic + Review + + +
+ In recent years, large language models (LLMs) have achieved remarkable +success in natural language processing (NLP). LLMs require an extreme amount of +parameters to attain high performance. As models grow into the +trillion-parameter range, computational and memory costs increase +significantly. This makes it difficult for many researchers to access the +resources needed to train or apply these models. Optimizing LLM performance +involves two main approaches: fine-tuning pre-trained models for specific tasks +to achieve state-of-the-art performance, and reducing costs or improving +training time while maintaining similar performance. This paper presents a +systematic literature review (SLR) following the Preferred Reporting Items for +Systematic Reviews and Meta-Analyses (PRISMA) statement. We reviewed 65 +publications out of 983 from 2017 to December 2023, retrieved from 5 databases. +The study presents methods to optimize and accelerate LLMs while achieving +cutting-edge results without sacrificing accuracy. We begin with an overview of +the development of language modeling, followed by a detailed explanation of +commonly used frameworks and libraries, and a taxonomy for improving and +speeding up LLMs based on three classes: LLM training, LLM inference, and +system serving. We then delve into recent optimization and acceleration +strategies such as training optimization, hardware optimization, scalability +and reliability, accompanied by the taxonomy and categorization of these +strategies. Finally, we provide an in-depth comparison of each class and +strategy, with two case studies on optimizing model training and enhancing +inference efficiency. These case studies showcase practical approaches to +address LLM resource limitations while maintaining performance. + +
+
+ comment: 34 pages, 7 figures, 8 tables. Journal Article: IEEE Access +
+
+
+
+
+ + ☆ MILE: A Mutation Testing Framework of In-Context Learning Systems + + +
+ In-context Learning (ICL) has achieved notable success in the applications of +large language models (LLMs). By adding only a few input-output pairs that +demonstrate a new task, the LLM can efficiently learn the task during inference +without modifying the model parameters. Such mysterious ability of LLMs has +attracted great research interests in understanding, formatting, and improving +the in-context demonstrations, while still suffering from drawbacks like +black-box mechanisms and sensitivity against the selection of examples. In this +work, inspired by the foundations of adopting testing techniques in machine +learning (ML) systems, we propose a mutation testing framework designed to +characterize the quality and effectiveness of test data for ICL systems. First, +we propose several mutation operators specialized for ICL demonstrations, as +well as corresponding mutation scores for ICL test sets. With comprehensive +experiments, we showcase the effectiveness of our framework in evaluating the +reliability and quality of ICL test suites. Our code is available at +https://github.com/weizeming/MILE. + +
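The notion of mutating ICL demonstrations and scoring a test set by how many mutants it "kills" can be sketched generically. The two mutation operators and the kill criterion below are illustrative stand-ins, not the specific operators proposed in the paper:

```python
import random

def mutate_demos(demos, operator, rng=random.Random(0)):
    """Apply a simple mutation operator to a list of (input, output) demos."""
    mutated = list(demos)
    if operator == "shuffle_labels":      # pair inputs with wrong outputs
        outputs = [o for _, o in mutated]
        rng.shuffle(outputs)
        mutated = [(i, o) for (i, _), o in zip(mutated, outputs)]
    elif operator == "drop_demo":         # remove one demonstration
        mutated.pop(rng.randrange(len(mutated)))
    return mutated

def mutation_score(test_set, demos, operators, predict):
    """Fraction of mutants whose behavior differs from the original ICL system.

    predict: callable (demos, test_input) -> model output
    A mutant is "killed" if any test input yields a different prediction.
    """
    killed = 0
    for op in operators:
        mutant = mutate_demos(demos, op)
        if any(predict(mutant, x) != predict(demos, x) for x, _ in test_set):
            killed += 1
    return killed / len(operators)

# Toy "model": returns the output of the demo whose input matches exactly.
predict = lambda demos, x: dict(demos).get(x, "unknown")
demos = [("2+2", "4"), ("3+3", "6")]
tests = [("2+2", "4"), ("5+5", "10")]
print(mutation_score(tests, demos, ["shuffle_labels", "drop_demo"], predict))
```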
+
+
+
+
+ + ☆ Exploring Straightforward Conversational Red-Teaming + + +
+ Large language models (LLMs) are increasingly used in business dialogue +systems but they pose security and ethical risks. Multi-turn conversations, +where context influences the model's behavior, can be exploited to produce +undesired responses. In this paper, we examine the effectiveness of utilizing +off-the-shelf LLMs in straightforward red-teaming approaches, where an attacker +LLM aims to elicit undesired output from a target LLM, comparing both +single-turn and conversational red-teaming tactics. Our experiments offer +insights into various usage strategies that significantly affect their +performance as red teamers. They suggest that off-the-shelf models can act as +effective red teamers and even adjust their attack strategy based on past +attempts, although their effectiveness decreases with greater alignment. + +
+
+
+
+
+ + ☆ Phrase-Level Adversarial Training for Mitigating Bias in Neural + Network-based Automatic Essay Scoring + + +
+ Automatic Essay Scoring (AES) is widely used to evaluate candidates for +educational purposes. However, due to the lack of representative data, most +existing AES systems are not robust, and their scoring predictions are biased +towards the most represented data samples. In this study, we propose a +model-agnostic phrase-level method to generate an adversarial essay set to +address the biases and robustness of AES models. Specifically, we construct an +attack test set comprising samples from the original test set and adversarially +generated samples using our proposed method. To evaluate the effectiveness of +the attack strategy and data augmentation, we conducted a comprehensive +analysis utilizing various neural network scoring models. Experimental results +show that the proposed approach significantly improves AES model performance in +the presence of adversarial examples and scenarios without such attacks. + +
+
+
+
+
+ + ☆ Selective Self-Rehearsal: A Fine-Tuning Approach to Improve + Generalization in Large Language Models + + +
+ Fine-tuning Large Language Models (LLMs) on specific datasets is a common +practice to improve performance on target tasks. However, this performance gain +often leads to overfitting, where the model becomes too specialized in either +the task or the characteristics of the training data, resulting in a loss of +generalization. This paper introduces Selective Self-Rehearsal (SSR), a +fine-tuning approach that achieves performance comparable to the standard +supervised fine-tuning (SFT) while improving generalization. SSR leverages the +fact that there can be multiple valid responses to a query. By utilizing the +model's correct responses, SSR reduces model specialization during the +fine-tuning stage. SSR first identifies the correct model responses from the +training set by deploying an appropriate LLM as a judge. Then, it fine-tunes +the model using the correct model responses and the gold response for the +remaining samples. The effectiveness of SSR is demonstrated through experiments +on the task of identifying unanswerable queries across various datasets. The +results show that standard SFT can lead to an average performance drop of up to +$16.7\%$ on multiple benchmarks, such as MMLU and TruthfulQA. In contrast, SSR +results in close to $2\%$ drop on average, indicating better generalization +capabilities compared to standard SFT. + +
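The selection step in SSR, keeping the model's own responses when a judge accepts them and falling back to the gold responses otherwise, can be sketched with generic callables. The judge and generator below are placeholders rather than the paper's exact setup:

```python
def build_ssr_dataset(train_set, generate, judge):
    """Assemble a Selective Self-Rehearsal style fine-tuning set.

    train_set: list of (query, gold_response) pairs
    generate:  callable query -> the current model's own response
    judge:     callable (query, response) -> bool, an LLM-as-judge verdict
    For queries the model already answers correctly, fine-tune on the model's
    own response (reducing specialization); otherwise use the gold response.
    """
    ssr_data = []
    for query, gold in train_set:
        model_response = generate(query)
        target = model_response if judge(query, model_response) else gold
        ssr_data.append({"query": query, "target": target})
    return ssr_data

# Toy example: the "model" answers arithmetic, the "judge" checks exact match.
generate = lambda q: str(eval(q))               # stand-in model
judge = lambda q, r: r.strip() == str(eval(q))  # stand-in judge
data = build_ssr_dataset([("1+1", "2"), ("2*3", "6")], generate, judge)
print(data)
```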
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ LoCa: Logit Calibration for Knowledge Distillation ECAI 2024 + + +
+ Knowledge Distillation (KD), which aims to train a better student model by mimicking a teacher model, plays an important role in model compression. One typical approach is to align the output logits. However, we identify a common issue, which we call mis-instruction: the student is misled when predictions based on the teacher's logits do not follow the ground-truth labels. Meanwhile, the logits also contain other useful dark knowledge, such as class discriminability, which is vital for distillation. In this paper, we propose a simple yet effective Logit Calibration (LoCa) method, which calibrates the logits from the teacher model based on the ground-truth labels. The key insight is to correct the prediction (to address the mis-instruction issue) while simultaneously preserving the useful dark knowledge. Our proposed LoCa does not require any additional parameters. Empirical results on image classification and text generation tasks demonstrate that LoCa can effectively improve the performance of baselines. + +
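The abstract does not spell out the calibration rule, so the sketch below shows one plausible way to correct mis-instructed teacher logits while keeping the ordering of the non-target classes; this is an assumption for illustration, not necessarily the paper's exact formulation. Whenever the teacher's top prediction disagrees with the label, the target logit is raised just above the current maximum:

```python
import torch
import torch.nn.functional as F

def calibrate_teacher_logits(teacher_logits, labels, margin=1e-3):
    """Correct teacher logits whose argmax disagrees with the ground truth.

    teacher_logits: (batch, num_classes) raw teacher outputs
    labels:         (batch,) ground-truth class indices
    The target-class logit is lifted just above the current maximum, so the
    corrected prediction follows the label while the relative ordering of the
    remaining classes (the "dark knowledge") is untouched.
    """
    calibrated = teacher_logits.clone()
    wrong = calibrated.argmax(dim=1) != labels          # mis-instructed samples
    if wrong.any():
        rows = wrong.nonzero(as_tuple=True)[0]
        max_logit = calibrated[rows].max(dim=1).values
        calibrated[rows, labels[rows]] = max_logit + margin
    return calibrated

def kd_loss(student_logits, teacher_logits, labels, T=4.0):
    """Standard KL-based distillation loss on calibrated teacher logits."""
    teacher = calibrate_teacher_logits(teacher_logits, labels)
    p_teacher = F.softmax(teacher / T, dim=1)
    log_p_student = F.log_softmax(student_logits / T, dim=1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * T * T
```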
+
+ comment: Accepted by ECAI 2024 +
+
+
+
+
+ + ☆ Untie the Knots: An Efficient Data Augmentation Strategy for + Long-Context Pre-Training in Language Models + + +
+ Large language models (LLMs) have prioritized expanding the context window from which models can incorporate more information. However, training models to handle long contexts presents significant challenges. These include the scarcity of high-quality natural long-context data, the potential for performance degradation on short-context tasks, and the reduced training efficiency associated with attention mechanisms. In this paper, we introduce Untie the Knots (\textbf{UtK}), a novel data augmentation strategy employed during the continued pre-training phase, designed to efficiently equip LLMs with long-context capabilities without the need to modify the existing data mixture. In particular, we chunk the documents, shuffle the chunks, and create a complex and knotted structure of long texts; LLMs are then trained to untie these knots and identify relevant segments within seemingly chaotic token sequences. This approach greatly improves the model's performance by accurately attending to relevant information in long contexts, and it also largely increases training efficiency. We conduct extensive experiments on models with 7B and 72B parameters, trained on 20 billion tokens, demonstrating that UtK achieves 75\% and 84.5\% accuracy on RULER at 128K context length, significantly outperforming other long-context strategies. The trained models will be open-sourced for further research. + +
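The chunk-shuffle step described above can be pictured with a short sketch; the chunk size, separator tag, and the decision to interleave chunks from several documents are illustrative assumptions rather than the paper's exact recipe:

```python
import random

def knot_documents(documents, chunk_tokens=256, seed=0, sep="<|chunk|>"):
    """Chunk several tokenized documents, tag the chunks, and shuffle them.

    documents: list of token lists (one per document)
    Returns a single "knotted" token sequence in which chunks from different
    documents are interleaved; a model trained on it must learn to re-associate
    the chunks that belong together (the "untie the knots" objective).
    """
    rng = random.Random(seed)
    chunks = []
    for doc_id, tokens in enumerate(documents):
        for start in range(0, len(tokens), chunk_tokens):
            piece = tokens[start:start + chunk_tokens]
            # Keep provenance so a training target can refer to it later.
            chunks.append((doc_id, start // chunk_tokens, piece))
    rng.shuffle(chunks)

    knotted = []
    for doc_id, chunk_id, piece in chunks:
        knotted.append(f"{sep}doc{doc_id}:part{chunk_id}")
        knotted.extend(piece)
    return knotted

# Example with two tiny "documents" of integer token ids.
docs = [list(range(1000)), list(range(2000, 2600))]
print(knot_documents(docs, chunk_tokens=256)[:3])
```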
+
+
+
+
+ + ☆ Late Chunking: Contextual Chunk Embeddings Using Long-Context Embedding + Models + + +
+ Many use cases require retrieving smaller portions of text, and dense +vector-based retrieval systems often perform better with shorter text segments, +as the semantics are less likely to be "over-compressed" in the embeddings. +Consequently, practitioners often split text documents into smaller chunks and +encode them separately. However, chunk embeddings created in this way can lose +contextual information from surrounding chunks, resulting in suboptimal +representations. In this paper, we introduce a novel method called "late +chunking," which leverages long context embedding models to first embed all +tokens of the long text, with chunking applied after the transformer model and +just before mean pooling. The resulting chunk embeddings capture the full +contextual information, leading to superior results across various retrieval +tasks without the need for additional training. Moreover, our method is generic +enough to be applied to any long-context embedding model. + +
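The key move described above, pooling after the transformer rather than before, can be sketched independently of any particular embedding model. Here the token embeddings are assumed to come from a long-context encoder applied to the full document in one pass, and the chunk boundaries are given as token spans:

```python
import numpy as np

def late_chunk_embeddings(token_embeddings, chunk_spans):
    """Mean-pool contextualized token embeddings per chunk ("late chunking").

    token_embeddings: (num_tokens, dim) hidden states from a long-context
        embedding model run over the ENTIRE document, so every token already
        carries context from surrounding chunks.
    chunk_spans: list of (start, end) token indices, one per chunk.
    Returns one embedding per chunk.
    """
    return np.stack([token_embeddings[s:e].mean(axis=0) for s, e in chunk_spans])

# Contrast with "early" chunking, where each chunk would be encoded in
# isolation and lose cross-chunk context. Toy example:
tokens = np.random.rand(1024, 768)           # stand-in for encoder outputs
spans = [(0, 256), (256, 512), (512, 1024)]  # e.g., sentence/paragraph spans
chunk_vecs = late_chunk_embeddings(tokens, spans)
print(chunk_vecs.shape)  # (3, 768)
```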
+
+ comment: 4 pages, early draft +
+
+
+
+
+ + ☆ QueryBuilder: Human-in-the-Loop Query Development for Information + Retrieval + + +
+ Frequently, users of an Information Retrieval (IR) system start with an +overarching information need (a.k.a., an analytic task) and proceed to define +finer-grained queries covering various important aspects (i.e., sub-topics) of +that analytic task. We present a novel, interactive system called +$\textit{QueryBuilder}$, which allows a novice, English-speaking user to create +queries with a small amount of effort, through efficient exploration of an +English development corpus in order to rapidly develop cross-lingual +information retrieval queries corresponding to the user's information needs. +QueryBuilder performs near real-time retrieval of documents based on +user-entered search terms; the user looks through the retrieved documents and +marks sentences as relevant to the information needed. The marked sentences are +used by the system as additional information in query formation and refinement: +query terms (and, optionally, event features, which capture event $'triggers'$ +(indicator terms) and agent/patient roles) are appropriately weighted, and a +neural-based system, which better captures textual meaning, retrieves other +relevant content. The process of retrieval and marking is repeated as many +times as desired, giving rise to increasingly refined queries in each +iteration. The final product is a fine-grained query used in Cross-Lingual +Information Retrieval (CLIR). Our experiments using analytic tasks and requests +from the IARPA BETTER IR datasets show that with a small amount of effort (at +most 10 minutes per sub-topic), novice users can form $\textit{useful}$ +fine-grained queries including in languages they don't understand. QueryBuilder +also provides beneficial capabilities to the traditional corpus exploration and +query formation process. A demonstration video is released at +https://vimeo.com/734795835 + +
+
+
+
+
+ + ♻ ☆ Synthetic Dataset for Evaluating Complex Compositional Knowledge for + Natural Language Inference ACL 2023 + + +
+ We introduce a synthetic dataset called Sentences Involving Complex +Compositional Knowledge (SICCK) and a novel analysis that investigates the +performance of Natural Language Inference (NLI) models to understand +compositionality in logic. We produce 1,304 sentence pairs by modifying 15 +examples from the SICK dataset (Marelli et al., 2014). To this end, we modify +the original texts using a set of phrases - modifiers that correspond to +universal quantifiers, existential quantifiers, negation, and other concept +modifiers in Natural Logic (NL) (MacCartney, 2009). We use these phrases to +modify the subject, verb, and object parts of the premise and hypothesis. +Lastly, we annotate these modified texts with the corresponding entailment +labels following NL rules. We conduct a preliminary verification of how well +the change in the structural and semantic composition is captured by neural NLI +models, in both zero-shot and fine-tuned scenarios. We found that the +performance of NLI models under the zero-shot setting is poor, especially for +modified sentences with negation and existential quantifiers. After fine-tuning +this dataset, we observe that models continue to perform poorly over negation, +existential and universal modifiers. + +
+
+ comment: Accepted to Natural Language Reasoning and Structured Explanations + (NLRSE) Workshop, ACL 2023. For dataset, please refer + https://github.com/sushmaakoju/clulab-releases/blob/master/acl2023-nlrse-sicck/README.md + and https://github.com/sushmaakoju/acl2023-nlrse-clulab-SICCK-dataset +
+
+
+
+
+ + ♻ ☆ Towards Generative Class Prompt Learning for Fine-grained Visual + Recognition BMVC 2024 + + +
+ Although foundational vision-language models (VLMs) have proven to be very successful for various semantic discrimination tasks, they still struggle to perform faithfully for fine-grained categorization. Moreover, foundational models trained on one domain do not generalize well on a different domain without fine-tuning. We attribute these shortcomings to the limitations of the VLMs' semantic representations and attempt to improve their fine-grained visual awareness using generative modeling. Specifically, we propose two novel methods: Generative Class Prompt Learning (GCPL) and Contrastive Multi-class Prompt Learning (CoMPLe). Utilizing text-to-image diffusion models, GCPL significantly improves the visio-linguistic synergy in class embeddings by conditioning on few-shot exemplars with learnable class prompts. CoMPLe builds on this foundation by introducing a contrastive learning component that encourages inter-class separation during the generative optimization process. Our empirical results demonstrate that such a generative class prompt learning approach substantially outperforms existing methods, offering a better alternative for few-shot image recognition challenges. The source code will be made available at: https://github.com/soumitri2001/GCPL. + +
+
+ comment: Accepted in BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Hierarchical Tree-structured Knowledge Graph For Academic Insight Survey + + +
+ Research surveys have always posed a challenge for beginner researchers, who lack research training. These researchers struggle to understand the directions within their research topic and to discover new research findings within a short time. One way to provide intuitive assistance to beginner researchers is by offering relevant knowledge graphs (KGs) and recommending related academic papers. However, existing navigation knowledge graphs primarily rely on keywords in the research field and often fail to clearly present the logical hierarchy among multiple related papers. Moreover, most recommendation systems for academic papers simply rely on high text similarity, which can leave researchers confused as to why a particular article is being recommended. They may fail to grasp important information about the insight connection between "Issue resolved" and "Issue finding" that they hope to obtain. To address these issues, this study aims to support research insight surveys for beginner researchers by establishing a hierarchical tree-structured knowledge graph that reflects the inheritance insight of research topics and the relevance insight among the academic papers. + +
+
+ comment: This paper has been accepted by 'The 18TH International Conference on + INnovations in Intelligent SysTems and Applications (INISTA 2024)' +
+
+
+
+
+ + ♻ ☆ The Fellowship of the LLMs: Multi-Agent Workflows for Synthetic + Preference Optimization Dataset Generation + + +
+ This paper presents synthetic Preference Optimization (PO) datasets generated +using multi-agent workflows and evaluates the effectiveness and potential of +these workflows in the dataset generation process. PO dataset generation +requires two modules: (1) response evaluation, and (2) response generation. In +the response evaluation module, the responses from Large Language Models (LLMs) +are evaluated and ranked - a task typically carried out by human annotators +that we automate using LLMs. We assess the response evaluation module in a 2 +step process. In step 1, we assess LLMs as evaluators using three distinct +prompting strategies. In step 2, we apply the winning prompting strategy to +compare the performance of LLM-as-a-Judge, LLMs-as-a-Jury, and LLM Debate. In +each step, we use inter-rater agreement using Cohen's Kappa between human +annotators and LLMs. For the response generation module, we compare different +configurations for the LLM Feedback Loop using the identified LLM evaluator +configuration. We use the win rate (the fraction of times a generation +framework is selected as the best by an LLM evaluator) to determine the best +multi-agent configuration for generation. After identifying the best +configurations for both modules, we use models from the GPT, Gemma, and Llama +families to generate our PO datasets using the above pipeline. We generate two +types of PO datasets, one to improve the generation capabilities of individual +LLM and the other to improve the multi-agent workflow. Our evaluation shows +that GPT-4o-as-a-Judge is more consistent across datasets when the candidate +responses do not include responses from the GPT family. Additionally, we find +that the LLM Feedback Loop, with Llama as the generator and Gemma as the +reviewer, achieves a notable 71.8% and 73.8% win rate over single-agent Llama +and Gemma, respectively. + +
+
+
+
+
+ + ♻ ☆ Generalists vs. Specialists: Evaluating Large Language Models for Urdu + + +
+ In this paper, we compare general-purpose pretrained models, GPT-4-Turbo and +Llama-3-8b-Instruct with special-purpose models fine-tuned on specific tasks, +XLM-Roberta-large, mT5-large, and Llama-3-8b-Instruct. We focus on seven +classification and six generation tasks to evaluate the performance of these +models on Urdu language. Urdu has 70 million native speakers, yet it remains +underrepresented in Natural Language Processing (NLP). Despite the frequent +advancements in Large Language Models (LLMs), their performance in low-resource +languages, including Urdu, still needs to be explored. We also conduct a human +evaluation for the generation tasks and compare the results with the +evaluations performed by GPT-4-Turbo and Llama-3-8b-Instruct. We find that +special-purpose models consistently outperform general-purpose models across +various tasks. We also find that the evaluation done by GPT-4-Turbo for +generation tasks aligns more closely with human evaluation compared to the +evaluation by Llama-3-8b-Instruct. This paper contributes to the NLP community +by providing insights into the effectiveness of general and specific-purpose +LLMs for low-resource languages. + +
+
+
+
+
+ + ♻ ☆ Leveraging Open Knowledge for Advancing Task Expertise in Large Language + Models + + +
+ The cultivation of expertise for large language models (LLMs) to solve tasks +of specific areas often requires special-purpose tuning with calibrated +behaviors on the expected stable outputs. To avoid huge cost brought by manual +preparation of instruction datasets and training resources up to hundreds of +hours, the exploitation of open knowledge including a wealth of low rank +adaptation (LoRA) models and instruction datasets serves as a good starting +point. However, existing methods on model and data selection focus on the +performance of general-purpose capabilities while neglecting the knowledge gap +exposed in domain-specific deployment. In the present study, we propose to +bridge such gap by introducing few human-annotated samples (i.e., K-shot) for +advancing task expertise of LLMs with open knowledge. Specifically, we develop +an efficient and scalable pipeline to cost-efficiently produce task experts +where K-shot data intervene in selecting the most promising expert candidates +and the task-relevant instructions. A mixture-of-expert (MoE) system is built +to make the best use of individual-yet-complementary knowledge between multiple +experts. We unveil the two keys to the success of a MoE system, 1) the abidance +by K-shot, and 2) the insistence on diversity. For the former, we ensure that +models that truly possess problem-solving abilities on K-shot are selected +rather than those blind guessers. Besides, during data selection, instructions +that share task-relevant contexts with K-shot are prioritized. For the latter, +we highlight the diversity of constituting experts and that of the fine-tuning +instructions throughout the model and data selection process. Extensive +experimental results confirm the superiority of our approach over existing +methods on utilization of open knowledge across various tasks. Our codes will +be available at https://github.com/Yaphabates/Rocket. + +
+
+ comment: 29 pages, 12 tables, 10 figures +
+
+
+
+
+ + ♻ ☆ Emilia: An Extensive, Multilingual, and Diverse Speech Dataset for + Large-Scale Speech Generation + + +
+ Recent advancements in speech generation models have been significantly +driven by the use of large-scale training data. However, producing highly +spontaneous, human-like speech remains a challenge due to the scarcity of +large, diverse, and spontaneous speech datasets. In response, we introduce +Emilia, the first large-scale, multilingual, and diverse speech generation +dataset. Emilia starts with over 101k hours of speech across six languages, +covering a wide range of speaking styles to enable more natural and spontaneous +speech generation. To facilitate the scale-up of Emilia, we also present +Emilia-Pipe, the first open-source preprocessing pipeline designed to +efficiently transform raw, in-the-wild speech data into high-quality training +data with speech annotations. Experimental results demonstrate the +effectiveness of both Emilia and Emilia-Pipe. Demos are available at: +https://emilia-dataset.github.io/Emilia-Demo-Page/. + +
+
+ comment: Accepted in SLT 2024. Dataset available: + https://huggingface.co/datasets/amphion/Emilia-Dataset +
+
+
+
+
+ + ♻ ☆ The representation landscape of few-shot learning and fine-tuning in + large language models + + +
+ In-context learning (ICL) and supervised fine-tuning (SFT) are two common +strategies for improving the performance of modern large language models (LLMs) +on specific tasks. Despite their different natures, these strategies often lead +to comparable performance gains. However, little is known about whether they +induce similar representations inside LLMs. We approach this problem by +analyzing the probability landscape of their hidden representations in the two +cases. More specifically, we compare how LLMs solve the same question-answering +task, finding that ICL and SFT create very different internal structures, in +both cases undergoing a sharp transition in the middle of the network. In the +first half of the network, ICL shapes interpretable representations +hierarchically organized according to their semantic content. In contrast, the +probability landscape obtained with SFT is fuzzier and semantically mixed. In +the second half of the model, the fine-tuned representations develop +probability modes that better encode the identity of answers, while the +landscape of ICL representations is characterized by less defined peaks. Our +approach reveals the diverse computational strategies developed inside LLMs to +solve the same task across different conditions, allowing us to make a step +towards designing optimal methods to extract information from language models. + +
+
+
+
+
+ + ♻ ☆ Narrow Transformer: StarCoder-Based Java-LM For Desktop + + +
+ This paper presents NT-Java-1.1B, an open-source specialized code language model built on StarCoderBase-1.1B, designed for coding tasks in the Java programming language. NT-Java-1.1B achieves state-of-the-art performance, surpassing its base model and the majority of other models of similar size on the MultiPL-E Java code benchmark. While there have been studies on extending large, generic pre-trained models to improve proficiency in specific programming languages like Python, similar investigations on small code models for other programming languages are lacking. Large code models require specialized hardware like GPUs for inference, highlighting the need for research into building small code models that can be deployed on developer desktops. This paper addresses this research gap by focusing on the development of a small Java code model, NT-Java-1.1B, and its quantized versions, which perform comparably to open models around 1.1B parameters on MultiPL-E Java code benchmarks, making them ideal for desktop deployment. This paper establishes the foundation for specialized models across languages and sizes for a family of NT models. + +
+
+ comment: Updated Authors list +
+
+
+
+
+ + ♻ ☆ Cross-Data Knowledge Graph Construction for LLM-enabled Educational + Question-Answering System: A Case Study at HCMUT + + +
+ In today's rapidly evolving landscape of Artificial Intelligence, large +language models (LLMs) have emerged as a vibrant research topic. LLMs find +applications in various fields and contribute significantly. Despite their +powerful language capabilities, similar to pre-trained language models (PLMs), +LLMs still face challenges in remembering events, incorporating new +information, and addressing domain-specific issues or hallucinations. To +overcome these limitations, researchers have proposed Retrieval-Augmented +Generation (RAG) techniques, some others have proposed the integration of LLMs +with Knowledge Graphs (KGs) to provide factual context, thereby improving +performance and delivering more accurate feedback to user queries. + Education plays a crucial role in human development and progress. With the +technology transformation, traditional education is being replaced by digital +or blended education. Therefore, educational data in the digital environment is +increasing day by day. Data in higher education institutions are diverse, +comprising various sources such as unstructured/structured text, relational +databases, web/app-based API access, etc. Constructing a Knowledge Graph from +these cross-data sources is not a simple task. This article proposes a method +for automatically constructing a Knowledge Graph from multiple data sources and +discusses some initial applications (experimental trials) of KG in conjunction +with LLMs for question-answering tasks. + +
+
+ comment: 8 pages, 7 figures, Accepted at AIQAM '24: Proceedings of the 1st ACM + Workshop on AI-Powered Q&A Systems for Multimedia +
+
+
+
+
+ + ♻ ☆ Pixels and Predictions: Potential of GPT-4V in Meteorological Imagery + Analysis and Forecast Communication + + +
+ Generative AI, such as OpenAI's GPT-4V large-language model, has rapidly +entered mainstream discourse. Novel capabilities in image processing and +natural-language communication may augment existing forecasting methods. Large +language models further display potential to better communicate weather hazards +in a style honed for diverse communities and different languages. This study +evaluates GPT-4V's ability to interpret meteorological charts and communicate +weather hazards appropriately to the user, despite challenges of +hallucinations, where generative AI delivers coherent, confident, but incorrect +responses. We assess GPT-4V's competence via its web interface ChatGPT in two +tasks: (1) generating a severe-weather outlook from weather-chart analysis and +conducting self-evaluation, revealing an outlook that corresponds well with a +Storm Prediction Center human-issued forecast; and (2) producing hazard +summaries in Spanish and English from weather charts. Responses in Spanish, +however, resemble direct (not idiomatic) translations from English to Spanish, +yielding poorly translated summaries that lose critical idiomatic precision +required for optimal communication. Our findings advocate for cautious +integration of tools like GPT-4V in meteorology, underscoring the necessity of +human oversight and development of trustworthy, explainable AI. + +
+
+ comment: Supplementary material PDF attached. Submitted to Artificial + Intelligence for the Earth Systems (American Meteorological Society) on 18 + April 2024 +
+
+
+
+
+ + ♻ ☆ DOCE: Finding the Sweet Spot for Execution-Based Code Generation + + +
+ Recently, a diverse set of decoding and reranking procedures have been shown to be effective for LLM-based code generation. However, a comprehensive framework that links and experimentally compares these methods is missing. We address this by proposing Decoding Objectives for Code Execution (DOCE), a comprehensive framework that includes candidate generation, $n$-best reranking, minimum Bayes risk (MBR) decoding, and self-debugging as the core components. We then study the contributions of these components through execution-based evaluation metrics. Our findings highlight the importance of execution-based methods and the gap between execution-based and execution-free methods. Furthermore, we assess the impact of filtering based on trial unit tests, a simple and effective strategy that has often been overlooked in prior works. We also propose self-debugging on multiple candidates, obtaining state-of-the-art performance on reranking for code generation. We expect our framework to provide a solid guideline for future research on code generation. + +
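Of the components listed above, MBR decoding is the easiest to make concrete: pick the candidate whose behavior agrees most, on average, with the other candidates under some execution-based similarity. The sketch below uses exact output matching on trial inputs as that similarity, which is one common choice rather than the paper's specific configuration:

```python
def mbr_select(candidates, run_candidate, trial_inputs):
    """Minimum Bayes risk selection for generated programs.

    candidates:    list of program identifiers or strings sampled from the model
    run_candidate: callable (program, input) -> output (or None on error)
    trial_inputs:  inputs used to compare candidate behavior (e.g. trial tests)
    Returns the candidate maximizing average execution agreement with the rest.
    """
    # Execute every candidate once per trial input and cache the outputs.
    outputs = [[run_candidate(c, x) for x in trial_inputs] for c in candidates]

    def agreement(i, j):
        # Fraction of trial inputs on which two candidates produce equal outputs.
        pairs = zip(outputs[i], outputs[j])
        return sum(a == b and a is not None for a, b in pairs) / len(trial_inputs)

    scores = [
        sum(agreement(i, j) for j in range(len(candidates)) if j != i)
        for i in range(len(candidates))
    ]
    return candidates[max(range(len(candidates)), key=scores.__getitem__)]

# Example with trivial "programs" represented as Python lambdas in a registry.
programs = {"p0": lambda x: x * 2, "p1": lambda x: x + x, "p2": lambda x: x * 3}
run = lambda name, x: programs[name](x)
print(mbr_select(list(programs), run, trial_inputs=[1, 2, 3]))  # "p0" (tie with "p1")
```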
+
+ comment: 10 pages (32 including appendix), 5 figures, 25 tables. To ensure + reproducibility, we present some prompts used in our experiments from + arXiv:2304.05128 that leads to text overlap +
+
+
+
+
+ + ♻ ☆ Fairness and Bias in Multimodal AI: A Survey + + +
+ The importance of addressing fairness and bias in artificial intelligence
+(AI) systems cannot be over-emphasized. Mainstream media has been awash with
+news of incidents involving stereotypes and other types of bias in many of
+these systems in recent years. In this survey, we address the relative
+scarcity of studies on fairness and bias in Large Multimodal Models (LMMs)
+compared to Large Language Models (LLMs), providing 50 examples of datasets
+and models related to both types of AI along with the challenges of bias
+affecting them. We discuss preprocessing, a category of bias mitigation that
+receives less attention than the two well-known categories in the literature,
+intrinsic and extrinsic mitigation, with particular focus on its first stage,
+which we call preuse. We critically discuss the various ways researchers are
+addressing these challenges. Our method involved two slightly different search
+queries on two reputable search engines, Google Scholar and Web of Science
+(WoS). For the queries 'Fairness and bias in Large Multimodal Models' and
+'Fairness and bias in Large Language Models', the initial results were 33,400
+and 538,000 links, respectively, on Scholar, and 4 and 50 links, respectively,
+on WoS. For reproducibility and verification, we provide links to the search
+results and the citations to all the final reviewed papers. We believe this
+work contributes to filling this gap and providing insight to researchers and
+other stakeholders on ways to address the challenges of fairness and bias in
+multimodal and language AI.
+
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ Goal-guided Generative Prompt Injection Attack on Large Language Models + + +
+ Current large language models (LLMs) provide a strong foundation for
+large-scale user-oriented natural language tasks. However, a large number of
+users can easily inject adversarial text or instructions through the user
+interface, posing security challenges for LLMs. Although there is currently a
+large amount of research on prompt injection attacks, most of these black-box
+attacks use heuristic strategies, and it is unclear how these heuristics
+relate to the attack success rate and thus how to effectively improve model
+robustness. To solve this problem, we redefine the goal of the attack: to
+maximize the KL divergence between the conditional probabilities of the clean
+text and the adversarial text. Furthermore, we prove that maximizing the KL
+divergence is equivalent to maximizing the Mahalanobis distance between the
+embedded representations $x$ and $x'$ of the clean and adversarial text when
+the conditional probability is Gaussian, and we give a quantitative
+relationship between $x$ and $x'$. We then design a simple and effective
+goal-guided generative prompt injection strategy (G2PIA) that finds an
+injection text satisfying specific constraints to approximate the optimal
+attack. Notably, our attack is a query-free black-box method with low
+computational cost. Experimental results on seven LLMs and four datasets
+demonstrate the effectiveness of our attack method.
+
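+ One way to picture the Gaussian connection is the standard identity
+$KL(N(x,\Sigma)\,\|\,N(x',\Sigma)) = \frac{1}{2}(x-x')^T\Sigma^{-1}(x-x')$,
+i.e. half the squared Mahalanobis distance under a shared covariance (this is
+an illustration of the claim, not necessarily the paper's exact derivation). A
+small numerical sketch of that distance, using generic code:
+
+```python
+# Mahalanobis distance between two embedding vectors under a shared covariance.
+import numpy as np
+
+def mahalanobis(x: np.ndarray, x_adv: np.ndarray, cov: np.ndarray) -> float:
+    diff = x - x_adv
+    return float(np.sqrt(diff @ np.linalg.inv(cov) @ diff))
+
+rng = np.random.default_rng(0)
+x, x_adv = rng.normal(size=8), rng.normal(size=8)
+cov = np.eye(8)  # with identity covariance this reduces to Euclidean distance
+print(mahalanobis(x, x_adv, cov))
+```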
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Exploring the Mystery of Influential Data for Mathematical Reasoning + + +
+ Selecting influential data for fine-tuning on downstream tasks is a key
+factor for both performance and computation efficiency. Recent works have
+shown that training with only limited data can yield superior performance on
+general tasks. However, its feasibility for mathematical reasoning tasks has
+not been validated. Going further, two open questions remain for mathematical
+reasoning: how to select influential data, and what constitutes an influential
+data composition. For the former, we propose a Quality-aware Diverse Selection
+(QaDS) strategy adaptable to mathematical reasoning. A comparison with other
+selection strategies validates the superiority of QaDS. For the latter, we
+first enlarge our setting and explore influential data compositions. A series
+of experiments highlights two findings: scaling up reasoning data is helpful,
+and so is training with general data selected by QaDS. We then define our
+optimal mixture as OpenMathMix, an influential data mixture of open-source
+data selected by QaDS. With OpenMathMix, we achieve a state-of-the-art 48.8%
+accuracy on MATH with a 7B base model. Additionally, we showcase the use of
+QaDS in creating efficient fine-tuning mixtures with various selection ratios,
+and analyze the quality of a wide range of open-source datasets, which can
+serve as a reference for future work on mathematical reasoning tasks.
+
+
+ comment: Accepted by COLM 2024 +
+
+
+
+
+ + ♻ ☆ MiniCache: KV Cache Compression in Depth Dimension for Large Language + Models + + +
+ A critical approach for efficiently deploying computationally demanding large +language models (LLMs) is Key-Value (KV) caching. The KV cache stores key-value +states of previously generated tokens, significantly reducing the need for +repetitive computations and thereby lowering latency in autoregressive +generation. However, the size of the KV cache grows linearly with sequence +length, posing challenges for applications requiring long context input and +extensive sequence generation. In this paper, we present a simple yet effective +approach, called MiniCache, to compress the KV cache across layers from a novel +depth perspective, significantly reducing the memory footprint for LLM +inference. Our approach is based on the observation that KV cache states +exhibit high similarity between the adjacent layers in the middle-to-deep +portion of LLMs. To facilitate merging, we propose disentangling the states +into the magnitude and direction components, interpolating the directions of +the state vectors while preserving their lengths unchanged. Furthermore, we +introduce a token retention strategy to keep highly distinct state pairs +unmerged, thus preserving the information with minimal additional storage +overhead. Our MiniCache is training-free and general, complementing existing KV +cache compression strategies, such as quantization and sparsity. We conduct a +comprehensive evaluation of MiniCache utilizing various models including +LLaMA-2, LLaMA-3, Phi-3, Mistral, and Mixtral across multiple benchmarks, +demonstrating its exceptional performance in achieving superior compression +ratios and high throughput. On the ShareGPT dataset, LLaMA-2-7B with 4-bit +MiniCache achieves a remarkable compression ratio of up to 5.02x, enhances +inference throughput by approximately 5x, and reduces the memory footprint by +41% compared to the FP16 full cache baseline, all while maintaining +near-lossless performance. + +
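+ A rough sketch of the magnitude/direction disentanglement described above,
+merging the KV states of two adjacent layers. This is a simplified
+interpretation with generic tensors (linear interpolation of unit directions
+rather than full SLERP), not the released MiniCache code.
+
+```python
+# Merge KV states of adjacent layers: share one interpolated direction,
+# keep each layer's own magnitudes. Simplified sketch, not official code.
+import torch
+
+def merge_kv(kv_a: torch.Tensor, kv_b: torch.Tensor, t: float = 0.5, eps: float = 1e-8):
+    """kv_a, kv_b: [batch, heads, seq, head_dim] states from adjacent layers."""
+    norm_a = kv_a.norm(dim=-1, keepdim=True)
+    norm_b = kv_b.norm(dim=-1, keepdim=True)
+    dir_a, dir_b = kv_a / (norm_a + eps), kv_b / (norm_b + eps)
+    merged_dir = (1 - t) * dir_a + t * dir_b
+    merged_dir = merged_dir / (merged_dir.norm(dim=-1, keepdim=True) + eps)
+    return merged_dir, norm_a, norm_b   # one shared direction + two magnitude sets
+
+def restore(merged_dir: torch.Tensor, norm: torch.Tensor) -> torch.Tensor:
+    # Approximate reconstruction of a layer's state from the shared direction.
+    return merged_dir * norm
+```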
+
+ comment: Project is available at https://minicache.vmv.re +
+
+
+
+
+ + ♻ ☆ PSST: A Benchmark for Evaluation-driven Text Public-Speaking Style + Transfer + + +
+ Language style is necessary for AI systems to understand and generate diverse
+human language accurately. However, previous work on text style transfer
+primarily focused on sentence-level, data-driven approaches, limiting
+exploration of potential problems in large language models (LLMs) and the
+ability to meet complex application needs. To overcome these limitations, we
+introduce a novel task called Public-Speaking Style Transfer (PSST), which
+aims to simulate how humans transform passage-level, official texts into a
+public-speaking style. Grounded in the analysis of real-world data from a
+linguistic perspective, we decompose public-speaking style into key sub-styles
+in order to pose concrete challenges and quantify the style-modeling
+capability of LLMs. For such intricate text style transfer, we further propose
+a fine-grained evaluation framework to analyze the characteristics and
+identify the problems of stylized texts. Comprehensive experiments suggest
+that current LLMs struggle to generate public-speaking texts that align with
+human preferences, primarily due to excessive stylization and loss of semantic
+information.
+
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Incorporate LLMs with Influential Recommender System + + +
+ Recommender systems have achieved increasing accuracy over the years.
+However, this precision often leads users to narrow their interests, resulting
+in issues such as limited diversity and the creation of echo chambers. Current
+research addresses these challenges through proactive recommender systems that
+recommend a sequence of items (called an influence path) to guide user
+interest toward a target item. However, existing methods struggle to construct
+a coherent influence path built from items the user is likely to enjoy. In
+this paper, we leverage Large Language Models' (LLMs) exceptional abilities in
+path planning and instruction following, introducing a novel approach named
+LLM-based Influence Path Planning (LLM-IPP). Our approach maintains coherence
+between consecutive recommendations and enhances user acceptability of the
+recommended items. To evaluate LLM-IPP, we implement various user simulators
+and metrics to measure user acceptability and path coherence. Experimental
+results demonstrate that LLM-IPP significantly outperforms traditional
+proactive recommender systems. This study pioneers the integration of LLMs
+into proactive recommender systems, offering a reliable and user-engaging
+methodology for future recommendation technologies.
+
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ☆ Debias Can be Unreliable: Mitigating Bias Issue in Evaluating Debiasing + Recommendation + + +
+ Recent work has improved recommendation models remarkably by equipping them +with debiasing methods. Due to the unavailability of fully-exposed datasets, +most existing approaches resort to randomly-exposed datasets as a proxy for +evaluating debiased models, employing traditional evaluation scheme to +represent the recommendation performance. However, in this study, we reveal +that traditional evaluation scheme is not suitable for randomly-exposed +datasets, leading to inconsistency between the Recall performance obtained +using randomly-exposed datasets and that obtained using fully-exposed datasets. +Such inconsistency indicates the potential unreliability of experiment +conclusions on previous debiasing techniques and calls for unbiased Recall +evaluation using randomly-exposed datasets. To bridge the gap, we propose the +Unbiased Recall Evaluation (URE) scheme, which adjusts the utilization of +randomly-exposed datasets to unbiasedly estimate the true Recall performance on +fully-exposed datasets. We provide theoretical evidence to demonstrate the +rationality of URE and perform extensive experiments on real-world datasets to +validate its soundness. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Late Chunking: Contextual Chunk Embeddings Using Long-Context Embedding + Models + + +
+ Many use cases require retrieving smaller portions of text, and dense +vector-based retrieval systems often perform better with shorter text segments, +as the semantics are less likely to be "over-compressed" in the embeddings. +Consequently, practitioners often split text documents into smaller chunks and +encode them separately. However, chunk embeddings created in this way can lose +contextual information from surrounding chunks, resulting in suboptimal +representations. In this paper, we introduce a novel method called "late +chunking," which leverages long context embedding models to first embed all +tokens of the long text, with chunking applied after the transformer model and +just before mean pooling. The resulting chunk embeddings capture the full +contextual information, leading to superior results across various retrieval +tasks without the need for additional training. Moreover, our method is generic +enough to be applied to any long-context embedding model. + +
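+ The core idea (embed the whole document once, pool per chunk afterwards) can
+be sketched in a few lines. The model name and the character-offset chunking
+below are illustrative assumptions, not the paper's exact setup.
+
+```python
+# Late chunking sketch: one forward pass over the full text, then mean-pool
+# token embeddings per chunk span (instead of encoding each chunk separately).
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+name = "sentence-transformers/all-MiniLM-L6-v2"  # placeholder embedding model
+tok = AutoTokenizer.from_pretrained(name)
+model = AutoModel.from_pretrained(name).eval()
+
+def late_chunk(text: str, chunk_spans: list) -> torch.Tensor:
+    """chunk_spans: list of (start_char, end_char) spans covering the text."""
+    enc = tok(text, return_offsets_mapping=True, return_tensors="pt", truncation=True)
+    offsets = enc.pop("offset_mapping")[0]             # [tokens, 2] char offsets
+    with torch.no_grad():
+        token_emb = model(**enc).last_hidden_state[0]  # [tokens, dim]
+    chunk_embs = []
+    for start, end in chunk_spans:
+        mask = (offsets[:, 0] >= start) & (offsets[:, 1] <= end) & (offsets[:, 1] > 0)
+        chunk_embs.append(token_emb[mask].mean(dim=0))
+    return torch.stack(chunk_embs)                     # one embedding per chunk
+```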
+
+ comment: 4 pages, early draft +
+
+
+
+
+ + ☆ QueryBuilder: Human-in-the-Loop Query Development for Information + Retrieval + + +
+ Frequently, users of an Information Retrieval (IR) system start with an +overarching information need (a.k.a., an analytic task) and proceed to define +finer-grained queries covering various important aspects (i.e., sub-topics) of +that analytic task. We present a novel, interactive system called +$\textit{QueryBuilder}$, which allows a novice, English-speaking user to create +queries with a small amount of effort, through efficient exploration of an +English development corpus in order to rapidly develop cross-lingual +information retrieval queries corresponding to the user's information needs. +QueryBuilder performs near real-time retrieval of documents based on +user-entered search terms; the user looks through the retrieved documents and +marks sentences as relevant to the information needed. The marked sentences are +used by the system as additional information in query formation and refinement: +query terms (and, optionally, event features, which capture event $'triggers'$ +(indicator terms) and agent/patient roles) are appropriately weighted, and a +neural-based system, which better captures textual meaning, retrieves other +relevant content. The process of retrieval and marking is repeated as many +times as desired, giving rise to increasingly refined queries in each +iteration. The final product is a fine-grained query used in Cross-Lingual +Information Retrieval (CLIR). Our experiments using analytic tasks and requests +from the IARPA BETTER IR datasets show that with a small amount of effort (at +most 10 minutes per sub-topic), novice users can form $\textit{useful}$ +fine-grained queries including in languages they don't understand. QueryBuilder +also provides beneficial capabilities to the traditional corpus exploration and +query formation process. A demonstration video is released at +https://vimeo.com/734795835 + +
+
+
+
+
+ + ♻ ☆ CoST: Contrastive Quantization based Semantic Tokenization for + Generative Recommendation RecSys'2024 + + +
+ Embedding-based retrieval serves as a dominant approach to candidate item
+matching for industrial recommender systems. With the success of generative
+AI, generative retrieval has recently emerged as a new retrieval paradigm for
+recommendation, which casts item retrieval as a generation problem. Such a
+model consists of two stages: semantic tokenization and autoregressive
+generation. The first stage involves item tokenization that constructs
+discrete semantic tokens to index items, while the second stage
+autoregressively generates semantic tokens of candidate items. Therefore,
+semantic tokenization serves as a crucial preliminary step for training
+generative recommendation models. Existing research usually employs a vector
+quantizer with reconstruction loss (e.g., RQ-VAE) to obtain semantic tokens of
+items, but this method fails to capture the essential neighborhood
+relationships that are vital for effective item modeling in recommender
+systems. In this paper, we propose a contrastive quantization-based semantic
+tokenization approach, named CoST, which harnesses both item relationships and
+semantic information to learn semantic tokens. Our experimental results
+highlight the significant impact of semantic tokenization on generative
+recommendation performance, with CoST achieving up to a 43% improvement in
+Recall@5 and a 44% improvement in NDCG@5 on the MIND dataset over previous
+baselines.
+
+
+ comment: Accepted by RecSys'2024 +
+
+
+
+
+ + ♻ ☆ Implementing Streaming algorithm and k-means clusters to RAG + + +
+ Retrieval-augmented generation (RAG) has achieved significant success in
+information retrieval to assist large language models (LLMs) because it builds
+an external knowledge database. However, it also has drawbacks: it consumes a
+lot of memory because of the enormous database, and it cannot update the
+established index database in time when confronted with massive streaming
+data. To reduce the memory required for building the database while
+maintaining accuracy, we propose a new approach that integrates a streaming
+algorithm and k-means clustering into RAG. Our approach applies a streaming
+algorithm to update the index dynamically and reduce memory consumption, while
+the k-means algorithm clusters highly similar documents, shortening query
+time. We conducted comparative experiments on four methods, and the results
+indicate that RAG with the streaming algorithm and k-means clusters
+outperforms traditional RAG in accuracy and memory, particularly when dealing
+with large-scale streaming data.
+
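+ A minimal sketch of the clustered, streaming-friendly index idea, using
+scikit-learn's MiniBatchKMeans (via partial_fit) as a stand-in for the paper's
+streaming algorithm; the embedding function, sizes, and dot-product similarity
+are assumptions.
+
+```python
+# Streaming k-means index for RAG-style retrieval (illustrative sketch).
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+
+class StreamingKMeansIndex:
+    def __init__(self, n_clusters: int = 4):
+        self.km = MiniBatchKMeans(n_clusters=n_clusters)
+        self.buckets = {i: [] for i in range(n_clusters)}  # cluster id -> (vec, doc)
+
+    def add_batch(self, vectors: np.ndarray, docs: list):
+        # Note: the first batch should contain at least n_clusters vectors.
+        self.km.partial_fit(vectors)                  # streaming centroid update
+        for vec, doc, cid in zip(vectors, docs, self.km.predict(vectors)):
+            self.buckets[int(cid)].append((vec, doc))
+
+    def query(self, qvec: np.ndarray, k: int = 3) -> list:
+        cid = int(self.km.predict(qvec[None, :])[0])  # search only the nearest cluster
+        cand = self.buckets[cid]
+        sims = np.array([vec @ qvec for vec, _ in cand])
+        return [cand[i][1] for i in np.argsort(-sims)[:k]]
+```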
+
+
+
+
+ + ♻ ☆ Video Editing for Video Retrieval + + +
+ Though pre-trained vision-language models have demonstrated significant
+benefits in boosting video-text retrieval performance from large-scale web
+videos, fine-tuning still relies on manually annotated clips with start and
+end times, which requires considerable human effort. To address this issue, we
+explore an alternative, cheaper source of annotation, single timestamps, for
+video-text retrieval. We initialise clips from timestamps in a heuristic way
+to warm up a retrieval model. A video clip editing method is then proposed to
+refine the initial rough boundaries and improve retrieval performance. A
+student-teacher network is introduced for video clip editing: the teacher
+model edits the clips in the training set, whereas the student model trains on
+the edited clips, and the teacher weights are updated from the student's once
+the student's performance improves. Our method is model-agnostic and
+applicable to any retrieval model. We conduct experiments based on three
+state-of-the-art retrieval models, COOT, VideoCLIP and CLIP4Clip. Experiments
+on three video retrieval datasets, YouCook2, DiDeMo and ActivityNet-Captions,
+show that our edited clips consistently improve retrieval performance over the
+initial clips across all three retrieval models.
+
+
+
+
+
+ + ♻ ☆ Reformulating Conversational Recommender Systems as Tri-Phase Offline + Policy Learning CIKM 2024 + + +
+ Existing Conversational Recommender Systems (CRS) predominantly utilize user +simulators for training and evaluating recommendation policies. These +simulators often oversimplify the complexity of user interactions by focusing +solely on static item attributes, neglecting the rich, evolving preferences +that characterize real-world user behavior. This limitation frequently leads to +models that perform well in simulated environments but falter in actual +deployment. Addressing these challenges, this paper introduces the Tri-Phase +Offline Policy Learning-based Conversational Recommender System (TCRS), which +significantly reduces dependency on real-time interactions and mitigates +overfitting issues prevalent in traditional approaches. TCRS integrates a +model-based offline learning strategy with a controllable user simulation that +dynamically aligns with both personalized and evolving user preferences. +Through comprehensive experiments, TCRS demonstrates enhanced robustness, +adaptability, and accuracy in recommendations, outperforming traditional CRS +models in diverse user scenarios. This approach not only provides a more +realistic evaluation environment but also facilitates a deeper understanding of +user behavior dynamics, thereby refining the recommendation process. + +
+
+ comment: Accepted at CIKM 2024 +
+
+
+
+
+ + ♻ ☆ Deep Tree-based Retrieval for Efficient Recommendation: Theory and + Method + + +
+ With the development of deep learning techniques, deep recommendation models
+have also achieved remarkable improvements in recommendation accuracy.
+However, due to the large number of candidate items in practice and the high
+cost of preference computation, these methods also suffer from low
+recommendation efficiency. The recently proposed tree-based deep
+recommendation models alleviate the problem by directly learning the tree
+structure and representations under the guidance of recommendation objectives.
+However, such models have a shortcoming: the max-heap assumption in the
+hierarchical tree, in which the preference for a parent node should be the
+maximum of the preferences for its children, is difficult to satisfy with
+their binary classification objectives. To this end, we propose Tree-based
+Deep Retrieval (TDR for short) for efficient recommendation. In TDR, all the
+trees generated during the training process are retained to form a forest.
+When learning the node representations of each tree, we need to satisfy the
+max-heap assumption as much as possible and mimic beam search behavior over
+the tree in the training stage. TDR achieves this by regarding the training
+task as multi-classification over tree nodes at the same level. However, the
+number of tree nodes grows exponentially with the level, so we train the
+preference model with the guidance of the sampled-softmax technique.
+Experiments are conducted on real-world datasets, validating the effectiveness
+of the proposed preference model learning method and tree learning method.
+
+
+
+
+
+ + ♻ ☆ YouTube Videos for Public Health Literacy? A Machine Learning Pipeline + to Curate Covid-19 Videos + + +
+ The COVID-19 pandemic has highlighted the dire necessity to improve public +health literacy for societal resilience. YouTube, the largest video-sharing +social media platform, provides a vast repository of user-generated health +information in a multi-media-rich format which may be easier for the public to +understand and use if major concerns about content quality and accuracy are +addressed. This study develops an automated solution to identify, retrieve and +shortlist medically relevant and understandable YouTube videos that domain +experts can subsequently review and recommend for disseminating and educating +the public on the COVID-19 pandemic and similar public health outbreaks. Our +approach leverages domain knowledge from human experts and machine learning and +natural language processing methods to provide a scalable, replicable, and +generalizable approach that can also be applied to enhance the management of +many health conditions. + +
+
+ comment: Studies in health technology and informatics(MedInfo) 2023 +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ POINTS: Improving Your Vision-language Model with Affordable Strategies + + +
+ In recent years, vision-language models have made significant strides, +excelling in tasks like optical character recognition and geometric +problem-solving. However, several critical issues remain: 1) Proprietary models +often lack transparency about their architectures, while open-source models +need more detailed ablations of their training strategies. 2) Pre-training data +in open-source works is under-explored, with datasets added empirically, making +the process cumbersome. 3) Fine-tuning often focuses on adding datasets, +leading to diminishing returns. To address these issues, we propose the +following contributions: 1) We trained a robust baseline model using the latest +advancements in vision-language models, introducing effective improvements and +conducting comprehensive ablation and validation for each technique. 2) +Inspired by recent work on large language models, we filtered pre-training data +using perplexity, selecting the lowest perplexity data for training. This +approach allowed us to train on a curated 1M dataset, achieving competitive +performance. 3) During visual instruction tuning, we used model soup on +different datasets when adding more datasets yielded marginal improvements. +These innovations resulted in a 9B parameter model that performs competitively +with state-of-the-art models. Our strategies are efficient and lightweight, +making them easily adoptable by the community. + +
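+ The perplexity-based pre-training data filtering mentioned above can be
+sketched as follows; the scoring model ("gpt2") and the keep ratio are
+illustrative placeholders, not the paper's actual settings.
+
+```python
+# Score each candidate document with a small causal LM; keep the lowest-
+# perplexity fraction for pre-training (illustrative sketch).
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+name = "gpt2"  # placeholder scoring model
+tok = AutoTokenizer.from_pretrained(name)
+lm = AutoModelForCausalLM.from_pretrained(name).eval()
+
+@torch.no_grad()
+def perplexity(text: str) -> float:
+    enc = tok(text, return_tensors="pt", truncation=True, max_length=512)
+    out = lm(**enc, labels=enc["input_ids"])
+    return float(torch.exp(out.loss))           # loss is the mean token NLL
+
+def filter_by_perplexity(docs: list, keep_ratio: float = 0.5) -> list:
+    ranked = sorted(docs, key=perplexity)       # lowest perplexity first
+    return ranked[: max(1, int(len(docs) * keep_ratio))]
+```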
+
+
+
+
+ + ♻ ☆ Multimodal Speech Enhancement Using Burst Propagation + + +
+ This paper proposes MBURST, a novel multimodal solution for audio-visual
+speech enhancement that considers the most recent neurological discoveries
+regarding pyramidal cells of the prefrontal cortex and other brain regions.
+The so-called burst propagation implements several criteria to address the
+credit assignment problem in a more biologically plausible manner: steering
+the sign and magnitude of plasticity through feedback, multiplexing the
+feedback and feedforward information across layers through different weight
+connections, approximating feedback and feedforward connections, and
+linearizing the feedback signals. MBURST benefits from such capabilities to
+learn correlations between the noisy signal and the visual stimuli, thus
+attributing meaning to the speech by amplifying relevant information and
+suppressing noise. Experiments conducted on a Grid Corpus and CHiME3-based
+dataset show that MBURST can reproduce similar mask reconstructions to the
+multimodal backpropagation-based baseline while demonstrating outstanding
+energy efficiency, reducing neuron firing rates to values up to 70% lower.
+Such a feature implies more sustainable implementations, suitable and
+desirable for hearing aids or any other similar embedded systems.
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 57 + +
+
+
+ + ☆ RLPF: Reinforcement Learning from Prediction Feedback for User + Summarization with LLMs + + +
+ LLM-powered personalization agent systems employ Large Language Models
+(LLMs) to predict users' behavior from their past activities. However, their
+effectiveness often hinges on the ability to leverage extensive user
+histories, which are inherently long and noisy. Existing pretrained LLMs may
+generate summaries that are concise but lack the necessary context for
+downstream tasks, hindering their utility in personalization systems. To
+address these challenges, we introduce Reinforcement Learning from Prediction
+Feedback (RLPF). RLPF fine-tunes LLMs to generate concise, human-readable user
+summaries that are optimized for downstream task performance. By maximizing
+the usefulness of the generated summaries, RLPF effectively distills extensive
+user history data while preserving essential information for downstream tasks.
+Our empirical evaluation demonstrates significant improvements in both
+extrinsic downstream task utility and intrinsic summary quality, surpassing
+baseline methods by up to 22% on downstream task performance and achieving up
+to an 84.59% win rate on Factuality, Abstractiveness, and Readability. RLPF
+also achieves a remarkable 74% reduction in context length while improving
+performance on 16 out of 19 unseen tasks and/or datasets, showcasing its
+generalizability. This approach offers a promising solution for enhancing LLM
+personalization by effectively transforming long, noisy user histories into
+informative and human-readable representations.
+
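+ The prediction-feedback idea can be illustrated with a tiny sketch: the
+reward for a generated summary is whether a frozen predictor, given only that
+summary, recovers the held-out next user action, minus a small length penalty.
+The predictor callable and penalty weight are assumptions for illustration,
+not the paper's exact reward.
+
+```python
+# Sketch of a prediction-feedback reward for user-summary generation.
+from typing import Callable
+
+def rlpf_reward(summary: str, true_next_action: str,
+                predict_next_action: Callable[[str], str],
+                length_penalty: float = 0.001) -> float:
+    predicted = predict_next_action(summary)   # e.g. an LLM prompted with the summary
+    correct = 1.0 if predicted == true_next_action else 0.0
+    return correct - length_penalty * len(summary.split())
+```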
+
+
+
+
+ + ☆ Empirical Bayesian image restoration by Langevin sampling with a + denoising diffusion implicit prior + + +
+ Score-based diffusion methods provide a powerful strategy to solve image +restoration tasks by flexibly combining a pre-trained foundational prior model +with a likelihood function specified during test time. Such methods are +predominantly derived from two stochastic processes: reversing +Ornstein-Uhlenbeck, which underpins the celebrated denoising diffusion +probabilistic models (DDPM) and denoising diffusion implicit models (DDIM), and +the Langevin diffusion process. The solutions delivered by DDPM and DDIM are +often remarkably realistic, but they are not always consistent with +measurements because of likelihood intractability issues and the associated +required approximations. Alternatively, using a Langevin process circumvents +the intractable likelihood issue, but usually leads to restoration results of +inferior quality and longer computing times. This paper presents a novel and +highly computationally efficient image restoration method that carefully embeds +a foundational DDPM denoiser within an empirical Bayesian Langevin algorithm, +which jointly calibrates key model hyper-parameters as it estimates the model's +posterior mean. Extensive experimental results on three canonical tasks (image +deblurring, super-resolution, and inpainting) demonstrate that the proposed +approach improves on state-of-the-art strategies both in image estimation +accuracy and computing time. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ AGR: Age Group fairness Reward for Bias Mitigation in LLMs + + +
+ LLMs can exhibit age biases, resulting in unequal treatment of individuals
+across age groups. While much research has addressed racial and gender biases,
+age bias remains little explored. The scarcity of instruction-tuning and
+preference datasets for age bias hampers its detection and measurement, and
+existing fine-tuning methods seldom address age-related fairness. In this
+paper, we construct age bias preference datasets and instruction-tuning
+datasets for RLHF. We introduce AGR, an age group fairness reward, to reduce
+differences in the response quality of LLMs across different age groups.
+Extensive experiments demonstrate that this reward significantly improves
+response accuracy and reduces performance disparities across age groups. Our
+source code and datasets are available at the anonymous
+\href{https://anonymous.4open.science/r/FairRLHF-D445/readme.md}{link}.
+
+
+ comment: The first two authors contributed equally to this work. Corresponding + to Zhiqiang Wang. ACKNOWLEDGMENT: we would like to thank the computing + resources support from the State Key Laboratory of New Computer Software + Technologies at Nanjing University +
+
+
+
+
+ + ☆ Learning vs Retrieval: The Role of In-Context Examples in Regression + with LLMs + + +
+ Generative Large Language Models (LLMs) are capable of being in-context +learners. However, the underlying mechanism of in-context learning (ICL) is +still a major research question, and experimental research results about how +models exploit ICL are not always consistent. In this work, we propose a +framework for evaluating in-context learning mechanisms, which we claim are a +combination of retrieving internal knowledge and learning from in-context +examples by focusing on regression tasks. First, we show that LLMs can perform +regression on real-world datasets and then design experiments to measure the +extent to which the LLM retrieves its internal knowledge versus learning from +in-context examples. We argue that this process lies on a spectrum between +these two extremes. We provide an in-depth analysis of the degrees to which +these mechanisms are triggered depending on various factors, such as prior +knowledge about the tasks and the type and richness of the information provided +by the in-context examples. We employ three LLMs and utilize multiple datasets +to corroborate the robustness of our findings. Our results shed light on how to +engineer prompts to leverage meta-learning from in-context examples and foster +knowledge retrieval depending on the problem being addressed. + +
+
+
+
+
+ + ☆ Using Large Language Models to Generate Authentic Multi-agent Knowledge + Work Datasets + + +
+ Current publicly available knowledge work data collections lack diversity, +extensive annotations, and contextual information about the users and their +documents. These issues hinder objective and comparable data-driven evaluations +and optimizations of knowledge work assistance systems. Due to the considerable +resources needed to collect such data in real-life settings and the necessity +of data censorship, collecting such a dataset appears nearly impossible. For +this reason, we propose a configurable, multi-agent knowledge work dataset +generator. This system simulates collaborative knowledge work among agents +producing Large Language Model-generated documents and accompanying data +traces. Additionally, the generator captures all background information, given +in its configuration or created during the simulation process, in a knowledge +graph. Finally, the resulting dataset can be utilized and shared without +privacy or confidentiality concerns. + This paper introduces our approach's design and vision and focuses on +generating authentic knowledge work documents using Large Language Models. Our +study involving human raters who assessed 53% of the generated and 74% of the +real documents as realistic demonstrates the potential of our approach. +Furthermore, we analyze the authenticity criteria mentioned in the +participants' comments and elaborate on potential improvements for identified +common issues. + +
+
+ comment: Accepted and in press (INFORMATIK Festival, Wiesbaden, 2024) +
+
+
+
+
+ + ☆ Open Language Data Initiative: Advancing Low-Resource Machine + Translation for Karakalpak + + +
+ This study presents several contributions for the Karakalpak language: a +FLORES+ devtest dataset translated to Karakalpak, parallel corpora for +Uzbek-Karakalpak, Russian-Karakalpak and English-Karakalpak of 100,000 pairs +each and open-sourced fine-tuned neural models for translation across these +languages. Our experiments compare different model variants and training +approaches, demonstrating improvements over existing baselines. This work, +conducted as part of the Open Language Data Initiative (OLDI) shared task, aims +to advance machine translation capabilities for Karakalpak and contribute to +expanding linguistic diversity in NLP technologies. + +
+
+ comment: Submitted to WMT 2024 +
+
+
+
+
+ + ☆ An overview of domain-specific foundation model: key technologies, + applications and challenges + + +
+ The impressive performance of ChatGPT and other foundation-model-based +products in human language understanding has prompted both academia and +industry to explore how these models can be tailored for specific industries +and application scenarios. This process, known as the customization of +domain-specific foundation models, addresses the limitations of general-purpose +models, which may not fully capture the unique patterns and requirements of +domain-specific data. Despite its importance, there is a notable lack of +comprehensive overview papers on building domain-specific foundation models, +while numerous resources exist for general-purpose models. To bridge this gap, +this article provides a timely and thorough overview of the methodology for +customizing domain-specific foundation models. It introduces basic concepts, +outlines the general architecture, and surveys key methods for constructing +domain-specific models. Furthermore, the article discusses various domains that +can benefit from these specialized models and highlights the challenges ahead. +Through this overview, we aim to offer valuable guidance and reference for +researchers and practitioners from diverse fields to develop their own +customized foundation models. + +
+
+
+
+
+ + ☆ Fast Forwarding Low-Rank Training + + +
+ Parameter efficient finetuning methods like low-rank adaptation (LoRA) aim to +reduce the computational costs of finetuning pretrained Language Models (LMs). +Enabled by these low-rank settings, we propose an even more efficient +optimization strategy: Fast Forward, a simple and effective approach to +accelerate large segments of training. In a Fast Forward stage, we repeat the +most recent optimizer step until the loss stops improving on a tiny validation +set. By alternating between regular optimization steps and Fast Forward stages, +Fast Forward provides up to an 87\% reduction in FLOPs and up to an 81\% +reduction in train time over standard SGD with Adam. We validate Fast Forward +by finetuning various models on different tasks and demonstrate that it speeds +up training without compromising model performance. Additionally, we analyze +when and how to apply Fast Forward. + +
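+ A schematic of the alternation described above: after a regular Adam step,
+the most recent parameter update is re-applied until the loss on a tiny
+validation set stops improving. The helper names and the repeat cap are
+assumptions, not the authors' implementation.
+
+```python
+# Fast Forward stage sketch: repeat the last parameter update while it helps.
+import torch
+
+def fast_forward(model, last_update, tiny_val_loss, max_repeats: int = 50):
+    """last_update: list of per-parameter deltas from the most recent optimizer
+    step (p_after - p_before); tiny_val_loss: callable returning a float."""
+    best = tiny_val_loss(model)
+    for _ in range(max_repeats):
+        with torch.no_grad():
+            for p, delta in zip(model.parameters(), last_update):
+                p.add_(delta)                       # re-apply the same step
+        loss = tiny_val_loss(model)
+        if loss >= best:                            # stopped improving: undo, exit
+            with torch.no_grad():
+                for p, delta in zip(model.parameters(), last_update):
+                    p.sub_(delta)
+            break
+        best = loss
+```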
+
+
+
+
+ + ☆ Residual Stream Analysis with Multi-Layer SAEs + + +
+ Sparse autoencoders (SAEs) are a promising approach to interpreting the +internal representations of transformer language models. However, standard SAEs +are trained separately on each transformer layer, making it difficult to use +them to study how information flows across layers. To solve this problem, we +introduce the multi-layer SAE (MLSAE): a single SAE trained on the residual +stream activation vectors from every transformer layer simultaneously. The +residual stream is usually understood as preserving information across layers, +so we expected to, and did, find individual SAE features that are active at +multiple layers. Interestingly, while a single SAE feature is active at +different layers for different prompts, for a single prompt, we find that a +single feature is far more likely to be active at a single layer. For larger +underlying models, we find that the cosine similarities between adjacent layers +in the residual stream are higher, so we expect more features to be active at +multiple layers. These results show that MLSAEs are a promising method to study +information flow in transformers. We release our code to train and analyze +MLSAEs at https://github.com/tim-lawson/mlsae. + +
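+ The setup can be pictured as a single sparse autoencoder whose training batch
+simply stacks residual-stream vectors from every layer. Sizes and the L1
+coefficient below are illustrative, not the paper's configuration.
+
+```python
+# One SAE trained on residual-stream activations pooled across all layers.
+import torch
+import torch.nn as nn
+
+class SparseAutoencoder(nn.Module):
+    def __init__(self, d_model: int = 768, d_hidden: int = 8 * 768):
+        super().__init__()
+        self.enc = nn.Linear(d_model, d_hidden)
+        self.dec = nn.Linear(d_hidden, d_model)
+
+    def forward(self, x):                   # x: [batch, d_model]
+        feats = torch.relu(self.enc(x))     # sparse feature activations
+        return self.dec(feats), feats
+
+def mlsae_loss(sae, resid_per_layer, l1_coef: float = 1e-3):
+    """resid_per_layer: [n_layers, batch, d_model] residual-stream vectors."""
+    x = resid_per_layer.reshape(-1, resid_per_layer.shape[-1])  # pool layers
+    x_hat, feats = sae(x)
+    return ((x_hat - x) ** 2).mean() + l1_coef * feats.abs().mean()
+```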
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ☆ GALLa: Graph Aligned Large Language Models for Improved Source Code + Understanding + + +
+ Programming languages possess rich semantic information such as data flow +that is represented by graphs and not available from the surface form of source +code. Recent code language models have scaled to billions of parameters, but +model source code solely as text tokens while ignoring any other structural +information. Conversely, models that do encode structural information of code +make modifications to the Transformer architecture, limiting their scale and +compatibility with pretrained LLMs. In this work, we take the best of both +worlds with GALLa - Graph Aligned Large Language Model. GALLa utilizes graph +neural networks and cross-modal alignment technologies to inject the structural +information of code into LLMs as an auxiliary task during finetuning. This +framework is both model-agnostic and task-agnostic, as it can be applied to any +code LLM for any code downstream task, and requires the structural graph data +only at training time from a corpus unrelated to the finetuning data, while +incurring no cost at inference time over the baseline LLM. Experiments on five +code tasks with four different baseline LLMs ranging in size from 350M to 8B +validate the effectiveness of GALLa, demonstrating consistent improvement over +the baseline, even for powerful models such as LLaMA3. + +
+
+
+
+
+ + ☆ Combining LLMs and Knowledge Graphs to Reduce Hallucinations in Question + Answering + + +
+ Advancements in natural language processing have revolutionized the way we +can interact with digital information systems, such as databases, making them +more accessible. However, challenges persist, especially when accuracy is +critical, as in the biomedical domain. A key issue is the hallucination +problem, where models generate information unsupported by the underlying data, +potentially leading to dangerous misinformation. This paper presents a novel +approach designed to bridge this gap by combining Large Language Models (LLM) +and Knowledge Graphs (KG) to improve the accuracy and reliability of +question-answering systems, on the example of a biomedical KG. Built on the +LangChain framework, our method incorporates a query checker that ensures the +syntactical and semantic validity of LLM-generated queries, which are then used +to extract information from a Knowledge Graph, substantially reducing errors +like hallucinations. We evaluated the overall performance using a new benchmark +dataset of 50 biomedical questions, testing several LLMs, including GPT-4 Turbo +and llama3:70b. Our results indicate that while GPT-4 Turbo outperforms other +models in generating accurate queries, open-source models like llama3:70b show +promise with appropriate prompt engineering. To make this approach accessible, +a user-friendly web-based interface has been developed, allowing users to input +natural language queries, view generated and corrected Cypher queries, and +verify the resulting paths for accuracy. Overall, this hybrid approach +effectively addresses common issues such as data gaps and hallucinations, +offering a reliable and intuitive solution for question answering systems. The +source code for generating the results of this paper and for the user-interface +can be found in our Git repository: https://git.zib.de/lpusch/cyphergenkg-gui + +
+
+
+
+
+ + ☆ From Calculation to Adjudication: Examining LLM judges on Mathematical + Reasoning Tasks + + +
+ To reduce the need for human annotations, large language models (LLMs) have +been proposed as judges of the quality of other candidate models. LLM judges +are typically evaluated by measuring the correlation with human judgments on +generation tasks such as summarization or machine translation. In contrast, we +study LLM judges on mathematical reasoning tasks. These tasks require +multi-step reasoning, and the correctness of their solutions is verifiable, +enabling a more objective evaluation. We perform a detailed performance +analysis and find that the used judges are mostly unable to improve task +performance but are able to pick the better model. Our analysis uncovers a +strong correlation between judgment performance and the candidate model task +performance. We observe that judges tend to choose the model of higher quality +even if its answer is incorrect. Further, we show that it is possible to use +statistics, such as the task performances of the individual models, to predict +judgment performance. In an ablation, we either swap or mask the candidate +answers and observe that judges often keep the original judgment, providing +evidence that judges incorporate writing style in their judgments. In summary, +we find that regularities in the judgments are quantifiable using statistical +measures and provide various angles on exploiting them. + +
+
+
+
+
+ + ☆ Can OpenSource beat ChatGPT? -- A Comparative Study of Large Language + Models for Text-to-Code Generation + + +
+ In recent years, large language models (LLMs) have emerged as powerful tools
+with potential applications in various fields, including software engineering.
+Within the scope of this research, we evaluate five different state-of-the-art
+LLMs - Bard, BingChat, ChatGPT, Llama2, and Code Llama - concerning their
+capabilities for text-to-code generation. In an empirical study, we feed
+prompts with textual descriptions of coding problems sourced from the
+programming website LeetCode to the models with the task of creating solutions
+in Python. Subsequently, the quality of the generated outputs is assessed
+using the testing functionalities of LeetCode. The results indicate large
+differences in performance between the investigated models. ChatGPT handles
+these typical programming challenges by far the most effectively, surpassing
+even code-specialized models like Code Llama. To gain further insights, we
+measure the runtime as well as the memory usage of the generated outputs and
+compare them to the other code submissions on LeetCode. A detailed error
+analysis, encompassing a comparison of differences in the indentation and form
+of the generated code as well as an assignment of the incorrectly solved tasks
+to error categories, allows us to obtain a more nuanced picture of the results
+and the potential for improvement. The results also show a clear pattern of
+increasingly incorrect code as the models face more context in the form of
+longer prompts.
+
+
+ comment: Conference Paper accepted at the 9th SwissText Conference (2024) +
+
+
+
+
+ + ☆ A Coin Has Two Sides: A Novel Detector-Corrector Framework for Chinese + Spelling Correction ECAI-2024 + + +
+ Chinese Spelling Correction (CSC) stands as a foundational Natural Language
+Processing (NLP) task, which primarily focuses on the correction of erroneous
+characters in Chinese texts. Certain existing methodologies opt to disentangle
+the error correction process, employing an additional error detector to
+pinpoint error positions. However, owing to the inherent performance
+limitations of the error detector, precision and recall are like the two sides
+of a coin, which cannot both face up at the same time. Furthermore, it is also
+worth investigating how the error position information can be judiciously
+applied to assist error correction. In this paper, we introduce a novel
+approach based on a detector-corrector framework. Our detector is designed to
+yield two error detection results, characterized by high precision and high
+recall, respectively. Given that the occurrence of errors is context-dependent
+and detection outcomes may be less precise, we incorporate the error detection
+results into the CSC task using an innovative feature fusion strategy and a
+selective masking strategy. Empirical experiments conducted on mainstream CSC
+datasets substantiate the efficacy of our proposed method.
+
+
+ comment: ECAI-2024 +
+
+
+
+
+ + ☆ Prompt-based Personality Profiling: Reinforcement Learning for Relevance + Filtering + + +
+ Author profiling is the task of inferring characteristics about individuals +by analyzing content they share. Supervised machine learning still dominates +automatic systems that perform this task, despite the popularity of prompting +large language models to address natural language understanding tasks. One +reason is that the classification instances consist of large amounts of posts, +potentially a whole user profile, which may exceed the input length of +Transformers. Even if a model can use a large context window, the entirety of +posts makes the application of API-accessed black box systems costly and slow, +next to issues which come with such "needle-in-the-haystack" tasks. To mitigate +this limitation, we propose a new method for author profiling which aims at +distinguishing relevant from irrelevant content first, followed by the actual +user profiling only with relevant data. To circumvent the need for +relevance-annotated data, we optimize this relevance filter via reinforcement +learning with a reward function that utilizes the zero-shot capabilities of +large language models. We evaluate our method for Big Five personality trait +prediction on two Twitter corpora. On publicly available real-world data with a +skewed label distribution, our method shows similar efficacy to using all posts +in a user profile, but with a substantially shorter context. An evaluation on a +version of these data balanced with artificial posts shows that the filtering +to relevant posts leads to a significantly improved accuracy of the +predictions. + +
+
+ comment: preprint, under review, supplementary material will be made available + upon acceptance of the paper +
+
+
+
+
+ + ☆ Confidence-Aware Document OCR Error Detection + + +
+ Optical Character Recognition (OCR) continues to face accuracy challenges +that impact subsequent applications. To address these errors, we explore the +utility of OCR confidence scores for enhancing post-OCR error detection. Our +study involves analyzing the correlation between confidence scores and error +rates across different OCR systems. We develop ConfBERT, a BERT-based model +that incorporates OCR confidence scores into token embeddings and offers an +optional pre-training phase for noise adjustment. Our experimental results +demonstrate that integrating OCR confidence scores can enhance error detection +capabilities. This work underscores the importance of OCR confidence scores in +improving detection accuracy and reveals substantial disparities in performance +between commercial and open-source OCR technologies. + +
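+ One plausible way to fold per-token OCR confidence scores into a BERT-style
+encoder is to project the scalar confidence into the hidden dimension and add
+it to the word embeddings. The sketch below reflects that reading, not
+necessarily the exact ConfBERT architecture.
+
+```python
+# Confidence-aware token embeddings for a BERT encoder (illustrative sketch).
+import torch
+import torch.nn as nn
+from transformers import BertModel
+
+class ConfidenceAwareEncoder(nn.Module):
+    def __init__(self, name: str = "bert-base-uncased"):
+        super().__init__()
+        self.bert = BertModel.from_pretrained(name)
+        hidden = self.bert.config.hidden_size
+        self.conf_proj = nn.Linear(1, hidden)   # scalar confidence -> hidden size
+
+    def forward(self, input_ids, attention_mask, confidences):
+        # confidences: [batch, seq] OCR confidence in [0, 1] for each token
+        tok_emb = self.bert.embeddings.word_embeddings(input_ids)
+        conf_emb = self.conf_proj(confidences.unsqueeze(-1))
+        out = self.bert(inputs_embeds=tok_emb + conf_emb,
+                        attention_mask=attention_mask)
+        return out.last_hidden_state
+```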
+
+
+
+
+ + ☆ Multi-Programming Language Ensemble for Code Generation in Large + Language Model + + +
+ Large language models (LLMs) have significantly improved code generation, +particularly in one-pass code generation. However, most existing approaches +focus solely on generating code in a single programming language, overlooking +the potential of leveraging the multi-language capabilities of LLMs. LLMs have +varying patterns of errors across different languages, suggesting that a more +robust approach could be developed by leveraging these multi-language outputs. +In this study, we propose Multi-Programming Language Ensemble (MPLE), a novel +ensemble-based method that utilizes code generation across multiple programming +languages to enhance overall performance. By treating each language-specific +code generation process as an individual "weak expert" and effectively +integrating their outputs, our method mitigates language-specific errors and +biases. This multi-language ensemble strategy leverages the complementary +strengths of different programming languages, enabling the model to produce +more accurate and robust code. Our approach can be seamlessly integrated with +commonly used techniques such as the reflection algorithm and Monte Carlo tree +search to improve code generation quality further. Experimental results show +that our framework consistently enhances baseline performance by up to 17.92% +on existing benchmarks (HumanEval and HumanEval-plus), with a standout result +of 96.25% accuracy on the HumanEval benchmark, achieving new state-of-the-art +results across various LLM models. The code will be released at +https://github.com/NinjaTech-AI/MPLE + +
+
+ comment: Code available at https://github.com/NinjaTech-AI/MPLE +
+
+
+
+
+ + ☆ Can LLMs Generate Novel Research Ideas? A Large-Scale Human Study with + 100+ NLP Researchers + + +
+ Recent advancements in large language models (LLMs) have sparked optimism +about their potential to accelerate scientific discovery, with a growing number +of works proposing research agents that autonomously generate and validate new +ideas. Despite this, no evaluations have shown that LLM systems can take the +very first step of producing novel, expert-level ideas, let alone perform the +entire research process. We address this by establishing an experimental design +that evaluates research idea generation while controlling for confounders and +performs the first head-to-head comparison between expert NLP researchers and +an LLM ideation agent. By recruiting over 100 NLP researchers to write novel +ideas and blind reviews of both LLM and human ideas, we obtain the first +statistically significant conclusion on current LLM capabilities for research +ideation: we find LLM-generated ideas are judged as more novel (p < 0.05) than +human expert ideas while being judged slightly weaker on feasibility. Studying +our agent baselines closely, we identify open problems in building and +evaluating research agents, including failures of LLM self-evaluation and their +lack of diversity in generation. Finally, we acknowledge that human judgements +of novelty can be difficult, even by experts, and propose an end-to-end study +design which recruits researchers to execute these ideas into full projects, +enabling us to study whether these novelty and feasibility judgements result in +meaningful differences in research outcome. + +
+
+ comment: main paper is 20 pages +
+
+
+
+
+ + ☆ Structure and dynamics of growing networks of Reddit threads + + +
+ Millions of people use online social networks to reinforce their sense of
+belonging, for example by giving and asking for feedback as a form of social
+validation and self-recognition. It is common to observe disagreement among
+people's beliefs and points of view when they express this feedback. Modeling
+and analyzing such interactions is crucial to understand social phenomena that
+happen when people face different opinions while expressing and discussing
+their values. In this work, we study a Reddit community in which people
+participate to judge or be judged with respect to some behavior, as it
+represents a valuable source to study how users express judgments online. We
+model threads of this community as complex networks of user interactions
+growing in time, and we analyze the evolution of their structural properties.
+We show that the evolution of Reddit networks differs from that of other real
+social networks, despite falling in the same category. This happens because
+their global clustering coefficient is extremely small and the average
+shortest path length increases over time. Such properties reveal how users
+discuss in threads, i.e., mostly with one other user and often via a single
+message. We strengthen this result by analyzing the role that disagreement and
+reciprocity play in such conversations. We also show that Reddit threads'
+evolution over time is governed by two subgraphs growing at different speeds.
+We discover that, in the studied community, the difference between these
+speeds is higher than in other communities because the user guidelines enforce
+specific user interactions. Finally, we interpret the obtained results on user
+behavior by drawing on Social Judgment Theory.
+
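+ The two structural quantities discussed above can be computed directly with
+networkx; the toy interaction graph below is purely illustrative, not the
+Reddit data.
+
+```python
+# Global clustering coefficient and average shortest path length with networkx.
+import networkx as nx
+
+g = nx.Graph()
+g.add_edges_from([("op", "u1"), ("op", "u2"), ("op", "u3"), ("u1", "u2")])
+
+global_clustering = nx.transitivity(g)          # global clustering coefficient
+avg_path = nx.average_shortest_path_length(g)   # requires a connected graph
+print(global_clustering, avg_path)
+```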
+
+ comment: 29 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ UI-JEPA: Towards Active Perception of User Intent through Onscreen User + Activity + + +
+ Generating user intent from a sequence of user interface (UI) actions is a +core challenge in comprehensive UI understanding. Recent advancements in +multimodal large language models (MLLMs) have led to substantial progress in +this area, but their demands for extensive model parameters, computing power, +and high latency makes them impractical for scenarios requiring lightweight, +on-device solutions with low latency or heightened privacy. Additionally, the +lack of high-quality datasets has hindered the development of such lightweight +models. To address these challenges, we propose UI-JEPA, a novel framework that +employs masking strategies to learn abstract UI embeddings from unlabeled data +through self-supervised learning, combined with an LLM decoder fine-tuned for +user intent prediction. We also introduce two new UI-grounded multimodal +datasets, "Intent in the Wild" (IIW) and "Intent in the Tame" (IIT), designed +for few-shot and zero-shot UI understanding tasks. IIW consists of 1.7K videos +across 219 intent categories, while IIT contains 914 videos across 10 +categories. We establish the first baselines for these datasets, showing that +representations learned using a JEPA-style objective, combined with an LLM +decoder, can achieve user intent predictions that match the performance of +state-of-the-art large MLLMs, but with significantly reduced annotation and +deployment resources. Measured by intent similarity scores, UI-JEPA outperforms +GPT-4 Turbo and Claude 3.5 Sonnet by 10.0% and 7.2% respectively, averaged +across two datasets. Notably, UI-JEPA accomplishes the performance with a 50.5x +reduction in computational cost and a 6.6x improvement in latency in the IIW +dataset. These results underscore the effectiveness of UI-JEPA, highlighting +its potential for lightweight, high-performance UI understanding. + +
+
+
+
+
+ + ☆ AnyMatch -- Efficient Zero-Shot Entity Matching with a Small Language + Model + + +
+ Entity matching (EM) is the problem of determining whether two records refer +to same real-world entity, which is crucial in data integration, e.g., for +product catalogs or address databases. A major drawback of many EM approaches +is their dependence on labelled examples. We thus focus on the challenging +setting of zero-shot entity matching where no labelled examples are available +for an unseen target dataset. Recently, large language models (LLMs) have shown +promising results for zero-shot EM, but their low throughput and high +deployment cost limit their applicability and scalability. + We revisit the zero-shot EM problem with AnyMatch, a small language model +fine-tuned in a transfer learning setup. We propose several novel data +selection techniques to generate fine-tuning data for our model, e.g., by +selecting difficult pairs to match via an AutoML filter, by generating +additional attribute-level examples, and by controlling label imbalance in the +data. + We conduct an extensive evaluation of the prediction quality and deployment +cost of our model, in a comparison to thirteen baselines on nine benchmark +datasets. We find that AnyMatch provides competitive prediction quality despite +its small parameter size: it achieves the second-highest F1 score overall, and +outperforms several other approaches that employ models with hundreds of +billions of parameters. Furthermore, our approach exhibits major cost benefits: +the average prediction quality of AnyMatch is within 4.4% of the +state-of-the-art method MatchGPT with the proprietary trillion-parameter model +GPT-4, yet AnyMatch requires four orders of magnitude less parameters and +incurs a 3,899 times lower inference cost (in dollars per 1,000 tokens). + +
+
+ comment: 12 pages excluding references, 3 figures, and 5 tables +
+
+
+
+
+ + ☆ Self-Harmonized Chain of Thought + + +
+ Chain-of-Thought (CoT) prompting reveals that large language models are capable of performing complex reasoning via intermediate
+ steps. CoT prompting is primarily categorized into three approaches. The first approach utilizes straightforward prompts like
+ "Let's think step by step" to generate a sequential thought process before yielding an answer. The second approach makes use of
+ human-crafted, step-by-step demonstrations to guide the model's reasoning process. The third automates the generation of reasoned
+ demonstrations using the "Let's think step by step" prompt. This approach sometimes leads to reasoning errors, highlighting the
+ need to diversify demonstrations to mitigate its misleading effects. However, diverse demonstrations pose challenges for effective
+ representations. In this work, we propose ECHO, a self-harmonized chain-of-thought prompting method. It consolidates diverse
+ solution paths into a uniform and effective solution pattern. ECHO demonstrates the best overall performance across three
+ reasoning domains.
+
+
+
+
+
+ + ☆ Refining Wikidata Taxonomy using Large Language Models + + +
+ Due to its collaborative nature, Wikidata is known to have a complex +taxonomy, with recurrent issues like the ambiguity between instances and +classes, the inaccuracy of some taxonomic paths, the presence of cycles, and +the high level of redundancy across classes. Manual efforts to clean up this +taxonomy are time-consuming and prone to errors or subjective decisions. We +present WiKC, a new version of Wikidata taxonomy cleaned automatically using a +combination of Large Language Models (LLMs) and graph mining techniques. +Operations on the taxonomy, such as cutting links or merging classes, are +performed with the help of zero-shot prompting on an open-source LLM. The +quality of the refined taxonomy is evaluated from both intrinsic and extrinsic +perspectives, on a task of entity typing for the latter, showing the practical +interest of WiKC. + +
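+ A zero-shot prompt for a single taxonomy-refinement decision could look like the sketch below; the wording and the KEEP/CUT/MERGE
+ answer space are illustrative assumptions, not the exact prompts used to build WiKC.
+
+ def taxonomy_prompt(child: str, parent: str) -> str:
+     """Ask an (open-source) LLM whether a subclass link should be kept, cut, or merged."""
+     return (
+         f'In a clean taxonomy, should "{child}" be a subclass of "{parent}"?\n'
+         'Answer with exactly one word: KEEP, CUT, or MERGE '
+         '(MERGE if the two classes denote the same concept).'
+     )
+
+ print(taxonomy_prompt("racing bicycle", "vehicle"))
+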
+
+ comment: ACM International Conference on Information and Knowledge Management, + Oct 2024, Boise, Idaho, United States +
+
+
+
+
+ + ☆ Towards Safer Online Spaces: Simulating and Assessing Intervention + Strategies for Eating Disorder Discussions + + +
+ Eating disorders are complex mental health conditions that affect millions of +people around the world. Effective interventions on social media platforms are +crucial, yet testing strategies in situ can be risky. We present a novel +LLM-driven experimental testbed for simulating and assessing intervention +strategies in ED-related discussions. Our framework generates synthetic +conversations across multiple platforms, models, and ED-related topics, +allowing for controlled experimentation with diverse intervention approaches. +We analyze the impact of various intervention strategies on conversation +dynamics across four dimensions: intervention type, generative model, social +media platform, and ED-related community/topic. We employ cognitive domain +analysis metrics, including sentiment, emotions, etc., to evaluate the +effectiveness of interventions. Our findings reveal that civility-focused +interventions consistently improve positive sentiment and emotional tone across +all dimensions, while insight-resetting approaches tend to increase negative +emotions. We also uncover significant biases in LLM-generated conversations, +with cognitive metrics varying notably between models (Claude-3 Haiku $>$ +Mistral $>$ GPT-3.5-turbo $>$ LLaMA3) and even between versions of the same +model. These variations highlight the importance of model selection in +simulating realistic discussions related to ED. Our work provides valuable +information on the complex dynamics of ED-related discussions and the +effectiveness of various intervention strategies. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ Large Margin Prototypical Network for Few-shot Relation Classification + with Fine-grained Features CIKM'19 + + +
+ Relation classification (RC) plays a pivotal role in both natural language understanding and knowledge graph completion. It is
+ generally formulated as a task to recognize the relationship between two entities of interest appearing in a free-text sentence.
+ Conventional approaches to RC, whether based on feature engineering or deep learning, can obtain promising performance on
+ categorizing common relation types, but they leave a large proportion of long-tail relations unrecognized due to insufficient
+ labeled instances for training. In this paper, we argue that few-shot learning is of great practical significance to RC and thus
+ improve a modern metric-learning framework for few-shot RC. Specifically, we adopt the large-margin ProtoNet with fine-grained
+ features, expecting it to generalize well to long-tail relations. Extensive experiments were conducted on FewRel, a large-scale
+ supervised few-shot RC dataset, to evaluate our framework: LM-ProtoNet (FGF). The results demonstrate that it can achieve
+ substantial improvements over many baseline approaches.
+
+
+ comment: Accepted by CIKM'19 +
+
+
+
+
+ + ☆ On The Role of Prompt Construction In Enhancing Efficacy and Efficiency + of LLM-Based Tabular Data Generation + + +
+ LLM-based data generation for real-world tabular data can be challenged by +the lack of sufficient semantic context in feature names used to describe +columns. We hypothesize that enriching prompts with domain-specific insights +can improve both the quality and efficiency of data generation. To test this +hypothesis, we explore three prompt construction protocols: Expert-guided, +LLM-guided, and Novel-Mapping. Through empirical studies with the recently +proposed GReaT framework, we find that context-enriched prompts lead to +significantly improved data generation quality and training efficiency. + +
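+ The idea of enriching prompts with domain context can be made concrete with a small sketch that attaches human-readable
+ descriptions to cryptic column names before serializing a row; the column names, descriptions, and output format below are
+ hypothetical and not necessarily those used with the GReaT framework.
+
+ column_context = {                     # hypothetical domain descriptions
+     "chol": "serum cholesterol in mg/dL",
+     "thalach": "maximum heart rate achieved",
+     "cp": "chest pain type (0-3)",
+ }
+
+ def enriched_prompt(row: dict) -> str:
+     parts = [f"{col} ({column_context.get(col, 'no description')}) is {val}"
+              for col, val in row.items()]
+     return ", ".join(parts) + "."
+
+ print(enriched_prompt({"chol": 233, "thalach": 150, "cp": 3}))
+ # -> "chol (serum cholesterol in mg/dL) is 233, thalach (maximum heart rate achieved) is 150, ..."
+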
+
+
+
+
+ + ☆ Sparse Rewards Can Self-Train Dialogue Agents + + +
+ Recent advancements in state-of-the-art (SOTA) Large Language Model (LLM) +agents, especially in multi-turn dialogue tasks, have been primarily driven by +supervised fine-tuning and high-quality human feedback. However, as base LLM +models continue to improve, acquiring meaningful human feedback has become +increasingly challenging and costly. In certain domains, base LLM agents may +eventually exceed human capabilities, making traditional feedback-driven +methods impractical. In this paper, we introduce a novel self-improvement +paradigm that empowers LLM agents to autonomously enhance their performance +without external human feedback. Our method, Juxtaposed Outcomes for Simulation +Harvesting (JOSH), is a self-alignment algorithm that leverages a sparse reward +simulation environment to extract ideal behaviors and further train the LLM on +its own outputs. We present ToolWOZ, a sparse reward tool-calling simulation +environment derived from MultiWOZ. We demonstrate that models trained with +JOSH, both small and frontier, significantly improve tool-based interactions +while preserving general model capabilities across diverse benchmarks. Our code +and data are publicly available on GitHub. + +
+
+ comment: Minor but nontrivial changes likely +
+
+
+
+
+ + ☆ BPE Gets Picky: Efficient Vocabulary Refinement During Tokenizer + Training + + +
+ Language models can largely benefit from efficient tokenization. However, +they still mostly utilize the classical BPE algorithm, a simple and reliable +method. This has been shown to cause such issues as under-trained tokens and +sub-optimal compression that may affect the downstream performance. We +introduce Picky BPE, a modified BPE algorithm that carries out vocabulary +refinement during tokenizer training. Our method improves vocabulary +efficiency, eliminates under-trained tokens, and does not compromise text +compression. Our experiments show that our method does not reduce the +downstream performance, and in several cases improves it. + +
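+ To make the idea of in-training vocabulary refinement concrete, here is a toy BPE trainer that drops an intermediate token once
+ it is rarely used on its own after being absorbed into a larger merge. The refinement heuristic and threshold are simplifications
+ for illustration, not the exact Picky BPE criterion.
+
+ from collections import Counter
+
+ def train_refined_bpe(corpus, num_merges=50, keep_threshold=0.1):
+     words = Counter(tuple(w) + ("</w>",) for w in corpus.split())
+     vocab = {s for w in words for s in w}
+     for _ in range(num_merges):
+         pairs = Counter()
+         for w, f in words.items():
+             for a, b in zip(w, w[1:]):
+                 pairs[(a, b)] += f
+         if not pairs:
+             break
+         (a, b), _ = pairs.most_common(1)[0]
+         new_sym = a + b
+         vocab.add(new_sym)
+         merged = Counter()
+         for w, f in words.items():                 # apply the merge everywhere
+             out, i = [], 0
+             while i < len(w):
+                 if i + 1 < len(w) and (w[i], w[i + 1]) == (a, b):
+                     out.append(new_sym); i += 2
+                 else:
+                     out.append(w[i]); i += 1
+             merged[tuple(out)] += f
+         words = merged
+         standalone = Counter()                     # how often each symbol still occurs on its own
+         for w, f in words.items():
+             for s in w:
+                 standalone[s] += f
+         for parent in (a, b):                      # refinement: drop rarely-used parent tokens
+             if len(parent) > 1 and standalone[parent] < keep_threshold * standalone[new_sym]:
+                 vocab.discard(parent)              # a real implementation would also re-segment its occurrences
+     return vocab, words
+
+ vocab, segmented = train_refined_bpe("low lower lowest low low slow slower", num_merges=10)
+ print(sorted(vocab))
+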
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Paper Copilot: A Self-Evolving and Efficient LLM System for Personalized + Academic Assistance + + +
+ As scientific research proliferates, researchers face the daunting task of +navigating and reading vast amounts of literature. Existing solutions, such as +document QA, fail to provide personalized and up-to-date information +efficiently. We present Paper Copilot, a self-evolving, efficient LLM system +designed to assist researchers, based on thought-retrieval, user profile and +high performance optimization. Specifically, Paper Copilot can offer +personalized research services, maintaining a real-time updated database. +Quantitative evaluation demonstrates that Paper Copilot saves 69.92\% of time +after efficient deployment. This paper details the design and implementation of +Paper Copilot, highlighting its contributions to personalized academic support +and its potential to streamline the research process. + +
+
+
+
+
+ + ☆ Customizing Large Language Model Generation Style using + Parameter-Efficient Finetuning + + +
+ One-size-fits-all large language models (LLMs) are increasingly being used to +help people with their writing. However, the style these models are trained to +write in may not suit all users or use cases. LLMs would be more useful as +writing assistants if their idiolect could be customized to match each user. In +this paper, we explore whether parameter-efficient finetuning (PEFT) with +Low-Rank Adaptation can effectively guide the style of LLM generations. We use +this method to customize LLaMA-2 to ten different authors and show that the +generated text has lexical, syntactic, and surface alignment with the target +author but struggles with content memorization. Our findings highlight the +potential of PEFT to support efficient, user-level customization of LLMs. + +
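+ A minimal LoRA setup of the kind described above can be written with the Hugging Face peft and transformers libraries, as
+ sketched below; the model identifier, target modules, and hyperparameters are placeholders rather than the paper's actual settings.
+
+ from transformers import AutoModelForCausalLM
+ from peft import LoraConfig, get_peft_model
+
+ base_id = "meta-llama/Llama-2-7b-hf"          # placeholder; any causal LM id works
+ model = AutoModelForCausalLM.from_pretrained(base_id)
+
+ lora_cfg = LoraConfig(
+     r=8, lora_alpha=16, lora_dropout=0.05,
+     target_modules=["q_proj", "v_proj"],      # adapt only the attention projections
+     task_type="CAUSAL_LM",
+ )
+ model = get_peft_model(model, lora_cfg)
+ model.print_trainable_parameters()            # only a tiny fraction of weights is trained
+ # ...then fine-tune on one author's text with any standard causal-LM training loop.
+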
+
+
+
+
+ + ☆ How Does Code Pretraining Affect Language Model Task Performance? + + +
+ Large language models are increasingly trained on corpora containing both natural language and non-linguistic data like source
+ code. Aside from aiding programming-related tasks, anecdotal evidence suggests that including code in pretraining corpora may
+ improve performance on other, unrelated tasks, yet to date no work has been able to establish a causal connection by controlling
+ the mixture of language and code data. Here we do just that. We pretrain language models on datasets which interleave natural
+ language and code in two different settings: additive, in which the total volume of data seen during pretraining is held constant;
+ and competitive, in which the volume of language data is held constant. We study how the pretraining mixture affects performance
+ on (a) a diverse collection of tasks included in the BigBench benchmark, and (b) compositionality, measured by generalization
+ accuracy on semantic parsing and syntactic transformations. We find that pretraining on higher proportions of code improves
+ performance on compositional tasks involving structured output (like semantic parsing), and mathematics. Conversely, increasing
+ the code mixture can harm performance on other tasks, including tasks that require sensitivity to linguistic structure such as
+ syntax or morphology, and tasks measuring real-world knowledge.
+
+
+
+
+
+ + ☆ Chain-of-Translation Prompting (CoTR): A Novel Prompting Technique for + Low Resource Languages + + +
+ This paper introduces Chain of Translation Prompting (CoTR), a novel strategy designed to enhance the performance of language
+ models in low-resource languages. CoTR restructures prompts to first translate the input context from a low-resource language into
+ a higher-resource language, such as English. The specified task, such as generation, classification, or any other NLP function, is
+ then performed on the translated text, with the option to translate the output back to the original language if needed. All these
+ steps are specified in a single prompt. We demonstrate the effectiveness of this method through a case study on the low-resource
+ Indic language Marathi. The CoTR strategy is applied to various tasks, including sentiment analysis, hate speech classification,
+ subject classification and text generation, and its efficacy is showcased by comparing it with regular prompting methods. Our
+ results underscore the potential of translation-based prompting strategies to significantly improve multilingual LLM performance
+ in low-resource languages, offering valuable insights for future research and applications. We observe the highest accuracy
+ improvements on the hate speech detection task. The technique also has the potential to enhance the quality of synthetic data
+ generation for underrepresented languages using LLMs.
+
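+ A single-prompt sketch of the translate, solve, and translate-back pattern is shown below; the template wording is an
+ illustration, not the paper's exact prompt.
+
+ COTR_TEMPLATE = """You will work step by step in a single response.
+ Step 1: Translate the following {src_lang} text to English.
+ Step 2: Perform this task on the English translation: {task}.
+ Step 3: Translate your final answer back to {src_lang}.
+
+ Text: {text}"""
+
+ prompt = COTR_TEMPLATE.format(
+     src_lang="Marathi",
+     task="classify the sentiment as positive, negative, or neutral",
+     text="हा चित्रपट खूप छान होता!",   # "This movie was very good!"
+ )
+ print(prompt)   # send to any instruction-tuned LLM
+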
+
+
+
+
+ + ☆ 3D Data Long-Term Preservation in Cultural Heritage + + +
+ The report explores the challenges and strategies for preserving 3D digital data in cultural heritage. It discusses the issue of
+ technological obsolescence, emphasising the need for sustainable storage solutions and ongoing data management strategies. Key
+ topics include understanding technological obsolescence, the lifecycle of digital content, digital continuity, data management
+ plans (DMP), FAIR principles, and the use of public repositories. The report also covers the importance of metadata in long-term
+ digital preservation, including types of metadata and strategies for building valuable metadata. It examines the evolving
+ standards and interoperability in 3D format preservation and the importance of managing metadata and paradata. The document
+ provides a comprehensive overview of the challenges and solutions for preserving 3D cultural heritage data in the long term.
+
+
+
+
+
+ + ♻ ☆ Are LLM-based methods good enough for detecting unfair terms of service? + + +
+ Countless terms of service (ToS) are signed every day by users all over the world while interacting with all kinds of apps and
+ websites. More often than not, these online contracts spanning double-digit pages are signed blindly by users who simply want
+ immediate access to the desired service. What would normally require a consultation with a legal team has now become a mundane
+ activity consisting of a few clicks where users potentially sign away their rights, for instance in terms of their data privacy,
+ to countless online entities/companies. Large language models (LLMs) are good at parsing long text-based documents, and could
+ potentially be adopted to help users when dealing with dubious clauses in ToS and their underlying privacy policies. To
+ investigate the utility of existing models for this task, we first build a dataset consisting of 12 questions applied individually
+ to a set of privacy policies crawled from popular websites. Thereafter, a series of open-source and commercial chatbots, such as
+ ChatGPT, are queried over each question, with the answers being compared to a given ground truth. Our results show that some
+ open-source models are able to provide a higher accuracy compared to some commercial models. However, the best performance is
+ recorded from a commercial chatbot (ChatGPT4). Overall, all models perform only slightly better than random at this task.
+ Consequently, their performance needs to be significantly improved before they can be adopted at large for this purpose.
+
+
+
+
+
+ + ♻ ☆ RAG based Question-Answering for Contextual Response Prediction System CIKM'24 + + +
+ Large Language Models (LLMs) have shown versatility in various Natural +Language Processing (NLP) tasks, including their potential as effective +question-answering systems. However, to provide precise and relevant +information in response to specific customer queries in industry settings, LLMs +require access to a comprehensive knowledge base to avoid hallucinations. +Retrieval Augmented Generation (RAG) emerges as a promising technique to +address this challenge. Yet, developing an accurate question-answering +framework for real-world applications using RAG entails several challenges: 1) +data availability issues, 2) evaluating the quality of generated content, and +3) the costly nature of human evaluation. In this paper, we introduce an +end-to-end framework that employs LLMs with RAG capabilities for industry use +cases. Given a customer query, the proposed system retrieves relevant knowledge +documents and leverages them, along with previous chat history, to generate +response suggestions for customer service agents in the contact centers of a +major retail company. Through comprehensive automated and human evaluations, we +show that this solution outperforms the current BERT-based algorithms in +accuracy and relevance. Our findings suggest that RAG-based LLMs can be an +excellent support to human customer service representatives by lightening their +workload. + +
+
+ comment: Accepted at the 1st Workshop on GenAI and RAG Systems for Enterprise, + CIKM'24. 6 pages +
+
+
+
+
+ + ♻ ☆ Delving into the Utilisation of ChatGPT in Scientific Publications in + Astronomy SP + + +
+ Rapid progress in the capabilities of machine learning approaches in natural +language processing has culminated in the rise of large language models over +the last two years. Recent works have shown unprecedented adoption of these for +academic writing, especially in some fields, but their pervasiveness in +astronomy has not been studied sufficiently. To remedy this, we extract words +that ChatGPT uses more often than humans when generating academic text and +search a total of 1 million articles for them. This way, we assess the +frequency of word occurrence in published works in astronomy tracked by the +NASA Astrophysics Data System since 2000. We then perform a statistical +analysis of the occurrences. We identify a list of words favoured by ChatGPT +and find a statistically significant increase for these words against a control +group in 2024, which matches the trend in other disciplines. These results +suggest a widespread adoption of these models in the writing of astronomy +papers. We encourage organisations, publishers, and researchers to work +together to identify ethical and pragmatic guidelines to maximise the benefits +of these systems while maintaining scientific rigour. + +
+
+ comment: Accepted at SPAICE 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Biomedical Knowledge Discovery for Diseases: An Open-Source + Framework Applied on Rett Syndrome and Alzheimer's Disease + + +
+ The ever-growing volume of biomedical publications creates a critical need +for efficient knowledge discovery. In this context, we introduce an open-source +end-to-end framework designed to construct knowledge around specific diseases +directly from raw text. To facilitate research in disease-related knowledge +discovery, we create two annotated datasets focused on Rett syndrome and +Alzheimer's disease, enabling the identification of semantic relations between +biomedical entities. Extensive benchmarking explores various ways to represent +relations and entity representations, offering insights into optimal modeling +strategies for semantic relation detection and highlighting language models' +competence in knowledge discovery. We also conduct probing experiments using +different layer representations and attention scores to explore transformers' +ability to capture semantic relations. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ A Survey on Benchmarks of Multimodal Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) are gaining increasing popularity in both academia and industry due to their remarkable
+ performance in various applications such as visual question answering, visual perception, understanding, and reasoning. Over the
+ past few years, significant efforts have been made to examine MLLMs from multiple perspectives. This paper presents a
+ comprehensive review of 200 benchmarks and evaluations for MLLMs, focusing on (1) perception and understanding, (2) cognition and
+ reasoning, (3) specific domains, (4) key capabilities, and (5) other modalities. Finally, we discuss the limitations of the
+ current evaluation methods for MLLMs and explore promising future directions. Our key argument is that evaluation should be
+ regarded as a crucial discipline to support the development of MLLMs better. For more details, please visit our GitHub repository:
+ https://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey.
+
+
+
+
+
+ + ♻ ☆ Towards a Unified View of Preference Learning for Large Language Models: + A Survey + + +
+ Large Language Models (LLMs) exhibit remarkably powerful capabilities. One of +the crucial factors to achieve success is aligning the LLM's output with human +preferences. This alignment process often requires only a small amount of data +to efficiently enhance the LLM's performance. While effective, research in this +area spans multiple domains, and the methods involved are relatively complex to +understand. The relationships between different methods have been +under-explored, limiting the development of the preference alignment. In light +of this, we break down the existing popular alignment strategies into different +components and provide a unified framework to study the current alignment +strategies, thereby establishing connections among them. In this survey, we +decompose all the strategies in preference learning into four components: +model, data, feedback, and algorithm. This unified view offers an in-depth +understanding of existing alignment algorithms and also opens up possibilities +to synergize the strengths of different strategies. Furthermore, we present +detailed working examples of prevalent existing algorithms to facilitate a +comprehensive understanding for the readers. Finally, based on our unified +perspective, we explore the challenges and future research directions for +aligning large language models with human preferences. + +
+
+ comment: 23 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ FineMath: A Fine-Grained Mathematical Evaluation Benchmark for Chinese + Large Language Models + + +
+ To thoroughly assess the mathematical reasoning abilities of Large Language +Models (LLMs), we need to carefully curate evaluation datasets covering diverse +mathematical concepts and mathematical problems at different difficulty levels. +In pursuit of this objective, we propose FineMath in this paper, a fine-grained +mathematical evaluation benchmark dataset for assessing Chinese LLMs. FineMath +is created to cover the major key mathematical concepts taught in elementary +school math, which are further divided into 17 categories of math word +problems, enabling in-depth analysis of mathematical reasoning abilities of +LLMs. All the 17 categories of math word problems are manually annotated with +their difficulty levels according to the number of reasoning steps required to +solve these problems. We conduct extensive experiments on a wide range of LLMs +on FineMath and find that there is still considerable room for improvements in +terms of mathematical reasoning capability of Chinese LLMs. We also carry out +an in-depth analysis on the evaluation process and methods that have been +overlooked previously. These two factors significantly influence the model +results and our understanding of their mathematical reasoning capabilities. The +dataset will be publicly available soon. + +
+
+
+
+
+ + ♻ ☆ CogniDual Framework: Self-Training Large Language Models within a + Dual-System Theoretical Framework for Improving Cognitive Tasks + + +
+ Cognitive psychology investigates perception, attention, memory, language, +problem-solving, decision-making, and reasoning. Kahneman's dual-system theory +elucidates the human decision-making process, distinguishing between the rapid, +intuitive System 1 and the deliberative, rational System 2. Recent advancements +have positioned large language Models (LLMs) as formidable tools nearing +human-level proficiency in various cognitive tasks. Nonetheless, the presence +of a dual-system framework analogous to human cognition in LLMs remains +unexplored. This study introduces the \textbf{CogniDual Framework for LLMs} +(CFLLMs), designed to assess whether LLMs can, through self-training, evolve +from deliberate deduction to intuitive responses, thereby emulating the human +process of acquiring and mastering new information. Our findings reveal the +cognitive mechanisms behind LLMs' response generation, enhancing our +understanding of their capabilities in cognitive psychology. Practically, +self-trained models can provide faster responses to certain queries, reducing +computational demands during inference. + +
+
+
+
+
+ + ♻ ☆ QET: Enhancing Quantized LLM Parameters and KV cache Compression through + Element Substitution and Residual Clustering + + +
+ Matrix quantization entails representing matrix elements in a more space-efficient form to reduce storage usage, with
+ dequantization restoring the original matrix for use. We formulate the Quantization Error Minimization (QEM) problem as minimizing
+ the distance between a matrix before and after quantization, under the condition that the quantized matrix occupies the same
+ memory space. Matrix quantization is crucial in various applications, including Large Language Models (LLMs) weight quantization,
+ vector databases, KV cache quantization, graph compression, and image compression. Recent advancements in LLMs, such as GPT-4 and
+ BERT, have highlighted the importance of matrix compression due to the large size of parameters and KV cache, which are stored as
+ matrices.
+ We propose Quantum Entanglement Trees (QET) to address the QEM problem by leveraging the local orderliness of matrix elements,
+ involving iterative element swapping to form a locally ordered matrix. This matrix is then grouped and quantized by columns. To
+ enhance QET, we introduce two optimizations: further quantizing residuals to reduce MSE, and using masking and batch processing to
+ accelerate the algorithm.
+ Experimental results demonstrate that QET can effectively reduce MSE to 5.05%, 13.33%, and 11.89% of the current best method on
+ the LLM dataset, K cache, and V cache, respectively. Our contributions include the abstraction of the QEM problem, the design of
+ the QET algorithm, and the proposal of two optimizations to improve accuracy and speed.
+
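+ The overall pipeline sketched below -- reorder elements so columns become locally ordered, quantize column-wise, then quantize the
+ residual once more -- is a simplified stand-in for illustration, not the actual QET algorithm: a full per-row sort replaces the
+ paper's element-swapping scheme, and the permutation must be stored to invert it.
+
+ import numpy as np
+
+ def quantize_columns(m, n_bits=4):
+     """Uniformly quantize each column to n_bits; return codes plus per-column scale/offset."""
+     levels = 2 ** n_bits - 1
+     lo, hi = m.min(axis=0), m.max(axis=0)
+     scale = np.where(hi > lo, (hi - lo) / levels, 1.0)
+     codes = np.round((m - lo) / scale).astype(np.uint8)
+     return codes, scale, lo
+
+ def dequantize(codes, scale, lo):
+     return codes * scale + lo
+
+ W = np.random.randn(8, 16).astype(np.float32)
+ perm = np.argsort(W, axis=1)                         # per-row reordering (stored for inversion)
+ W_sorted = np.take_along_axis(W, perm, axis=1)
+
+ codes, s, off = quantize_columns(W_sorted, n_bits=4)
+ resid = W_sorted - dequantize(codes, s, off)
+ r_codes, rs, r_off = quantize_columns(resid, n_bits=2)   # residual quantization cuts MSE further
+
+ W_hat_sorted = dequantize(codes, s, off) + dequantize(r_codes, rs, r_off)
+ W_hat = np.empty_like(W)
+ np.put_along_axis(W_hat, perm, W_hat_sorted, axis=1)     # undo the per-row reordering
+ print("reconstruction MSE:", float(np.mean((W - W_hat) ** 2)))
+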
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: Accepted at Journal of Machine Learning Research. This paper + integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete + story. In this paper, we improve the writing and organization, and also add + conceptual, empirical, and theoretical improvements over the previous work. + V2: small typo fixes/formatting improvements. V3: improvements from journal + revisions. V4: fix figures +
+
+
+
+
+ + ♻ ☆ LLM-based multi-agent poetry generation in non-cooperative environments + + +
+ Despite substantial progress of large language models (LLMs) for automatic poetry generation, the generated poetry lacks
+ diversity, and the training process differs greatly from human learning. Under the rationale that the learning process of poetry
+ generation systems should be more human-like and their output more diverse and novel, we introduce a framework based on social
+ learning where we emphasize non-cooperative interactions besides cooperative interactions to encourage diversity. Our experiments
+ are the first attempt at LLM-based multi-agent systems in non-cooperative environments for poetry generation, employing both
+ TRAINING-BASED agents (GPT-2) and PROMPTING-BASED agents (GPT-3 and GPT-4). Our evaluation based on 96k generated poems shows that
+ our framework benefits the poetry generation process for TRAINING-BASED agents, resulting in a 3.0-3.7 percentage point (pp)
+ increase in diversity and a 5.6-11.3 pp increase in novelty according to distinct and novel n-grams. The generated poetry from
+ TRAINING-BASED agents also exhibits group divergence in terms of lexicons, styles and semantics. PROMPTING-BASED agents in our
+ framework also benefit from non-cooperative environments, and a more diverse ensemble of models with non-homogeneous agents has
+ the potential to further enhance diversity, with an increase of 7.0-17.5 pp according to our experiments. However, PROMPTING-BASED
+ agents show a decrease in lexical diversity over time and do not exhibit the group-based divergence intended in the social
+ network. Our paper argues for a paradigm shift in creative tasks such as automatic poetry generation to include social learning
+ processes (via LLM-based agent modeling) similar to human interaction.
+
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease + Classification: A Systematic Review + + +
+ Parkinson's disease (PD), the second most prevalent neurodegenerative +disorder worldwide, frequently presents with early-stage speech impairments. +Recent advancements in Artificial Intelligence (AI), particularly deep learning +(DL), have significantly enhanced PD diagnosis through the analysis of speech +data. Nevertheless, the progress of research is restricted by the limited +availability of publicly accessible speech-based PD datasets, primarily due to +privacy concerns. The goal of this systematic review is to explore the current +landscape of speech-based DL approaches for PD classification, based on 33 +scientific works published between January 2020 and March 2024. We discuss +their available resources, capabilities, and potential limitations, and issues +related to bias, explainability, and privacy. Furthermore, this review provides +an overview of publicly accessible speech-based datasets and open-source +material for PD. The DL approaches identified are categorized into end-to-end +(E2E) learning, transfer learning (TL), and deep acoustic feature extraction +(DAFE). Among E2E approaches, Convolutional Neural Networks (CNNs) are +prevalent, though Transformers are increasingly popular. E2E approaches face +challenges such as limited data and computational resources, especially with +Transformers. TL addresses these issues by providing more robust PD diagnosis +and better generalizability across languages. DAFE aims to improve the +explainability and interpretability of results by examining the specific +effects of deep features on both other DL approaches and more traditional +machine learning (ML) methods. However, it often underperforms compared to E2E +and TL approaches. + +
+
+ comment: van Gelderen, L., & Tejedor-Garc\'ia, C. (2024). Innovative + Speech-Based Deep Learning Approaches for Parkinson's Disease Classification: + A Systematic Review. Applied Sciences, 14(17). doi:10.3390/app14177873 This + research was funded by the NWO research programme NGF AiNed Fellowship Grants + under the project Responsible AI for Voice Diagnostics (RAIVD) - grant number + NGF.1607.22.013 +
+
+
+
+
+ + ♻ ☆ LooGLE: Can Long-Context Language Models Understand Long Contexts? + + +
+ Large language models (LLMs), despite their impressive performance in various +language tasks, are typically limited to processing texts within context-window +size. This limitation has spurred significant research efforts to enhance LLMs' +long-context understanding with high-quality long-sequence benchmarks. However, +prior datasets in this regard suffer from shortcomings, such as short context +length compared to the context window of modern LLMs; outdated documents that +have data leakage problems; and an emphasis on short dependency tasks rather +than long dependency tasks. In this paper, we present LooGLE, a Long Context +Generic Language Evaluation benchmark for LLMs' long context understanding. +LooGLE features relatively new documents post-2022, with over 24,000 tokens per +document and 6,000 newly generated questions spanning diverse domains. Human +annotators meticulously crafted more than 1,100 high-quality question-answer +pairs to meet the long dependency requirements. These pairs underwent thorough +cross-validation, yielding the most precise assessment of LLMs' long dependency +capabilities. The evaluation of eight state-of-the-art LLMs on LooGLE revealed +key findings: (i) commercial models outperformed open-sourced models; (ii) LLMs +excelled in short dependency tasks like short question-answering and cloze +tasks but struggled with more intricate long dependency tasks; (iii) in-context +learning and chaining thoughts offered only marginal improvements; (iv) +retrieval-based techniques demonstrated substantial benefits for short +question-answering, while strategies for extending context window length had +limited impact on long context understanding. As such, LooGLE not only provides +a systematic and comprehensive evaluation schema on long-context LLMs, but also +sheds light on future development of enhanced models towards "true long-context +understanding". + +
+
+
+
+
+ + ♻ ☆ CMM-Math: A Chinese Multimodal Math Dataset To Evaluate and Enhance the + Mathematics Reasoning of Large Multimodal Models + + +
+ Large language models (LLMs) have obtained promising results in mathematical reasoning, which is a foundational skill for human
+ intelligence. Most previous studies focus on improving and measuring the performance of LLMs based on textual math reasoning
+ datasets (e.g., MATH, GSM8K). Recently, a few researchers have released English multimodal math datasets (e.g., MATHVISTA and
+ MATH-V) to evaluate the effectiveness of large multimodal models (LMMs). In this paper, we release a Chinese multimodal math
+ (CMM-Math) dataset, including benchmark and training parts, to evaluate and enhance the mathematical reasoning of LMMs. CMM-Math
+ contains over 28,000 high-quality samples, featuring a variety of problem types (e.g., multiple-choice, fill-in-the-blank, and so
+ on) with detailed solutions across 12 grade levels from elementary to high school in China. Specifically, the visual context may
+ be present in the questions or answer options, which makes this dataset more challenging. Through comprehensive analysis, we
+ discover that state-of-the-art LMMs face challenges on the CMM-Math dataset, emphasizing the necessity for further improvements in
+ LMM development. We also propose a Multimodal Mathematical LMM (Math-LMM) to handle the problems with mixed input of multiple
+ images and text segments. We train our model using three stages, including foundational pre-training, foundational fine-tuning,
+ and mathematical fine-tuning. The extensive experiments indicate that our model effectively improves math reasoning performance,
+ as shown by comparison with SOTA LMMs on three multimodal mathematical datasets.
+
+
+
+
+
+ + ♻ ☆ Language Models Benefit from Preparation with Elicited Knowledge + + +
+ The zero-shot chain of thought (CoT) approach is often used in question +answering (QA) by language models (LMs) for tasks that require multiple +reasoning steps, typically enhanced by the prompt "Let's think step by step." +However, some QA tasks hinge more on accessing relevant knowledge than on +chaining reasoning steps. We introduce a simple general prompting technique, +called PREP, that involves using two instances of LMs: the first (LM1) +generates relevant information, and the second (LM2) answers the question based +on this information. PREP is designed to be general and independent of the +user's domain knowledge, making it applicable across various QA tasks without +the need for specialized prompt engineering. To evaluate the effectiveness of +our prompting method, we create a dataset of 100 binary-choice questions, +derived from an extensive schematic dataset on artifact parts and material +composition. These questions ask which of two artifacts is less likely to share +materials with another artifact. Such questions probe the LM's knowledge of +shared materials in the part structure of different artifacts. We test our +method on our dataset and three published commonsense reasoning datasets. The +average accuracy of our method is consistently higher than that of all the +other tested methods across all the tested datasets. + +
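+ The two-stage scheme can be sketched as two calls to any chat model: the first elicits relevant background knowledge, the second
+ answers with that knowledge in context. The chat callable and the prompt wording below are placeholders, not the paper's exact
+ prompts.
+
+ def prep_answer(question: str, chat) -> str:
+     """chat is any function mapping a prompt string to a reply string (e.g., an API wrapper)."""
+     # Stage 1 (LM1): elicit knowledge relevant to the question, without answering it.
+     knowledge = chat(
+         "List background facts that would help answer the question below. "
+         f"Do not answer it yet.\n\nQuestion: {question}"
+     )
+     # Stage 2 (LM2): answer the question with the elicited knowledge as preparation.
+     return chat(
+         f"Background knowledge:\n{knowledge}\n\n"
+         f"Using the knowledge above where relevant, answer the question: {question}"
+     )
+
+ # dummy model for a dry run; replace the lambda with a real LLM call
+ print(prep_answer("Which is less likely to share materials with a glass bottle: "
+                   "a window or a wooden chair?", chat=lambda p: f"[model reply to: {p[:40]}...]"))
+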
+
+
+
+
+ + ♻ ☆ Using GPT-4 to Augment Unbalanced Data for Automatic Scoring + + +
+ Machine learning-based automatic scoring faces challenges with unbalanced student responses across scoring categories. To address
+ this, we introduce a novel text data augmentation framework leveraging GPT-4, a generative large language model, specifically
+ tailored for unbalanced datasets in automatic scoring. Our experimental dataset comprised student written responses to four
+ science items. We crafted prompts for GPT-4 to generate responses, especially for minority scoring classes, to augment the
+ dataset. We then fine-tuned DistilBERT for automatic scoring based on the augmented and original datasets. Model performance was
+ assessed using accuracy, precision, recall, and F1 metrics. Our findings revealed that incorporating GPT-4-augmented data markedly
+ improved model performance, particularly for precision and F1 scores. Interestingly, the extent of improvement varied depending on
+ the specific dataset and the proportion of augmented data used. Notably, we found that a varying amount of augmented data
+ (20%-40%) was needed to obtain stable improvement for automatic scoring. Comparisons with models trained on additional
+ student-written responses suggest that GPT-4-augmented models match those trained with student data. This research underscores the
+ potential and effectiveness of data augmentation techniques utilizing generative large language models like GPT-4 in addressing
+ unbalanced datasets within automated assessment.
+
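+ A sketch of the augmentation step -- prompting a generative LLM for extra responses in an under-represented score class -- is
+ given below; the prompt wording and helper name are illustrative, not the study's actual prompts.
+
+ def augmentation_prompt(item_stem: str, score_label: int, examples: list) -> str:
+     """Build a prompt asking the LLM for additional minority-class student responses."""
+     shots = "\n".join(f"- {e}" for e in examples)
+     return (
+         f"Science item: {item_stem}\n"
+         f"Student responses that received score {score_label}:\n{shots}\n"
+         f"Write 5 new, distinct student responses that would also receive score {score_label}."
+     )
+
+ print(augmentation_prompt("Explain why ice floats on water.", 0,
+                           ["ice is lighter", "because it is cold"]))
+ # The generations are labelled with the minority score, mixed into the training data
+ # (the study reports roughly 20%-40% augmented data giving stable gains), and the
+ # combined set is used to fine-tune the scoring model.
+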
+
+
+
+
+ + ♻ ☆ MEDSAGE: Enhancing Robustness of Medical Dialogue Summarization to ASR + Errors with LLM-generated Synthetic Dialogues + + +
+ Automatic Speech Recognition (ASR) systems are pivotal in transcribing speech +into text, yet the errors they introduce can significantly degrade the +performance of downstream tasks like summarization. This issue is particularly +pronounced in clinical dialogue summarization, a low-resource domain where +supervised data for fine-tuning is scarce, necessitating the use of ASR models +as black-box solutions. Employing conventional data augmentation for enhancing +the noise robustness of summarization models is not feasible either due to the +unavailability of sufficient medical dialogue audio recordings and +corresponding ASR transcripts. To address this challenge, we propose MEDSAGE, +an approach for generating synthetic samples for data augmentation using Large +Language Models (LLMs). Specifically, we leverage the in-context learning +capabilities of LLMs and instruct them to generate ASR-like errors based on a +few available medical dialogue examples with audio recordings. Experimental +results show that LLMs can effectively model ASR noise, and incorporating this +noisy data into the training process significantly improves the robustness and +accuracy of medical dialogue summarization systems. This approach addresses the +challenges of noisy ASR outputs in critical applications, offering a robust +solution to enhance the reliability of clinical dialogue summarization. + +
+
+
+
+
+ + ♻ ☆ Reasoning Beyond Bias: A Study on Counterfactual Prompting and Chain of + Thought Reasoning + + +
+ Language models are known to absorb biases from their training data, leading +to predictions driven by statistical regularities rather than semantic +relevance. We investigate the impact of these biases on answer choice +preferences in the Massive Multi-Task Language Understanding (MMLU) task. Our +findings reveal that differences in learned regularities across answer options +are predictive of model preferences and mirror human test-taking strategies. To +address this issue, we introduce two novel methods: Counterfactual Prompting +with Chain of Thought (CoT) and Counterfactual Prompting with Agnostically +Primed CoT (APriCoT). We demonstrate that while Counterfactual Prompting with +CoT alone is insufficient to mitigate bias, our novel Primed Counterfactual +Prompting with CoT approach effectively reduces the influence of base-rate +probabilities while improving overall accuracy. Our results suggest that +mitigating bias requires a "System-2" like process and that CoT reasoning is +susceptible to confirmation bias under some prompting methodologies. Our +contributions offer practical solutions for developing more robust and fair +language models. + +
+
+
+
+
+ + ♻ ☆ SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated + Responses + + +
+ Can LLMs consistently improve their previous outputs for better results? For +this to be true, LLMs would need to be better at discriminating among +previously-generated alternatives, than generating initial responses. We +explore the validity of this hypothesis in practice. We first formulate a +unified framework that allows us to compare the generative and discriminative +capability of any model on any task. In our resulting experimental analysis of +several open-source and industrial LLMs, we observe that models are not +reliably better at discriminating among previously-generated alternatives than +generating initial responses. This finding challenges the notion that LLMs may +be able to enhance their performance only through their own judgment. + +
+
+
+
+
+ + ♻ Foundational Challenges in Assuring Alignment and Safety of Large + Language Models + + +
+ This work identifies 18 foundational challenges in assuring the alignment and +safety of large language models (LLMs). These challenges are organized into +three different categories: scientific understanding of LLMs, development and +deployment methods, and sociotechnical challenges. Based on the identified +challenges, we pose $200+$ concrete research questions. + +
+
+
+
+
+ + ♻ ☆ PointLLM: Empowering Large Language Models to Understand Point Clouds ECCV 2024 + + +
+ The unprecedented advancements in Large Language Models (LLMs) have shown a +profound impact on natural language processing but are yet to fully embrace the +realm of 3D understanding. This paper introduces PointLLM, a preliminary effort +to fill this gap, enabling LLMs to understand point clouds and offering a new +avenue beyond 2D visual data. PointLLM understands colored object point clouds +with human instructions and generates contextually appropriate responses, +illustrating its grasp of point clouds and common sense. Specifically, it +leverages a point cloud encoder with a powerful LLM to effectively fuse +geometric, appearance, and linguistic information. We collect a novel dataset +comprising 660K simple and 70K complex point-text instruction pairs to enable a +two-stage training strategy: aligning latent spaces and subsequently +instruction-tuning the unified model. To rigorously evaluate the perceptual and +generalization capabilities of PointLLM, we establish two benchmarks: +Generative 3D Object Classification and 3D Object Captioning, assessed through +three different methods, including human evaluation, GPT-4/ChatGPT evaluation, +and traditional metrics. Experimental results reveal PointLLM's superior +performance over existing 2D and 3D baselines, with a notable achievement in +human-evaluated object captioning tasks where it surpasses human annotators in +over 50% of the samples. Codes, datasets, and benchmarks are available at +https://github.com/OpenRobotLab/PointLLM . + +
+
+ comment: ECCV 2024 Oral Camera Ready. This version includes clearer writing + and additional experimental results compared to previous versions. Project + page: https://runsenxu.com/projects/PointLLM +
+
+
+
+
+ + ♻ ☆ Beyond Words: On Large Language Models Actionability in Mission-Critical + Risk Analysis + + +
+ Context. Risk analysis assesses potential risks in specific scenarios. Risk analysis principles are context-independent: the same
+ methodology can be applied to a risk related to health or to information technology security. Risk analysis requires a vast
+ knowledge of national and international regulations and standards and is time- and effort-intensive. A large language model can
+ summarize information in far less time than a human and can be fine-tuned for specific tasks.
+ Aim. Our empirical study aims to investigate the effectiveness of Retrieval-Augmented Generation (RAG) and fine-tuned LLMs in risk
+ analysis. To our knowledge, no prior study has explored their capabilities in risk analysis.
+ Method. We manually curated 193 unique scenarios leading to 1283 representative samples from over 50 mission-critical analyses
+ archived by the industrial context team in the last five years. We compared the base GPT-3.5 and GPT-4 models versus their
+ Retrieval-Augmented Generation and fine-tuned counterparts. We employed two human experts as competitors to the models and three
+ other human experts to review the analyses of both the models and the first two experts. The reviewers analyzed 5,000 scenario
+ analyses.
+ Results and Conclusions. Human experts demonstrated higher accuracy, but LLMs are quicker and more actionable. Moreover, our
+ findings show that RAG-assisted LLMs have the lowest hallucination rates, effectively uncovering hidden risks and complementing
+ human expertise. Thus, the choice of model depends on specific needs, with fine-tuned models (FTMs) for accuracy, RAG for
+ hidden-risk discovery, and base models for comprehensiveness and actionability. Therefore, experts can leverage LLMs as an
+ effective complementary companion in risk analysis within a condensed timeframe. They can also save costs by averting unnecessary
+ expenses associated with implementing unwarranted countermeasures.
+
+
+
+
+
+ + ♻ ☆ FiNER-ORD: Financial Named Entity Recognition Open Research Dataset + + +
+ Over the last two decades, the development of the CoNLL-2003 named entity +recognition (NER) dataset has helped enhance the capabilities of deep learning +and natural language processing (NLP). The finance domain, characterized by its +unique semantic and lexical variations for the same entities, presents specific +challenges to the NER task; thus, a domain-specific customized dataset is +crucial for advancing research in this field. In our work, we develop the first +high-quality English Financial NER Open Research Dataset (FiNER-ORD). We +benchmark multiple pre-trained language models (PLMs) and large-language models +(LLMs) on FiNER-ORD. We believe our proposed FiNER-ORD dataset will open future +opportunities to use FiNER-ORD as a benchmark for financial domain-specific NER +and NLP tasks. Our dataset, models, and code are publicly available on GitHub +and Hugging Face under CC BY-NC 4.0 license. + +
+
+
+
+
+ + ♻ ☆ GraphEx: A Graph-based Extraction Method for Advertiser Keyphrase + Recommendation + + +
+ Online sellers and advertisers are recommended keyphrases for their listed +products, which they bid on to enhance their sales. One popular paradigm that +generates such recommendations is Extreme Multi-Label Classification (XMC), +which involves tagging/mapping keyphrases to items. We outline the limitations +of using traditional item-query based tagging or mapping techniques for +keyphrase recommendations on E-Commerce platforms. We introduce GraphEx, an +innovative graph-based approach that recommends keyphrases to sellers using +extraction of token permutations from item titles. Additionally, we demonstrate +that relying on traditional metrics such as precision/recall can be misleading +in practical applications, thereby necessitating a combination of metrics to +evaluate performance in real-world scenarios. These metrics are designed to +assess the relevance of keyphrases to items and the potential for buyer +outreach. GraphEx outperforms production models at eBay, achieving the +objectives mentioned above. It supports near real-time inferencing in +resource-constrained production environments and scales effectively for +billions of items. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 100 + +
+
+
+ + ☆ Synergy and Synchrony in Couple Dances + + +
+ This paper asks to what extent social interaction influences one's behavior. +We study this in the setting of two dancers dancing as a couple. We first +consider a baseline in which we predict a dancer's future moves conditioned +only on their past motion without regard to their partner. We then investigate +the advantage of taking social information into account by conditioning also on +the motion of their dancing partner. We focus our analysis on Swing, a dance +genre with tight physical coupling for which we present an in-the-wild video +dataset. We demonstrate that single-person future motion prediction in this +context is challenging. Instead, we observe that prediction greatly benefits +from considering the interaction partners' behavior, resulting in surprisingly +compelling couple dance synthesis results (see supp. video). Our contributions +are a demonstration of the advantages of socially conditioned future motion +prediction and an in-the-wild, couple dance video dataset to enable future +research in this direction. Video results are available on the project website: +https://von31.github.io/synNsync + +
+
+
+
+
+ + ☆ VILA-U: a Unified Foundation Model Integrating Visual Understanding and + Generation + + +
+ VILA-U is a Unified foundation model that integrates Video, Image, Language understanding and generation. Traditional visual
+ language models (VLMs) use separate modules for understanding and generating visual content, which can lead to misalignment and
+ increased complexity. In contrast, VILA-U employs a single autoregressive next-token prediction framework for both tasks,
+ eliminating the need for additional components like diffusion models. This approach not only simplifies the model but also
+ achieves near state-of-the-art performance in visual language understanding and generation. The success of VILA-U is attributed to
+ two main factors: the unified vision tower, which aligns discrete visual tokens with textual inputs during pretraining and thereby
+ enhances visual perception; and the observation that, given a high-quality dataset, autoregressive image generation can achieve
+ quality similar to that of diffusion models. This allows VILA-U to perform comparably to more complex models using a fully
+ token-based autoregressive framework.
+
+
+ comment: 11 pages, 7 figures, 8 tables +
+
+
+
+
+ + ☆ Exploring Foundation Models for Synthetic Medical Imaging: A Study on + Chest X-Rays and Fine-Tuning Techniques + + +
+ Machine learning has significantly advanced healthcare by aiding in disease +prevention and treatment identification. However, accessing patient data can be +challenging due to privacy concerns and strict regulations. Generating +synthetic, realistic data offers a potential solution for overcoming these +limitations, and recent studies suggest that fine-tuning foundation models can +produce such data effectively. In this study, we explore the potential of +foundation models for generating realistic medical images, particularly chest +x-rays, and assess how their performance improves with fine-tuning. We propose +using a Latent Diffusion Model, starting with a pre-trained foundation model +and refining it through various configurations. Additionally, we performed +experiments with input from a medical professional to assess the realism of the +images produced by each trained model. + +
+
+
+
+
+ + ☆ Open-MAGVIT2: An Open-Source Project Toward Democratizing + Auto-regressive Visual Generation + + +
+ We present Open-MAGVIT2, a family of auto-regressive image generation models ranging from 300M to 1.5B. The Open-MAGVIT2 project
+ produces an open-source replication of Google's MAGVIT-v2 tokenizer, a tokenizer with a super-large codebook (i.e., $2^{18}$
+ codes), and achieves state-of-the-art reconstruction performance (1.17 rFID) on ImageNet $256 \times 256$. Furthermore, we explore
+ its application in plain auto-regressive models and validate scalability properties. To assist auto-regressive models in
+ predicting with a super-large vocabulary, we factorize it into two sub-vocabularies of different sizes via asymmetric token
+ factorization, and further introduce "next sub-token prediction" to enhance sub-token interaction for better generation quality.
+ We release all models and codes to foster innovation and creativity in the field of auto-regressive visual generation.
+
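+ The asymmetric token factorization can be illustrated by splitting one index from a 2^18-entry codebook into a coarse and a fine
+ sub-token; the 6-bit/12-bit split below is an assumed example of asymmetric sizes, not necessarily the split used by Open-MAGVIT2.
+
+ K1_BITS, K2_BITS = 6, 12                     # 2**6 * 2**12 = 2**18 codebook entries
+ K2 = 1 << K2_BITS
+
+ def factorize(code: int):
+     return code >> K2_BITS, code & (K2 - 1)  # (coarse sub-token, fine sub-token)
+
+ def defactorize(coarse: int, fine: int) -> int:
+     return (coarse << K2_BITS) | fine
+
+ code = 173_421
+ assert defactorize(*factorize(code)) == code
+ print(factorize(code))   # the AR model predicts the coarse token, then the fine sub-token
+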
+
+
+
+
+ + ☆ Train Till You Drop: Towards Stable and Robust Source-free Unsupervised + 3D Domain Adaptation ECCV 2024 + + +
+ We tackle the challenging problem of source-free unsupervised domain adaptation (SFUDA) for 3D semantic segmentation. It amounts
+ to performing domain adaptation on an unlabeled target domain without any access to source data; the only available information
+ is a model trained to achieve good performance on the source domain. A common issue with existing SFUDA approaches is that
+ performance degrades after some training time, which is a by-product of an under-constrained and ill-posed problem. We discuss two
+ strategies to alleviate this issue. First, we propose a sensible way to regularize the learning problem. Second, we introduce a
+ novel criterion based on agreement with a reference model. It is used (1) to stop the training when appropriate and (2) as a
+ validator to select hyperparameters without any knowledge of the target domain. Our contributions are easy to implement and
+ readily applicable to all SFUDA methods, ensuring stable improvements over all baselines. We validate our findings on various 3D
+ lidar settings, achieving state-of-the-art performance. The project repository (with code) is: github.com/valeoai/TTYD.
+
+
+ comment: Accepted to ECCV 2024. Project repository: github.com/valeoai/TTYD +
+
+
+
+
+ + ☆ HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale + Space Using Wearable IMUs and LiDAR + + +
+ We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture method, aimed at accurately and efficiently creating
+ a dynamic digital world containing large-scale indoor-outdoor scenes, diverse human motions, rich human-human interactions, and
+ human-environment interactions. By utilizing body-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human
+ motions in unconstrained space without the need for external devices and pre-built maps. This affords great flexibility and
+ accessibility for human-centered interaction and 4D scene capturing in various environments. Taking into account that IMUs can
+ capture spatially unrestricted human poses but are prone to drift during long-term use, while LiDAR is stable for global
+ localization but coarse for local positions and orientations, HiSC4D employs a joint optimization method, harmonizing all sensors
+ and utilizing environment cues, yielding promising results for long-term capture in large scenes. To promote research on
+ egocentric human interaction in large scenes and facilitate downstream tasks, we also present a dataset containing 8 sequences in
+ 4 large scenes (200 to 5,000 $m^2$), providing 36k frames of accurate 4D human motions with SMPL annotations and dynamic scenes,
+ 31k frames of cropped human point clouds, and scene mesh of the environment. A variety of scenarios, such as a basketball gym and
+ a commercial street, alongside challenging human motions, such as daily greeting, one-on-one basketball playing, and tour guiding,
+ demonstrate the effectiveness and the generalization ability of HiSC4D. The dataset and code will be published at
+ www.lidarhumanmotion.net/hisc4d for research purposes.
+
+
+
 comment: 17 pages, 10 figures, Journal
+
+
+
+
+
+ + ☆ Future Does Matter: Boosting 3D Object Detection with Temporal Motion + Estimation in Point Cloud Sequences + + +
+
 Accurate and robust LiDAR 3D object detection is essential for comprehensive
+scene understanding in autonomous driving. Despite its importance, LiDAR
+detection performance is limited by inherent constraints of point cloud data,
+particularly under conditions of extended distances and occlusions. Recently,
+temporal aggregation has been proven to significantly enhance detection
+accuracy by fusing multi-frame viewpoint information and enriching the spatial
+representation of objects. In this work, we introduce a novel LiDAR 3D object
+detection framework, namely LiSTM, to facilitate spatial-temporal feature
+learning with cross-frame motion forecasting information. We aim to improve the
+spatial-temporal interpretation capabilities of the LiDAR detector by
+incorporating a dynamic prior, generated from a non-learnable motion estimation
+model. Specifically, Motion-Guided Feature Aggregation (MGFA) is proposed to
+utilize the object trajectory from previous and future motion states to model
+spatial-temporal correlations into a Gaussian heatmap over a driving sequence.
+This motion-based heatmap then guides the temporal feature fusion, enriching
+the proposed object features. Moreover, we design a Dual Correlation Weighting
+Module (DCWM) that effectively facilitates the interaction between past and
+prospective frames through scene- and channel-wise feature abstraction.
+Finally, a cascaded cross-attention-based decoder is employed to refine the 3D
+prediction. We have conducted experiments on the Waymo and nuScenes datasets to
+demonstrate that the proposed framework achieves superior 3D detection
+performance with effective spatial-temporal feature learning.
+
+
+
+
+
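+
+One way to picture the motion-guided Gaussian heatmap used by MGFA in the
+LiSTM abstract above: past and forecast object centres are rendered onto a
+bird's-eye-view grid. Grid size, extent and sigma below are illustrative
+assumptions, not values from the paper.
+
+```python
+import numpy as np
+
+def motion_guided_heatmap(centers_xy, grid_size=(200, 200),
+                          extent=((-50.0, 50.0), (-50.0, 50.0)), sigma=2.0):
+    """Render object centres (metres, BEV) as a Gaussian heatmap."""
+    H, W = grid_size
+    (x0, x1), (y0, y1) = extent
+    ys, xs = np.mgrid[0:H, 0:W]
+    heat = np.zeros((H, W), dtype=np.float32)
+    for cx, cy in centers_xy:
+        u = (cx - x0) / (x1 - x0) * (W - 1)   # metres -> pixel coordinates
+        v = (cy - y0) / (y1 - y0) * (H - 1)
+        g = np.exp(-((xs - u) ** 2 + (ys - v) ** 2) / (2 * sigma ** 2))
+        heat = np.maximum(heat, g)            # keep the strongest response
+    return heat
+
+heat = motion_guided_heatmap([(5.0, 10.0), (-12.0, 3.5)])
+print(heat.shape, heat.max())
+```
+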
+ + ☆ Question-Answering Dense Video Events + + +
+ Multimodal Large Language Models (MLLMs) have shown excellent performance in +question-answering of single-event videos. In this paper, we present +question-answering dense video events, a novel task that requires answering and +grounding the dense-event questions in long videos, thus challenging MLLMs to +faithfully comprehend and reason about multiple events occurring over extended +time periods. To facilitate the study, we construct DeVE-QA - a dataset +featuring 78K questions about 26K events on 10.6K long videos. We then +benchmark and show that existing MLLMs excelling at single-event QA struggle to +perform well in DeVE-QA. For improvement, we propose DeVi, a novel +training-free MLLM approach that highlights a hierarchical captioning module, a +temporal event memory module, and a self-consistency checking module to +respectively detect, contextualize and memorize, and ground dense-events in +long videos for question answering. Extensive experiments show that DeVi is +superior at answering dense-event questions and grounding relevant video +moments. Compared with existing MLLMs, it achieves a remarkable increase of 4.1 +percent and 3.7 percent for G(round)QA accuracy on DeVE-QA and NExT-GQA +respectively. + +
+
+
+
+
+ + ☆ Empirical Bayesian image restoration by Langevin sampling with a + denoising diffusion implicit prior + + +
+ Score-based diffusion methods provide a powerful strategy to solve image +restoration tasks by flexibly combining a pre-trained foundational prior model +with a likelihood function specified during test time. Such methods are +predominantly derived from two stochastic processes: reversing +Ornstein-Uhlenbeck, which underpins the celebrated denoising diffusion +probabilistic models (DDPM) and denoising diffusion implicit models (DDIM), and +the Langevin diffusion process. The solutions delivered by DDPM and DDIM are +often remarkably realistic, but they are not always consistent with +measurements because of likelihood intractability issues and the associated +required approximations. Alternatively, using a Langevin process circumvents +the intractable likelihood issue, but usually leads to restoration results of +inferior quality and longer computing times. This paper presents a novel and +highly computationally efficient image restoration method that carefully embeds +a foundational DDPM denoiser within an empirical Bayesian Langevin algorithm, +which jointly calibrates key model hyper-parameters as it estimates the model's +posterior mean. Extensive experimental results on three canonical tasks (image +deblurring, super-resolution, and inpainting) demonstrate that the proposed +approach improves on state-of-the-art strategies both in image estimation +accuracy and computing time. + +
+
+ comment: 24 pages +
+
+
+
+
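+
+The core construction in the abstract above, a pretrained denoiser embedded
+inside a Langevin sampler, can be sketched as a plug-and-play update in which
+the prior score is obtained from the denoiser via Tweedie's identity. The
+empirical Bayesian calibration of hyper-parameters is omitted, and the
+forward_op / denoiser signatures are assumptions for illustration.
+
+```python
+import torch
+
+def langevin_step(x, y, forward_op, denoiser, sigma_noise, sigma_prior, step):
+    """One unadjusted Langevin update for y = forward_op(x) + Gaussian noise.
+    The prior score is approximated as (denoiser(x) - x) / sigma_prior**2."""
+    x = x.detach().requires_grad_(True)
+    residual = y - forward_op(x)
+    log_lik = -0.5 * (residual ** 2).sum() / sigma_noise ** 2
+    grad_lik = torch.autograd.grad(log_lik, x)[0]
+    with torch.no_grad():
+        score_prior = (denoiser(x) - x) / sigma_prior ** 2
+        noise = torch.randn_like(x)
+        x_new = x + step * (grad_lik + score_prior) + (2 * step) ** 0.5 * noise
+    return x_new
+```
+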
+ + ☆ Enhancing Skin Lesion Diagnosis with Ensemble Learning + + +
+ Skin lesions are an increasingly significant medical concern, varying widely +in severity from benign to cancerous. Accurate diagnosis is essential for +ensuring timely and appropriate treatment. This study examines the +implementation of deep learning methods to assist in the diagnosis of skin +lesions using the HAM10000 dataset, which contains seven distinct types of +lesions. First, we evaluated three pre-trained models: MobileNetV2, ResNet18, +and VGG11, achieving accuracies of 0.798, 0.802, and 0.805, respectively. To +further enhance classification accuracy, we developed ensemble models employing +max voting, average voting, and stacking, resulting in accuracies of 0.803, +0.82, and 0.83. Building on the best-performing ensemble learning model, +stacking, we developed our proposed model, SkinNet, which incorporates a +customized architecture and fine-tuning, achieving an accuracy of 0.867 and an +AUC of 0.96. This substantial improvement over individual models demonstrates +the effectiveness of ensemble learning in improving skin lesion classification. + +
+
+
+
+
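+
+The max- and average-voting ensembles mentioned above reduce to simple
+operations on the per-model class probabilities (stacking additionally trains
+a meta-classifier on them); a small self-contained sketch with synthetic
+predictions over the seven HAM10000 classes:
+
+```python
+import numpy as np
+
+def average_voting(prob_list):
+    """Average the per-model class probabilities, then take the argmax."""
+    return np.mean(np.stack(prob_list), axis=0).argmax(axis=1)
+
+def max_voting(prob_list):
+    """Majority vote over each model's hard predictions."""
+    preds = np.stack([p.argmax(axis=1) for p in prob_list])   # (models, N)
+    n_classes = prob_list[0].shape[1]
+    counts = np.apply_along_axis(
+        lambda col: np.bincount(col, minlength=n_classes), 0, preds)
+    return counts.argmax(axis=0)
+
+rng = np.random.default_rng(0)
+probs = [rng.dirichlet(np.ones(7), size=5) for _ in range(3)]  # 3 models, 5 samples
+print(average_voting(probs), max_voting(probs))
+```
+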
+ + ☆ The Impact of Scanner Domain Shift on Deep Learning Performance in + Medical Imaging: an Experimental Study + + +
+ Purpose: Medical images acquired using different scanners and protocols can +differ substantially in their appearance. This phenomenon, scanner domain +shift, can result in a drop in the performance of deep neural networks which +are trained on data acquired by one scanner and tested on another. This +significant practical issue is well-acknowledged, however, no systematic study +of the issue is available across different modalities and diagnostic tasks. +Materials and Methods: In this paper, we present a broad experimental study +evaluating the impact of scanner domain shift on convolutional neural network +performance for different automated diagnostic tasks. We evaluate this +phenomenon in common radiological modalities, including X-ray, CT, and MRI. +Results: We find that network performance on data from a different scanner is +almost always worse than on same-scanner data, and we quantify the degree of +performance drop across different datasets. Notably, we find that this drop is +most severe for MRI, moderate for X-ray, and quite small for CT, on average, +which we attribute to the standardized nature of CT acquisition systems which +is not present in MRI or X-ray. We also study how injecting varying amounts of +target domain data into the training set, as well as adding noise to the +training data, helps with generalization. Conclusion: Our results provide +extensive experimental evidence and quantification of the extent of performance +drop caused by scanner domain shift in deep learning across different +modalities, with the goal of guiding the future development of robust deep +learning models for medical image analysis. + +
+
+
+
+
+ + ☆ RCNet: Deep Recurrent Collaborative Network for Multi-View Low-Light + Image Enhancement + + +
+
 Scene observation from multiple perspectives would bring a more comprehensive
+visual experience. However, in the context of acquiring multiple views in the
+dark, the highly correlated views become severely degraded and inconsistent,
+making it challenging to improve scene understanding with auxiliary views.
+Recent single image-based enhancement methods may not be able to provide
+consistently desirable restoration performance for all views because they
+ignore the potential feature correspondences among different views. To
+alleviate this issue, we make the first attempt to investigate multi-view
+low-light image enhancement. First, we construct a new dataset called
+Multi-View Low-light Triplets (MVLT), including 1,860 pairs of triple images
+with large illumination ranges and wide noise distribution. Each triplet is
+equipped with three different viewpoints towards the same scene. Second, we
+propose a deep multi-view enhancement framework based on the Recurrent
+Collaborative Network (RCNet). Specifically, in order to benefit from similar
+texture correspondence across different views, we design the recurrent feature
+enhancement, alignment and fusion (ReEAF) module, in which intra-view feature
+enhancement (Intra-view EN) followed by inter-view feature alignment and fusion
+(Inter-view AF) is performed to model the intra-view and inter-view feature
+propagation sequentially via multi-view collaboration. In addition, two
+different modules from enhancement to alignment (E2A) and from alignment to
+enhancement (A2E) are developed to enable the interactions between Intra-view
+EN and Inter-view AF, which explicitly utilize attentive feature weighting and
+sampling for enhancement and alignment, respectively. Experimental results
+demonstrate that our RCNet significantly outperforms other state-of-the-art
+methods. All of our dataset, code, and model will be available at
+https://github.com/hluo29/RCNet.
+
+
+ comment: 14 Pages, 10 Figures, Under Review +
+
+
+
+
+ + ☆ Connectivity-Inspired Network for Context-Aware Recognition ECCV 2024 + + +
+ The aim of this paper is threefold. We inform the AI practitioner about the +human visual system with an extensive literature review; we propose a novel +biologically motivated neural network for image classification; and, finally, +we present a new plug-and-play module to model context awareness. We focus on +the effect of incorporating circuit motifs found in biological brains to +address visual recognition. Our convolutional architecture is inspired by the +connectivity of human cortical and subcortical streams, and we implement +bottom-up and top-down modulations that mimic the extensive afferent and +efferent connections between visual and cognitive areas. Our Contextual +Attention Block is simple and effective and can be integrated with any +feed-forward neural network. It infers weights that multiply the feature maps +according to their causal influence on the scene, modeling the co-occurrence of +different objects in the image. We place our module at different bottlenecks to +infuse a hierarchical context awareness into the model. We validated our +proposals through image classification experiments on benchmark data and found +a consistent improvement in performance and the robustness of the produced +explanations via class activation. Our code is available at +https://github.com/gianlucarloni/CoCoReco. + +
+
+ comment: ECCV 2024 - HCV Workshop, Accepted for presentation, Submitted + Manuscript Version (adapted to include author names, Acknowledgements, and + reference DOIs): the version of the manuscript improved after peer review + will appear in the Proceedings later +
+
+
+
+
+ + ☆ Serp-Mamba: Advancing High-Resolution Retinal Vessel Segmentation with + Selective State-Space Model + + +
+
 Ultra-Wide-Field Scanning Laser Ophthalmoscopy (UWF-SLO) images capture
+high-resolution views of the retina, typically spanning 200 degrees. Accurate
+segmentation of vessels in UWF-SLO images is essential for detecting and
+diagnosing fundus disease. Recent studies have revealed that the selective
+State Space Model (SSM) in Mamba performs well in modeling long-range
+dependencies, which is crucial for capturing the continuity of elongated vessel
+structures. Inspired by this, we propose the first Serpentine Mamba
+(Serp-Mamba) network to address this challenging task. Specifically, we
+recognize the intricate, varied, and delicate nature of the tubular structure
+of vessels. Furthermore, the high resolution of UWF-SLO images exacerbates the
+imbalance between the vessel and background categories. Based on the above
+observations, we first devise a Serpentine Interwoven Adaptive (SIA) scan
+mechanism, which scans UWF-SLO images along curved vessel structures in a
+snake-like crawling manner. This approach, consistent with vascular texture
+transformations, ensures the effective and continuous capture of curved
+vascular structure features. Second, we propose an Ambiguity-Driven Dual
+Recalibration (ADDR) module to address the category imbalance problem
+intensified by high-resolution images. Our ADDR module delineates pixels by two
+learnable thresholds and refines ambiguous pixels through a dual-driven
+strategy, thereby accurately distinguishing vessels and background regions.
+Experimental results on three datasets demonstrate the superior performance of
+our Serp-Mamba on high-resolution vessel segmentation. We also conduct a series
+of ablation studies to verify the impact of our designs. Our code shall be
+released upon publication of this work.
+
+
+
+
+
+ + ☆ Computer-Generated Sand Mixtures and Sand-based Images + + +
+
 This paper aims to verify the effectiveness of the software implementation of
+the proposed algorithm in creating computer-generated images of sand mixtures
+using a photograph of sand as an input and its effectiveness in converting
+digital pictures into sand-based images out of the mixtures it generated. The
+method of this paper is to visually compare the photographed images of the
+actual mixtures to their computer-generated counterparts to verify whether the
+mixture generation produces results as expected, and to compare the
+computer-generated sand-based images with their sources to verify that image
+reproduction maintains the same image content. The results of the mixture
+comparison show that the actual and the computer-generated mixtures have
+similar overall shade and color. Still, the generated one has a rougher texture
+and higher contrast due to the method of inheriting visual features by pixel,
+not by individual sand particles. The comparison of the sand-based image and
+its source has demonstrated the software's ability to maintain the essence of
+its contents during conversion while replacing its texture with the visual
+properties of the generated sand mixture. The results show that the software
+implementation of the proposed algorithm can effectively use the images of sand
+to generate images of its mixtures and use those mixture images to convert a
+digital picture into a computer-generated sand-based image.
+
+
+ comment: 12 pages, 8 figures, 2nd International Research Conference on + Computer Engineering and Technology Education +
+
+
+
+
+ + ☆ How to Identify Good Superpixels for Deforestation Detection on Tropical + Rainforests + + +
+ The conservation of tropical forests is a topic of significant social and +ecological relevance due to their crucial role in the global ecosystem. +Unfortunately, deforestation and degradation impact millions of hectares +annually, requiring government or private initiatives for effective forest +monitoring. However, identifying deforested regions in satellite images is +challenging due to data imbalance, image resolution, low-contrast regions, and +occlusion. Superpixel segmentation can overcome these drawbacks, reducing +workload and preserving important image boundaries. However, most works for +remote sensing images do not exploit recent superpixel methods. In this work, +we evaluate 16 superpixel methods in satellite images to support a +deforestation detection system in tropical forests. We also assess the +performance of superpixel methods for the target task, establishing a +relationship with segmentation methodological evaluation. According to our +results, ERS, GMMSP, and DISF perform best on UE, BR, and SIRS, respectively, +whereas ERS has the best trade-off with CO and Reg. In classification, SH, +DISF, and ISF perform best on RGB, UMDA, and PCA compositions, respectively. +According to our experiments, superpixel methods with better trade-offs between +delineation, homogeneity, compactness, and regularity are more suitable for +identifying good superpixels for deforestation detection tasks. + +
+
+ comment: 8 pages, 3 figures, paper accepted for publication at the IEEE GRSL +
+
+
+
+
+ + ☆ Advancing SEM Based Nano-Scale Defect Analysis in Semiconductor + Manufacturing for Advanced IC Nodes ECCV 2024 + + +
+
 In this research, we introduce a unified end-to-end Automated Defect
+Classification-Detection-Segmentation (ADCDS) framework for classifying,
+detecting, and segmenting multiple instances of semiconductor defects for
+advanced nodes. This framework consists of two modules: (a) a defect detection
+module, followed by (b) a defect segmentation module. The defect detection
+module employs Deformable DETR to aid in the classification and detection of
+nano-scale defects, while the segmentation module utilizes BoxSnake. BoxSnake
+facilitates box-supervised instance segmentation of nano-scale defects,
+supported by the former module. This simplifies the process by eliminating the
+laborious requirement for ground-truth pixel-wise mask annotation by human
+experts, which is typically associated with training conventional segmentation
+models. We have evaluated the performance of our ADCDS framework using two
+distinct process datasets from real wafers, namely ADI and AEI, specifically
+focusing on Line-space patterns. We have demonstrated the applicability and
+significance of our proposed methodology, particularly in the nano-scale
+segmentation and generation of binary defect masks, using the challenging ADI
+SEM dataset where ground-truth pixelwise segmentation annotations were
+unavailable. Furthermore, we have presented a comparative analysis of our
+proposed framework against previous approaches to demonstrate its
+effectiveness. Our proposed framework achieved an overall mAP@IoU0.5 of 72.19
+for detection and 78.86 for segmentation on the ADI dataset. Similarly, for the
+AEI dataset, these metrics were 90.38 for detection and 95.48 for segmentation.
+Thus, our proposed framework effectively fulfils the requirements of advanced
+defect analysis while addressing significant constraints.
+
+
+ comment: Accepted in ECCV 2024 2nd workshop on Vision-based InduStrial + InspectiON (VISION) +
+
+
+
+
+ + ☆ FS-MedSAM2: Exploring the Potential of SAM2 for Few-Shot Medical Image + Segmentation without Fine-tuning + + +
+
 The Segment Anything Model 2 (SAM2) has recently demonstrated exceptional
+performance in zero-shot prompt segmentation for natural images and videos.
+However, it faces significant challenges when applied to medical images. Since
+its release, many attempts have been made to adapt SAM2's segmentation
+capabilities to the medical imaging domain. These efforts typically involve
+using a substantial amount of labeled data to fine-tune the model's weights. In
+this paper, we explore SAM2 from a different perspective, by making full use of
+its trained memory attention module and its ability to process mask prompts.
+We introduce FS-MedSAM2, a simple yet effective framework that enables
+SAM2 to achieve superior medical image segmentation in a few-shot setting,
+without the need for fine-tuning. Our framework outperforms the current
+state of the art on two publicly available medical image datasets. The code is
+available at https://github.com/DeepMed-Lab-ECNU/FS_MedSAM2.
+
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Cycle Pixel Difference Network for Crisp Edge Detection + + +
+ Edge detection, as a fundamental task in computer vision, has garnered +increasing attention. The advent of deep learning has significantly advanced +this field. However, recent deep learning-based methods which rely on +large-scale pre-trained weights cannot be trained from scratch, with very +limited research addressing this issue. This paper proposes a novel cycle pixel +difference convolution (CPDC), which effectively integrates image gradient +information with modern convolution operations. Based on the CPDC, we develop a +U-shape encoder-decoder model named CPD-Net, which is a purely end-to-end +network. Additionally, to address the issue of edge thickness produced by most +existing methods, we construct a multi-scale information enhancement module +(MSEM) to enhance the discriminative ability of the model, thereby generating +crisp and clean contour maps. Comprehensive experiments conducted on three +standard benchmarks demonstrate that our method achieves competitive +performance on the BSDS500 dataset (ODS=0.813), NYUD-V2 (ODS=0.760), and BIPED +dataset (ODS=0.898). Our approach provides a novel perspective for addressing +these challenges in edge detection. + +
+
+
+
+
+ + ☆ Hybrid Cost Volume for Memory-Efficient Optical Flow + + +
+
 Current state-of-the-art flow methods are mostly based on dense all-pairs
+cost volumes. However, as image resolution increases, the computational and
+spatial complexity of constructing these cost volumes grows at a quartic rate,
+making these methods impractical for high-resolution images. In this paper, we
+propose a novel Hybrid Cost Volume for memory-efficient optical flow, named
+HCV. To construct HCV, we first propose a Top-k strategy to separate the 4D
+cost volume into two global 3D cost volumes. These volumes significantly reduce
+memory usage while retaining a substantial amount of matching information. We
+further introduce a local 4D cost volume with a local search space to
+supplement the local information for HCV. Based on HCV, we design a
+memory-efficient optical flow network, named HCVFlow. Compared to recurrent
+flow methods based on all-pairs cost volumes, our HCVFlow significantly
+reduces memory consumption while ensuring high accuracy. We validate the
+effectiveness and efficiency of our method on the Sintel and KITTI datasets and
+real-world 4K (2160*3840) resolution images. Extensive experiments show that
+our HCVFlow has very low memory usage and outperforms other memory-efficient
+methods in terms of accuracy. The code is publicly available at
+https://github.com/gangweiX/HCVFlow.
+
+
+ comment: 10 pages, 6 figures +
+
+
+
+
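+
+One possible reading of the Top-k idea in the HCVFlow abstract above: the 4D
+all-pairs correlation is collapsed into two compact 3D volumes by keeping,
+for every source pixel, only the k strongest responses along each target
+axis. The sketch below materialises the full 4D volume for clarity, which a
+memory-efficient implementation would avoid; it is an illustration, not the
+authors' code.
+
+```python
+import torch
+
+def separable_topk_cost_volumes(feat1, feat2, k=8):
+    """Reduce the (B, H*W, H, W) correlation to two (B, H, W, k) volumes."""
+    B, C, H, W = feat1.shape
+    f1 = feat1.flatten(2).transpose(1, 2)                    # (B, H*W, C)
+    f2 = feat2.flatten(2)                                    # (B, C, H*W)
+    corr = torch.bmm(f1, f2).view(B, H * W, H, W) / C ** 0.5
+    cost_h = corr.max(dim=2).values.topk(k, dim=2).values    # collapse rows
+    cost_v = corr.max(dim=3).values.topk(k, dim=2).values    # collapse columns
+    return cost_h.view(B, H, W, k), cost_v.view(B, H, W, k)
+
+h, v = separable_topk_cost_volumes(torch.randn(1, 32, 24, 32),
+                                   torch.randn(1, 32, 24, 32))
+print(h.shape, v.shape)   # (1, 24, 32, 8) twice
+```
+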
+ + ☆ Calibration of Network Confidence for Unsupervised Domain Adaptation + Using Estimated Accuracy + + +
+ This study addresses the problem of calibrating network confidence while +adapting a model that was originally trained on a source domain to a target +domain using unlabeled samples from the target domain. The absence of labels +from the target domain makes it impossible to directly calibrate the adapted +network on the target domain. To tackle this challenge, we introduce a +calibration procedure that relies on estimating the network's accuracy on the +target domain. The network accuracy is first computed on the labeled source +data and then is modified to represent the actual accuracy of the model on the +target domain. The proposed algorithm calibrates the prediction confidence +directly in the target domain by minimizing the disparity between the estimated +accuracy and the computed confidence. The experimental results show that our +method significantly outperforms existing methods, which rely on importance +weighting, across several standard datasets. + +
+
+
+
+
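+
+A minimal sketch of the confidence/accuracy matching described above: a single
+temperature is fitted so that the average max-softmax confidence on unlabeled
+target logits matches the externally estimated target-domain accuracy. The
+squared-disparity objective and optimizer are assumptions; the paper's exact
+formulation may differ.
+
+```python
+import torch
+
+def calibrate_to_estimated_accuracy(logits, est_accuracy, n_steps=200, lr=0.05):
+    """Fit a temperature T so that mean confidence ~= estimated accuracy."""
+    log_t = torch.zeros(1, requires_grad=True)        # T = exp(log_t) > 0
+    opt = torch.optim.Adam([log_t], lr=lr)
+    for _ in range(n_steps):
+        opt.zero_grad()
+        conf = torch.softmax(logits / log_t.exp(), dim=1).max(dim=1).values
+        loss = (conf.mean() - est_accuracy) ** 2       # disparity to minimise
+        loss.backward()
+        opt.step()
+    return log_t.exp().item()
+
+T = calibrate_to_estimated_accuracy(torch.randn(512, 10) * 3, est_accuracy=0.72)
+print("temperature:", T)
+```
+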
+ + ☆ UniDet3D: Multi-dataset Indoor 3D Object Detection + + +
+
 Growing customer demand for smart solutions in robotics and augmented reality
+has attracted considerable attention to 3D object detection from point clouds.
+Yet, existing indoor datasets taken individually are too small and
+insufficiently diverse to train a powerful and general 3D object detection
+model. In the meantime, more general approaches utilizing foundation models are
+still inferior in quality to those based on supervised training for a specific
+task. In this work, we propose UniDet3D, a simple yet effective 3D object
+detection model, which is trained on a mixture of indoor datasets and is
+capable of working in various indoor environments. By unifying different label
+spaces, UniDet3D enables learning a strong representation across multiple
+datasets through a supervised joint training scheme. The proposed network
+architecture is built upon a vanilla transformer encoder, making it easy to
+run, customize and extend the prediction pipeline for practical use. Extensive
+experiments demonstrate that UniDet3D obtains significant gains over existing
+3D object detection methods in 6 indoor benchmarks: ScanNet (+1.1 mAP50),
+ARKitScenes (+19.4 mAP25), S3DIS (+9.1 mAP50), MultiScan (+9.3 mAP50), 3RScan
+(+3.2 mAP50), and ScanNet++ (+2.7 mAP50). Code is available at
+https://github.com/filapro/unidet3d.
+
+
+
+
+
+ + ☆ MpoxMamba: A Grouped Mamba-based Lightweight Hybrid Network for Mpox + Detection + + +
+
 Due to the lack of effective mpox detection tools, the mpox virus continues
+to spread worldwide and has once again been declared a public health emergency
+of international concern by the World Health Organization. Deep learning-based
+mpox detection tools are crucial for alleviating the mpox outbreak. However,
+existing methods have difficulty in achieving a good trade-off between
+detection performance, parameter size, and model complexity, which is crucial
+for practical applications and widespread deployment, especially in
+resource-limited scenarios. Given the success of Mamba in modeling long-range
+dependencies and its linear complexity, we propose a lightweight hybrid
+architecture called MpoxMamba. MpoxMamba utilizes depthwise separable
+convolutions to extract local feature representations in mpox skin lesions, and
+greatly enhances the model's ability to model global contextual information via
+grouped Mamba modules. Experimental results on two widely recognized mpox
+datasets demonstrate that MpoxMamba outperforms existing mpox detection methods
+and state-of-the-art lightweight models. We also developed a web-based online
+application to provide free mpox detection services to the public in epidemic
+areas (http://5227i971s5.goho.co:30290). The source codes of MpoxMamba
+are available at https://github.com/YubiaoYue/MpoxMamba.
+
+
+
+
+
+ + ☆ Diagram Formalization Enhanced Multi-Modal Geometry Problem Solver + + +
+ Mathematical reasoning remains an ongoing challenge for AI models, especially +for geometry problems that require both linguistic and visual signals. As the +vision encoders of most MLLMs are trained on natural scenes, they often +struggle to understand geometric diagrams, performing no better in geometry +problem solving than LLMs that only process text. This limitation is amplified +by the lack of effective methods for representing geometric relationships. To +address these issues, we introduce the Diagram Formalization Enhanced Geometry +Problem Solver (DFE-GPS), a new framework that integrates visual features, +geometric formal language, and natural language representations. We propose a +novel synthetic data approach and create a large-scale geometric dataset, +SynthGeo228K, annotated with both formal and natural language captions, +designed to enhance the vision encoder for a better understanding of geometric +structures. Our framework improves MLLMs' ability to process geometric diagrams +and extends their application to open-ended tasks on the formalgeo7k dataset. + +
+
+
+
+
+ + ☆ Learning to Learn Transferable Generative Attack for Person + Re-Identification + + +
+
 Deep learning-based person re-identification (re-id) models are widely
+employed in surveillance systems and inevitably inherit the vulnerability of
+deep networks to adversarial attacks. Existing attacks merely consider
+cross-dataset and cross-model transferability, ignoring the cross-test
+capability to perturb models trained in different domains. To powerfully
+examine the robustness of real-world re-id models, the Meta Transferable
+Generative Attack (MTGA) method is proposed, which adopts meta-learning
+optimization to promote the generative attacker producing highly transferable
+adversarial examples by learning comprehensively simulated transfer-based
+cross-model&dataset&test black-box meta attack tasks. Specifically,
+cross-model&dataset black-box attack tasks are first mimicked by selecting
+different re-id models and datasets for meta-train and meta-test attack
+processes. As different models may focus on different feature regions, the
+Perturbation Random Erasing module is further devised to prevent the attacker
+from learning to only corrupt model-specific features. To boost the attacker
+learning to possess cross-test transferability, the Normalization Mix strategy
+is introduced to imitate diverse feature embedding spaces by mixing
+multi-domain statistics of target models. Extensive experiments show the
+superiority of MTGA; in particular, in cross-model&dataset and
+cross-model&dataset&test attacks, MTGA outperforms the SOTA methods by
+21.5% and 11.3% in mean mAP drop rate, respectively. The code of MTGA will be
+released after the paper is accepted.
+
+
+
+
+
+ + ☆ Introducing Gating and Context into Temporal Action Detection ECCV 2024 + + +
+ Temporal Action Detection (TAD), the task of localizing and classifying +actions in untrimmed video, remains challenging due to action overlaps and +variable action durations. Recent findings suggest that TAD performance is +dependent on the structural design of transformers rather than on the +self-attention mechanism. Building on this insight, we propose a refined +feature extraction process through lightweight, yet effective operations. +First, we employ a local branch that employs parallel convolutions with varying +window sizes to capture both fine-grained and coarse-grained temporal features. +This branch incorporates a gating mechanism to select the most relevant +features. Second, we introduce a context branch that uses boundary frames as +key-value pairs to analyze their relationship with the central frame through +cross-attention. The proposed method captures temporal dependencies and +improves contextual understanding. Evaluations of the gating mechanism and +context branch on challenging datasets (THUMOS14 and EPIC-KITCHEN 100) show a +consistent improvement over the baseline and existing methods. + +
+
+ comment: Accepted for publication at the ECCV 2024 ABAW Workshop +
+
+
+
+
+ + ☆ GST: Precise 3D Human Body from a Single Image with Gaussian Splatting + Transformers + + +
+ Reconstructing realistic 3D human models from monocular images has +significant applications in creative industries, human-computer interfaces, and +healthcare. We base our work on 3D Gaussian Splatting (3DGS), a scene +representation composed of a mixture of Gaussians. Predicting such mixtures for +a human from a single input image is challenging, as it is a non-uniform +density (with a many-to-one relationship with input pixels) with strict +physical constraints. At the same time, it needs to be flexible to accommodate +a variety of clothes and poses. Our key observation is that the vertices of +standardized human meshes (such as SMPL) can provide an adequate density and +approximate initial position for Gaussians. We can then train a transformer +model to jointly predict comparatively small adjustments to these positions, as +well as the other Gaussians' attributes and the SMPL parameters. We show +empirically that this combination (using only multi-view supervision) can +achieve fast inference of 3D human models from a single image without test-time +optimization, expensive diffusion models, or 3D points supervision. We also +show that it can improve 3D pose estimation by better fitting human models that +account for clothes and other variations. The code is available on the project +website https://abdullahamdi.com/gst/ . + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ LITE: A Paradigm Shift in Multi-Object Tracking with Efficient ReID + Feature Integration ICONIP-2024 + + +
+ The Lightweight Integrated Tracking-Feature Extraction (LITE) paradigm is +introduced as a novel multi-object tracking (MOT) approach. It enhances +ReID-based trackers by eliminating inference, pre-processing, post-processing, +and ReID model training costs. LITE uses real-time appearance features without +compromising speed. By integrating appearance feature extraction directly into +the tracking pipeline using standard CNN-based detectors such as YOLOv8m, LITE +demonstrates significant performance improvements. The simplest implementation +of LITE on top of classic DeepSORT achieves a HOTA score of 43.03% at 28.3 FPS +on the MOT17 benchmark, making it twice as fast as DeepSORT on MOT17 and four +times faster on the more crowded MOT20 dataset, while maintaining similar +accuracy. Additionally, a new evaluation framework for tracking-by-detection +approaches reveals that conventional trackers like DeepSORT remain competitive +with modern state-of-the-art trackers when evaluated under fair conditions. The +code will be available post-publication at https://github.com/Jumabek/LITE. + +
+
+ comment: 15 pages, 6 figures, to be published in ICONIP-2024 +
+
+
+
+
+ + ☆ Reprojection Errors as Prompts for Efficient Scene Coordinate Regression ECCV2024 + + +
+ Scene coordinate regression (SCR) methods have emerged as a promising area of +research due to their potential for accurate visual localization. However, many +existing SCR approaches train on samples from all image regions, including +dynamic objects and texture-less areas. Utilizing these areas for optimization +during training can potentially hamper the overall performance and efficiency +of the model. In this study, we first perform an in-depth analysis to validate +the adverse impacts of these areas. Drawing inspiration from our analysis, we +then introduce an error-guided feature selection (EGFS) mechanism, in tandem +with the use of the Segment Anything Model (SAM). This mechanism seeds low +reprojection areas as prompts and expands them into error-guided masks, and +then utilizes these masks to sample points and filter out problematic areas in +an iterative manner. The experiments demonstrate that our method outperforms +existing SCR approaches that do not rely on 3D information on the Cambridge +Landmarks and Indoor6 datasets. + +
+
+ comment: ECCV2024 +
+
+
+
+
+ + ☆ CISCA and CytoDArk0: a Cell Instance Segmentation and Classification + method for histo(patho)logical image Analyses and a new, open, Nissl-stained + dataset for brain cytoarchitecture studies + + +
+
 Delineating and classifying individual cells in microscopy tissue images is a
+complex task, yet it is a pivotal endeavor in various medical and biological
+investigations. We propose a new deep learning framework (CISCA) for automatic
+cell instance segmentation and classification in histological slices to support
+detailed morphological and structural analysis or straightforward cell counting
+in digital pathology workflows and brain cytoarchitecture studies. At the core
+of CISCA lies a network architecture featuring a lightweight U-Net with three
+heads in the decoder. The first head classifies pixels into boundaries between
+neighboring cells, cell bodies, and background, while the second head regresses
+four distance maps along four directions. The network outputs from the first
+and second heads are integrated through a tailored post-processing step, which
+ultimately yields the segmentation of individual cells. A third head enables
+simultaneous classification of cells into relevant classes, if required. We
+showcase the effectiveness of our method using four datasets, including CoNIC,
+PanNuke, and MoNuSeg, which are publicly available H&E datasets. Additionally,
+we introduce CytoDArk0, a novel dataset consisting of Nissl-stained images of
+the cortex, cerebellum, and hippocampus from mammals belonging to the orders
+Cetartiodactyla and Primates. We evaluate CISCA in comparison to other
+state-of-the-art methods, demonstrating CISCA's robustness and accuracy in
+segmenting and classifying cells across diverse tissue types, magnifications,
+and staining techniques.
+
+
+
+
+
+ + ☆ Optical Coherence Tomography Angiography-OCTA dataset for the study of + Diabetic Retinopathy + + +
+ This study presents a dataset consisting of 268 retinal images from 179 +individuals, including 133 left-eye and 135 right-eye images, collected from +Natasha Eye Care and Research Institute in Pune, Maharashtra, India. The images +were captured using a nonmydriatic Optical Coherence Tomography Angiography +(OCTA) device, specifically the Optovue Avanti Edition machine as per the +protocol mentioned in this paper. Two ophthalmologists then annotated the +images. This dataset can be used by researchers and doctors to develop +automated diagnostic tools for early detection of diabetic retinopathy (DR). + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Secure Traffic Sign Recognition: An Attention-Enabled Universal Image + Inpainting Mechanism against Light Patch Attacks + + +
+
 Traffic sign recognition systems play a crucial role in assisting drivers to
+make informed decisions while driving. However, due to the heavy reliance on
+deep learning technologies, particularly for future connected and autonomous
+driving, these systems are susceptible to adversarial attacks that pose
+significant safety risks to both personal and public transportation. Notably,
+researchers recently identified a new attack vector to deceive sign recognition
+systems: projecting well-designed adversarial light patches onto traffic signs.
+In comparison with traditional adversarial stickers or graffiti, these emerging
+light patches exhibit heightened aggression due to their ease of implementation
+and outstanding stealthiness. To effectively counter this security threat, we
+propose a universal image inpainting mechanism, namely, SafeSign. It relies on
+attention-enabled multi-view image fusion to repair traffic signs contaminated
+by adversarial light patches, thereby ensuring accurate sign recognition.
+Here, we initially explore the fundamental impact of malicious light patches on
+the local and global feature spaces of authentic traffic signs. Then, we design
+a binary mask-based U-Net image generation pipeline outputting diverse
+contaminated sign patterns, to provide our image inpainting model with needed
+training data. Following this, we develop an attention mechanism-enabled neural
+network to jointly utilize the complementary information from multi-view images
+to repair contaminated signs. Finally, extensive experiments are conducted to
+evaluate SafeSign's effectiveness in resisting potential light patch-based
+attacks, bringing an average accuracy improvement of 54.8% in three widely-used
+sign recognition models.
+
+
+
+
+
+ + ☆ Confidence-Aware Document OCR Error Detection + + +
+ Optical Character Recognition (OCR) continues to face accuracy challenges +that impact subsequent applications. To address these errors, we explore the +utility of OCR confidence scores for enhancing post-OCR error detection. Our +study involves analyzing the correlation between confidence scores and error +rates across different OCR systems. We develop ConfBERT, a BERT-based model +that incorporates OCR confidence scores into token embeddings and offers an +optional pre-training phase for noise adjustment. Our experimental results +demonstrate that integrating OCR confidence scores can enhance error detection +capabilities. This work underscores the importance of OCR confidence scores in +improving detection accuracy and reveals substantial disparities in performance +between commercial and open-source OCR technologies. + +
+
+
+
+
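+
+The key idea in the ConfBERT abstract above, injecting OCR confidence scores
+into token embeddings, can be sketched as adding a learned projection of the
+per-token confidence to the usual token embedding. Layer sizes and the exact
+fusion used by ConfBERT are assumptions here.
+
+```python
+import torch
+import torch.nn as nn
+
+class ConfidenceAugmentedEmbedding(nn.Module):
+    """Token embedding plus a learned projection of OCR confidence scores."""
+    def __init__(self, vocab_size=30522, hidden=768):
+        super().__init__()
+        self.tok = nn.Embedding(vocab_size, hidden)
+        self.conf_proj = nn.Linear(1, hidden)
+
+    def forward(self, token_ids, confidences):
+        # token_ids: (B, L) ints; confidences: (B, L) floats in [0, 1]
+        return self.tok(token_ids) + self.conf_proj(confidences.unsqueeze(-1))
+
+emb = ConfidenceAugmentedEmbedding()
+out = emb(torch.randint(0, 30522, (2, 16)), torch.rand(2, 16))
+print(out.shape)   # torch.Size([2, 16, 768])
+```
+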
+ + ☆ Smooth-edged Perturbations Improve Perturbation-based Image Explanations + + +
+ Perturbation-based post-hoc image explanation methods are commonly used to +explain image prediction models by perturbing parts of the input to measure how +those parts affect the output. Due to the intractability of perturbing each +pixel individually, images are typically attributed to larger segments. The +Randomized Input Sampling for Explanations (RISE) method solved this issue by +using smooth perturbation masks. + While this method has proven effective and popular, it has not been +investigated which parts of the method are responsible for its success. This +work tests many combinations of mask sampling, segmentation techniques, +smoothing, and attribution calculation. The results show that the RISE-style +pixel attribution is beneficial to all evaluated methods. Furthermore, it is +shown that attribution calculation is the least impactful parameter. + The implementation of this work is available online: +https://github.com/guspih/post-hoc-image-perturbation. + +
+
+ comment: This manuscript have been submitted to NLDL 2025 +
+
+
+
+
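+
+The smooth RISE-style masks discussed above are commonly generated by
+upsampling small random binary grids and applying a random shift, which gives
+soft edges instead of hard segment boundaries; a sketch with typical (not
+prescribed) parameter values:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def rise_style_masks(n_masks=500, grid=7, size=224, p=0.5):
+    """Random low-res binary grids, bilinearly upsampled and randomly shifted."""
+    cell = size // grid + 1
+    up = (grid + 1) * cell
+    lowres = (torch.rand(n_masks, 1, grid, grid) < p).float()
+    big = F.interpolate(lowres, size=(up, up), mode="bilinear",
+                        align_corners=False)
+    masks = torch.empty(n_masks, 1, size, size)
+    for i in range(n_masks):
+        dx, dy = torch.randint(0, cell, (2,)).tolist()
+        masks[i] = big[i, :, dy:dy + size, dx:dx + size]
+    return masks
+
+masks = rise_style_masks(n_masks=8)
+print(masks.shape, float(masks.min()), float(masks.max()))
+```
+
+The attribution is then typically a confidence-weighted average of the masks
+over the perturbed predictions.
+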
+ + ☆ MixNet: Joining Force of Classical and Modern Approaches Toward the + Comprehensive Pipeline in Motor Imagery EEG Classification + + +
+
 Recent advances in deep learning (DL) have significantly impacted motor
+imagery (MI)-based brain-computer interface (BCI) systems, enhancing the
+decoding of electroencephalography (EEG) signals. However, most studies
+struggle to identify discriminative patterns across subjects during MI tasks,
+limiting MI classification performance. In this article, we propose MixNet, a
+novel classification framework designed to overcome this limitation by
+utilizing spectral-spatial signals from MI data, along with a multitask
+learning architecture named MIN2Net, for classification. Here, the
+spectral-spatial signals are generated using the filter-bank common spatial
+patterns (FBCSPs) method on MI data. Since the multitask learning architecture
+is used for the classification task, the learning in each task may exhibit
+different generalization rates and potential overfitting across tasks. To
+address this issue, we implement adaptive gradient blending, simultaneously
+regulating multiple loss weights and adjusting the learning pace for each task
+based on its generalization/overfitting tendencies. Experimental results on six
+benchmark data sets of different data sizes demonstrate that MixNet
+consistently outperforms all state-of-the-art algorithms in subject-dependent
+and -independent settings. Finally, the low-density EEG MI classification
+results show that MixNet outperforms all state-of-the-art algorithms, offering
+promising implications for Internet of Things (IoT) applications, such as
+lightweight and portable EEG wearable devices based on low-density montages.
+
+
+ comment: Supplementary materials and source codes are available on-line at + https://github.com/Max-Phairot-A/MixNet +
+
+
+
+
+ + ☆ UNIT: Unifying Image and Text Recognition in One Vision Encoder + + +
+ Currently, vision encoder models like Vision Transformers (ViTs) typically +excel at image recognition tasks but cannot simultaneously support text +recognition like human visual recognition. To address this limitation, we +propose UNIT, a novel training framework aimed at UNifying Image and Text +recognition within a single model. Starting with a vision encoder pre-trained +with image recognition tasks, UNIT introduces a lightweight language decoder +for predicting text outputs and a lightweight vision decoder to prevent +catastrophic forgetting of the original image encoding capabilities. The +training process comprises two stages: intra-scale pretraining and inter-scale +finetuning. During intra-scale pretraining, UNIT learns unified representations +from multi-scale inputs, where images and documents are at their commonly used +resolution, to enable fundamental recognition capability. In the inter-scale +finetuning stage, the model introduces scale-exchanged data, featuring images +and documents at resolutions different from the most commonly used ones, to +enhance its scale robustness. Notably, UNIT retains the original vision encoder +architecture, making it cost-free in terms of inference and deployment. +Experiments across multiple benchmarks confirm that our method significantly +outperforms existing methods on document-related tasks (e.g., OCR and DocQA) +while maintaining the performances on natural images, demonstrating its ability +to substantially enhance text recognition without compromising its core image +recognition capabilities. + +
+
+
+
+
+ + ☆ Introducing a Class-Aware Metric for Monocular Depth Estimation: An + Automotive Perspective ECCV + + +
+
 The increasing accuracy reports of metric monocular depth estimation models
+lead to a growing interest from the automotive domain. Current model
+evaluations do not provide deeper insights into the models' performance, also
+in relation to safety-critical or unseen classes. Within this paper, we present
+a novel approach for the evaluation of depth estimation models. Our proposed
+metric leverages three components, a class-wise component, an edge and corner
+image feature component, and a global consistency retaining component. Classes
+are further weighted on their distance in the scene and on criticality for
+automotive applications. In the evaluation, we present the benefits of our
+metric through comparison to classical metrics, class-wise analytics, and the
+retrieval of critical situations. The results show that our metric provides
+deeper insights into model results while fulfilling safety-critical
+requirements. We release the code and weights on the following repository:
+https://github.com/leisemann/ca_mmde
+
+
+ comment: Accepted at the European Conference on Computer Vision (ECCV) 2024 + Workshop on Out Of Distribution Generalization in Computer Vision +
+
+
+
+
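+
+The class-wise component of the metric described above can be pictured as a
+class-weighted absolute relative depth error; the edge/corner and global
+consistency components are omitted, and the weights below are illustrative
+assumptions rather than the paper's values.
+
+```python
+import numpy as np
+
+def class_aware_abs_rel(pred, gt, seg, class_weights, eps=1e-6):
+    """Average per-class AbsRel, combined with class-specific weights."""
+    abs_rel = np.abs(pred - gt) / np.maximum(gt, eps)
+    score, weight_sum = 0.0, 0.0
+    for cls, w in class_weights.items():
+        mask = seg == cls
+        if mask.any():
+            score += w * abs_rel[mask].mean()
+            weight_sum += w
+    return score / max(weight_sum, eps)
+
+rng = np.random.default_rng(0)
+gt = rng.uniform(1, 80, (240, 320))                 # metres
+pred = gt * rng.normal(1.0, 0.05, gt.shape)
+seg = rng.integers(0, 3, gt.shape)                  # e.g. road/vehicle/pedestrian
+print(class_aware_abs_rel(pred, gt, seg, {0: 1.0, 1: 2.0, 2: 4.0}))
+```
+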
+ + ☆ SDformerFlow: Spatiotemporal swin spikeformer for event-based optical + flow estimation + + +
+
 Event cameras generate asynchronous and sparse event streams capturing
+changes in light intensity. They offer significant advantages over conventional
+frame-based cameras, such as a higher dynamic range and a much faster data
+rate, making them particularly useful in scenarios involving fast motion or
+challenging lighting conditions. Spiking neural networks (SNNs) share
+similar asynchronous and sparse characteristics and are well-suited for
+processing data from event cameras. Inspired by the potential of transformers
+and spike-driven transformers (spikeformers) in other computer vision tasks, we
+propose two solutions for fast and robust optical flow estimation for event
+cameras: STTFlowNet and SDformerFlow. STTFlowNet adopts a U-shaped artificial
+neural network (ANN) architecture with spatiotemporal shifted window
+self-attention (swin) transformer encoders, while SDformerFlow presents its
+fully spiking counterpart, incorporating swin spikeformer encoders.
+Furthermore, we present two variants of the spiking version with different
+neuron models. Our work is the first to make use of spikeformers for dense
+optical flow estimation. We conduct end-to-end training for all models using
+supervised learning. Our results yield state-of-the-art performance among
+SNN-based event optical flow methods on both the DSEC and MVSEC datasets, and
+show a significant reduction in power consumption compared to the equivalent
+ANNs.
+
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Site-Specific Color Features of Green Coffee Beans + + +
+ Coffee is one of the most valuable primary commodities. Despite this, the +common selection technique of green coffee beans relies on personnel visual +inspection, which is labor-intensive and subjective. Therefore, an efficient +way to evaluate the quality of beans is needed. In this paper, we demonstrate a +site-independent approach to find site-specific color features of the seed coat +in qualified green coffee beans. We then propose two evaluation schemes for +green coffee beans based on this site-specific color feature of qualified +beans. Due to the site-specific properties of these color features, machine +learning classifiers indicate that compared with the existing evaluation +schemes of beans, our evaluation schemes have the advantages of being simple, +having less computational costs, and having universal applicability. Finally, +this site-specific color feature can distinguish qualified beans from different +growing sites. Moreover, this function can prevent cheating in the coffee +business and is unique to our evaluation scheme of beans. + +
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ☆ D4: Text-guided diffusion model-based domain adaptive data augmentation + for vineyard shoot detection + + +
+ In an agricultural field, plant phenotyping using object detection models is +gaining attention. However, collecting the training data necessary to create +generic and high-precision models is extremely challenging due to the +difficulty of annotation and the diversity of domains. Furthermore, it is +difficult to transfer training data across different crops, and although +machine learning models effective for specific environments, conditions, or +crops have been developed, they cannot be widely applied in actual fields. In +this study, we propose a generative data augmentation method (D4) for vineyard +shoot detection. D4 uses a pre-trained text-guided diffusion model based on a +large number of original images culled from video data collected by unmanned +ground vehicles or other means, and a small number of annotated datasets. The +proposed method generates new annotated images with background information +adapted to the target domain while retaining annotation information necessary +for object detection. In addition, D4 overcomes the lack of training data in +agriculture, including the difficulty of annotation and diversity of domains. +We confirmed that this generative data augmentation method improved the mean +average precision by up to 28.65% for the BBox detection task and the average +precision by up to 13.73% for the keypoint detection task for vineyard shoot +detection. Our generative data augmentation method D4 is expected to +simultaneously solve the cost and domain diversity issues of training data +generation in agriculture and improve the generalization performance of +detection models. + +
+
+
+
+
+ + ☆ COLUMBUS: Evaluating COgnitive Lateral Understanding through + Multiple-choice reBUSes AAAI-25 + + +
+ While visual question-answering (VQA) benchmarks have catalyzed the +development of reasoning techniques, they have focused on vertical thinking. +Effective problem-solving also necessitates lateral thinking, which remains +understudied in AI and has not been used to test visual perception systems. To +bridge this gap, we formulate visual lateral thinking as a multiple-choice +question-answering task and describe a three-step taxonomy-driven methodology +for instantiating task examples. Then, we develop COLUMBUS, a synthetic +benchmark that applies the task pipeline to create QA sets with text and icon +rebus puzzles based on publicly available collections of compounds and common +phrases. COLUMBUS comprises over 1,000 puzzles, each with four answer +candidates. While the SotA vision-language models (VLMs) achieve decent +performance, our evaluation demonstrates a substantial gap between humans and +models. VLMs benefit from human-curated descriptions but struggle to +self-generate such representations at the right level of abstraction. + +
+
+ comment: 18 pages, 10 figures, submitted to AAAI-25 +
+
+
+
+
+ + ☆ EigenSR: Eigenimage-Bridged Pre-Trained RGB Learners for Single + Hyperspectral Image Super-Resolution AAAI 2025 + + +
+
 Single hyperspectral image super-resolution (single-HSI-SR) aims to improve
+the resolution of a single input low-resolution HSI. Due to the bottleneck of
+data scarcity, the development of single-HSI-SR lags far behind that of RGB
+natural images. In recent years, research on RGB SR has shown that models
+pre-trained on large-scale benchmark datasets can greatly improve performance
+on unseen data, which may stand as a remedy for HSI. But how can we transfer
+the pre-trained RGB model to HSI, to overcome the data-scarcity bottleneck?
+Because of the significant difference in the channels between the pre-trained
+RGB model and the HSI, the model cannot focus on the correlation along the
+spectral dimension, thus limiting its applicability to HSI. Inspired by
+the HSI spatial-spectral decoupling, we propose a new framework that first
+fine-tunes the pre-trained model with the spatial components (known as
+eigenimages), and then infers on unseen HSI using an iterative spectral
+regularization (ISR) to maintain the spectral correlation. The advantages of
+our method lie in: 1) we effectively inject the spatial texture processing
+capabilities of the pre-trained RGB model into HSI while keeping spectral
+fidelity, 2) learning in the spectral-decorrelated domain can improve the
+generalizability to spectral-agnostic data, and 3) our inference in the
+eigenimage domain naturally exploits the spectral low-rank property of HSI,
+thereby reducing the complexity. This work bridges the gap between pre-trained
+RGB models and HSI via eigenimages, addressing the issue of limited HSI
+training data, hence the name EigenSR. Extensive experiments show that EigenSR
+outperforms the state-of-the-art (SOTA) methods in both spatial and spectral
+metrics. Our code will be released.
+
+
+ comment: Submitted to AAAI 2025 +
+
+
+
+
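+
+The spatial-spectral decoupling behind the eigenimages in the EigenSR abstract
+above can be sketched with a plain PCA/SVD along the spectral dimension; the
+fine-tuning and iterative spectral regularization steps are not shown, so this
+is only the decoupling, not the full pipeline.
+
+```python
+import numpy as np
+
+def to_eigenimages(hsi):
+    """Split an HSI cube (H, W, C) into spatial eigenimages and a spectral basis."""
+    H, W, C = hsi.shape
+    flat = hsi.reshape(-1, C)
+    mean = flat.mean(axis=0, keepdims=True)
+    _, _, vt = np.linalg.svd(flat - mean, full_matrices=False)
+    eigenimages = ((flat - mean) @ vt.T).reshape(H, W, C)   # spatial components
+    return eigenimages, vt, mean
+
+def from_eigenimages(eigenimages, vt, mean):
+    H, W, C = eigenimages.shape
+    return (eigenimages.reshape(-1, C) @ vt + mean).reshape(H, W, C)
+
+hsi = np.random.rand(32, 32, 31)
+eig, basis, mu = to_eigenimages(hsi)
+print(np.allclose(from_eigenimages(eig, basis, mu), hsi))   # True
+```
+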
+ + ☆ On Evaluation of Vision Datasets and Models using Human Competency + Frameworks + + +
+ Evaluating models and datasets in computer vision remains a challenging task, +with most leaderboards relying solely on accuracy. While accuracy is a popular +metric for model evaluation, it provides only a coarse assessment by +considering a single model's score on all dataset items. This paper explores +Item Response Theory (IRT), a framework that infers interpretable latent +parameters for an ensemble of models and each dataset item, enabling richer +evaluation and analysis beyond the single accuracy number. Leveraging IRT, we +assess model calibration, select informative data subsets, and demonstrate the +usefulness of its latent parameters for analyzing and comparing models and +datasets in computer vision. + +
+
+
+
+
+ + ☆ PlantSeg: A Large-Scale In-the-wild Dataset for Plant Disease + Segmentation + + +
+ Plant diseases pose significant threats to agriculture. It necessitates +proper diagnosis and effective treatment to safeguard crop yields. To automate +the diagnosis process, image segmentation is usually adopted for precisely +identifying diseased regions, thereby advancing precision agriculture. +Developing robust image segmentation models for plant diseases demands +high-quality annotations across numerous images. However, existing plant +disease datasets typically lack segmentation labels and are often confined to +controlled laboratory settings, which do not adequately reflect the complexity +of natural environments. Motivated by this fact, we established PlantSeg, a +large-scale segmentation dataset for plant diseases. PlantSeg distinguishes +itself from existing datasets in three key aspects. (1) Annotation type: Unlike +the majority of existing datasets that only contain class labels or bounding +boxes, each image in PlantSeg includes detailed and high-quality segmentation +masks, associated with plant types and disease names. (2) Image source: Unlike +typical datasets that contain images from laboratory settings, PlantSeg +primarily comprises in-the-wild plant disease images. This choice enhances the +practical applicability, as the trained models can be applied for integrated +disease management. (3) Scale: PlantSeg is extensive, featuring 11,400 images +with disease segmentation masks and an additional 8,000 healthy plant images +categorized by plant type. Extensive technical experiments validate the high +quality of PlantSeg's annotations. This dataset not only allows researchers to +evaluate their image classification methods but also provides a critical +foundation for developing and benchmarking advanced plant disease segmentation +algorithms. + +
+
+
+
+
+ + ☆ MultiCounter: Multiple Action Agnostic Repetition Counting in Untrimmed + Videos ECAI 2024 + + +
+ Multi-instance Repetitive Action Counting (MRAC) aims to estimate the number +of repetitive actions performed by multiple instances in untrimmed videos, +commonly found in human-centric domains like sports and exercise. In this +paper, we propose MultiCounter, a fully end-to-end deep learning framework that +enables simultaneous detection, tracking, and counting of repetitive actions of +multiple human instances. Specifically, MultiCounter incorporates two novel +modules: 1) mixed spatiotemporal interaction for efficient context correlation +across consecutive frames, and 2) task-specific heads for accurate perception +of periodic boundaries and generalization for action-agnostic human instances. +We train MultiCounter on a synthetic dataset called MultiRep generated from +annotated real-world videos. Experiments on the MultiRep dataset validate the +fundamental challenge of MRAC tasks and showcase the superiority of our +proposed model. Compared to ByteTrack+RepNet, a solution that combines an +advanced tracker with a single repetition counter, MultiCounter substantially +improves Period-mAP by 41.0%, reduces AvgMAE by 58.6%, and increases AvgOBO +1.48 times. This sets a new benchmark in the field of MRAC. Moreover, +MultiCounter runs in real-time on a commodity GPU server and is insensitive to +the number of human instances in a video. + +
+
+ comment: Accepted by ECAI 2024 +
+
+
+
+
+ + ☆ Dense Hand-Object(HO) GraspNet with Full Grasping Taxonomy and Dynamics ECCV + + +
+ Existing datasets for 3D hand-object interaction are limited either in the +data cardinality, data variations in interaction scenarios, or the quality of +annotations. In this work, we present a comprehensive new training dataset for +hand-object interaction called HOGraspNet. It is the only real dataset that +captures full grasp taxonomies, providing grasp annotation and wide intraclass +variations. Using grasp taxonomies as atomic actions, their combinations in +space and time can represent complex hand activities around objects. We select +22 rigid objects from the YCB dataset and 8 other compound objects using shape +and size taxonomies, ensuring coverage of all hand grasp configurations. The +dataset includes diverse hand shapes from 99 participants aged 10 to 74, +continuous video frames, and 1.5M sparse RGB-Depth frames with +annotations. It offers labels for 3D hand and object meshes, 3D keypoints, +contact maps, and grasp labels. Accurate hand and object 3D meshes are +obtained by fitting the hand parametric model (MANO) and the hand implicit +function (HALO) to multi-view RGBD frames, with the MoCap system only for +objects. Note that HALO fitting does not require any parameter tuning, enabling +scalability to the dataset's size with comparable accuracy to MANO. We evaluate +HOGraspNet on relevant tasks: grasp classification and 3D hand pose estimation. +The results show performance variations based on grasp type and object class, +indicating the potential importance of the interaction space captured by our +dataset. The provided data aims at learning universal shape priors or +foundation models for 3D hand-object interaction. Our dataset and code are +available at https://hograspnet2024.github.io/. + +
+
+ comment: 14 pages except for references. It will be published at the European + Conference on Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ☆ BFA-YOLO: Balanced multiscale object detection network for multi-view + building facade attachments detection + + +
+ Detection of building facade attachments such as doors, windows, balconies, +air conditioner units, billboards, and glass curtain walls plays a pivotal role +in numerous applications. Building facade attachments detection aids in +building information modeling (BIM) construction and meeting Level of Detail 3 +(LOD3) standards. Yet, it faces challenges like uneven object distribution, +small object detection difficulty, and background interference. To counter +these, we propose BFA-YOLO, a model for detecting facade attachments in +multi-view images. BFA-YOLO incorporates three key innovations: the Feature +Balanced Spindle Module (FBSM) for addressing uneven distribution, the Target +Dynamic Alignment Task Detection Head (TDATH) aimed at improving small object +detection, and the Position Memory Enhanced Self-Attention Mechanism (PMESA) to +combat background interference, with each component specifically designed to +solve its corresponding challenge. The detection efficacy of deep network models +depends heavily on the dataset's characteristics. Existing open source datasets +related to building facades are limited by their single perspective, small +image pool, and incomplete category coverage. We propose a novel method for +building facade attachments detection dataset construction and construct the +BFA-3D dataset for facade attachments detection. The BFA-3D dataset features +multi-view, accurate labels, diverse categories, and detailed classification. +BFA-YOLO surpasses YOLOv8 by 1.8% and 2.9% in mAP@0.5 on the multi-view BFA-3D +and street-view Facade-WHU datasets, respectively. These results underscore +BFA-YOLO's superior performance in detecting facade attachments. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ☆ Towards Energy-Efficiency by Navigating the Trilemma of Energy, Latency, + and Accuracy + + +
+ Extended Reality (XR) enables immersive experiences through untethered +headsets but suffers from stringent battery and resource constraints. +Energy-efficient design is crucial to ensure both longevity and high +performance in XR devices. However, latency and accuracy are often prioritized +over energy, leading to a gap in achieving energy efficiency. This paper +examines scene reconstruction, a key building block for immersive XR +experiences, and demonstrates how energy efficiency can be achieved by +navigating the trilemma of energy, latency, and accuracy. + We explore three classes of energy-oriented optimizations, covering the +algorithm, execution, and data, that reveal a broad design space through +configurable parameters. Our resulting 72 designs expose a wide range of +latency and energy trade-offs, with a smaller range of accuracy loss. We +identify a Pareto-optimal curve and show that the designs on the curve are +achievable only through synergistic co-optimization of all three optimization +classes and by considering the latency and accuracy needs of downstream scene +reconstruction consumers. Our analysis covering various use cases and +measurements on an embedded class system shows that, relative to the baseline, +our designs offer energy benefits of up to 60X with potential latency range of +4X slowdown to 2X speedup. Detailed exploration of a use case across +representative data sequences from ScanNet showed about 25X energy savings with +1.5X latency reduction and negligible reconstruction quality loss. + +
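The Pareto-optimal curve mentioned above can be recovered from any design sweep with a simple dominance filter; the sketch below runs on hypothetical (energy, latency, error) triples rather than the paper's measured designs.

```python
import numpy as np

def pareto_front(points):
    """Return indices of designs not dominated by any other design.
    Each row is (energy, latency, error); lower is better on every axis."""
    pts = np.asarray(points, dtype=float)
    keep = []
    for i, p in enumerate(pts):
        dominated = np.any(np.all(pts <= p, axis=1) & np.any(pts < p, axis=1))
        if not dominated:
            keep.append(i)
    return keep

# Hypothetical design sweep: 72 configurations with random trade-offs.
rng = np.random.default_rng(1)
designs = rng.random((72, 3))
front = pareto_front(designs)
print(f"{len(front)} of {len(designs)} designs are Pareto-optimal")
```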
+
+ comment: ISMAR 2024 +
+
+
+
+
+ + ☆ 3D-GP-LMVIC: Learning-based Multi-View Image Coding with 3D Gaussian + Geometric Priors + + +
+ Multi-view image compression is vital for 3D-related applications. To +effectively model correlations between views, existing methods typically +predict disparity between two views on a 2D plane, which works well for small +disparities, such as in stereo images, but struggles with larger disparities +caused by significant view changes. To address this, we propose a novel +approach: learning-based multi-view image coding with 3D Gaussian geometric +priors (3D-GP-LMVIC). Our method leverages 3D Gaussian Splatting to derive +geometric priors of the 3D scene, enabling more accurate disparity estimation +across views within the compression model. Additionally, we introduce a depth +map compression model to reduce redundancy in geometric information between +views. A multi-view sequence ordering method is also proposed to enhance +correlations between adjacent views. Experimental results demonstrate that +3D-GP-LMVIC surpasses both traditional and learning-based methods in +performance, while maintaining fast encoding and decoding speed. + +
+
+ comment: 19 pages, 8 figures, conference +
+
+
+
+
+ + ☆ Hybrid Mask Generation for Infrared Small Target Detection with + Single-Point Supervision + + +
+ Single-frame infrared small target (SIRST) detection poses a significant +challenge due to the requirement to discern minute targets amidst complex +infrared background clutter. Recently, deep learning approaches have shown +promising results in this domain. However, these methods heavily rely on +extensive manual annotations, which are particularly cumbersome and +resource-intensive for infrared small targets owing to their minute sizes. To +address this limitation, we introduce a Hybrid Mask Generation (HMG) approach +that recovers high-quality masks for each target from only a single-point label +for network training. Specifically, our HMG approach consists of a handcrafted +Points-to-Mask Generation strategy coupled with a pseudo mask updating strategy +to recover and refine pseudo masks from point labels. The Points-to-Mask +Generation strategy comprises two distinct stages: Points-to-Box conversion, +where individual point labels are transformed into bounding boxes, and +subsequently, Box-to-Mask prediction, where these bounding boxes are elaborated +into precise masks. The mask updating strategy integrates the complementary +strengths of handcrafted and deep-learning algorithms to iteratively refine the +initial pseudo masks. Experimental results across three datasets demonstrate +that our method outperforms the existing methods for infrared small target +detection with single-point supervision. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ Qihoo-T2X: An Efficiency-Focused Diffusion Transformer via Proxy Tokens + for Text-to-Any-Task + + +
+ The global self-attention mechanism in diffusion transformers involves +redundant computation due to the sparse and redundant nature of visual +information, and the attention map of tokens within a spatial window shows +significant similarity. To address this redundancy, we propose the Proxy Token +Diffusion Transformer (PT-DiT), which employs sparse representative token +attention (where the number of representative tokens is much smaller than the +total number of tokens) to model global visual information efficiently. +Specifically, in each transformer block, we randomly sample one token from each +spatial-temporal window to serve as a proxy token for that region. The global +semantics are captured through the self-attention of these proxy tokens and +then injected into all latent tokens via cross-attention. Simultaneously, we +introduce window and shift window attention to address the limitations in +detail modeling caused by the sparse attention mechanism. Building on the +well-designed PT-DiT, we further develop the Qihoo-T2X family, which includes a +variety of models for T2I, T2V, and T2MV tasks. Experimental results show that +PT-DiT achieves competitive performance while reducing the computational +complexity in both image and video generation tasks (e.g., a 48% reduction +compared to DiT and a 35% reduction compared to Pixart-alpha). Our source code +is available at https://github.com/360CVGroup/Qihoo-T2X. + +
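A rough PyTorch sketch of the proxy-token mechanism as described (randomly sample one token per window, run self-attention over the proxies, then inject them back into all tokens via cross-attention); this is not the released Qihoo-T2X code, and the module name, dimensions, and window size are illustrative.

```python
import torch
import torch.nn as nn

class ProxyTokenBlock(nn.Module):
    """Sketch of sparse proxy-token attention: one randomly sampled token per
    window carries global context via self-attention, then the proxies are
    injected back into all tokens via cross-attention."""
    def __init__(self, dim=256, heads=8, window=16):
        super().__init__()
        self.window = window
        self.proxy_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x):                        # x: (B, N, C), N divisible by window
        B, N, C = x.shape
        W = N // self.window
        windows = x.view(B, W, self.window, C)
        idx = torch.randint(0, self.window, (B, W, 1, 1), device=x.device)
        proxies = torch.gather(windows, 2, idx.expand(B, W, 1, C)).squeeze(2)  # (B, W, C)
        proxies, _ = self.proxy_attn(proxies, proxies, proxies)  # global semantics
        ctx, _ = self.cross_attn(x, proxies, proxies)            # inject into all tokens
        return x + ctx

tokens = torch.randn(2, 1024, 256)
print(ProxyTokenBlock()(tokens).shape)  # torch.Size([2, 1024, 256])
```

The window and shift-window attention for detail modeling mentioned in the abstract would sit alongside a block like this.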
+
+
+
+
+ + ☆ One-Shot Diffusion Mimicker for Handwritten Text Generation ECCV 2024 + + +
+ Existing handwritten text generation methods often require more than ten +handwriting samples as style references. However, in practical applications, +users tend to prefer a handwriting generation model that operates with just a +single reference sample for its convenience and efficiency. This approach, +known as "one-shot generation", significantly simplifies the process but poses +a significant challenge due to the difficulty of accurately capturing a +writer's style from a single sample, especially when extracting fine details +from the characters' edges amidst sparse foreground and undesired background +noise. To address this problem, we propose a One-shot Diffusion Mimicker +(One-DM) to generate handwritten text that can mimic any calligraphic style +with only one reference sample. Inspired by the fact that high-frequency +information of the individual sample often contains distinct style patterns +(e.g., character slant and letter joining), we develop a novel style-enhanced +module to improve the style extraction by incorporating high-frequency +components from a single sample. We then fuse the style features with the text +content as a merged condition for guiding the diffusion model to produce +high-quality handwritten text images. Extensive experiments demonstrate that +our method can successfully generate handwriting scripts with just one sample +reference in multiple languages, even outperforming previous methods using over +ten samples. Our source code is available at +https://github.com/dailenson/One-DM. + +
+
+ comment: To appear in ECCV 2024 +
+
+
+
+
+ + ☆ DreamForge: Motion-Aware Autoregressive Video Generation for Multi-View + Driving Scenes + + +
+ Recent advances in diffusion models have significantly enhanced the +controllable generation of streetscapes and facilitated downstream +perception and planning tasks. However, challenges such as maintaining temporal +coherence, generating long videos, and accurately modeling driving scenes +persist. Accordingly, we propose DreamForge, an advanced diffusion-based +autoregressive video generation model designed for the long-term generation of +3D-controllable and extensible video. In terms of controllability, our +DreamForge supports flexible conditions such as text descriptions, camera +poses, 3D bounding boxes, and road layouts, while also providing perspective +guidance to produce driving scenes that are both geometrically and contextually +accurate. For consistency, we ensure inter-view consistency through cross-view +attention and temporal coherence via an autoregressive architecture enhanced +with motion cues. Codes will be available at +https://github.com/PJLab-ADG/DriveArena. + +
+
+ comment: Second place solution for W-CODA-Track2 +
+
+
+
+
+ + ☆ Boundary feature fusion network for tooth image segmentation MICCAI + + +
+ Tooth segmentation is a critical technology in the field of medical image +segmentation, with applications ranging from orthodontic treatment to human +body identification and dental pathology assessment. Despite the development of +numerous tooth image segmentation models by researchers, a common shortcoming +is the failure to account for the challenges of blurred tooth boundaries. +Dental diagnostics require precise delineation of tooth boundaries. This paper +introduces an innovative tooth segmentation network that integrates boundary +information to address the issue of indistinct boundaries between teeth and +adjacent tissues. This network's core is its boundary feature extraction +module, which is designed to extract detailed boundary information from +high-level features. Concurrently, the feature cross-fusion module merges +detailed boundary and global semantic information in a synergistic way, +allowing for stepwise layer transfer of feature information. This method +results in precise tooth segmentation. In the most recent STS Data Challenge, +our methodology was rigorously tested and received a commendable overall score +of 0.91. When compared to other existing approaches, this score demonstrates +our method's significant superiority in segmenting tooth boundaries. + +
+
+ comment: MICCAI workshop, see https://link.springer.com/book/9783031723957 +
+
+
+
+
+ + ☆ Bi-modality Images Transfer with a Discrete Process Matching Method + + +
+ Recently, medical image synthesis has gained increasing popularity, along with +the rapid development of generative models. Medical image synthesis aims to +generate an unacquired image modality, often from other observed data +modalities. Synthesized images can be used for clinical diagnostic assistance, +data augmentation for model training and validation, or image quality improvement. +Meanwhile, flow-based models are among the most successful generative +models for their ability to generate realistic and high-quality synthetic +images. However, most flow-based models require calculating flow ordinary +differential equation (ODE) evolution steps in the transfer process, so their +performance is significantly limited by the heavy computation time of a large +number of time iterations. In this paper, we propose a novel flow-based model, +namely Discrete Process Matching (DPM), to accomplish bi-modality image +transfer tasks. Different from other flow-matching-based models, we propose to +utilize both forward and backward ODE flows and to enhance consistency on the +intermediate images of a few discrete time steps, resulting in a transfer process +with far fewer iteration steps while maintaining high-quality generation for +both modalities. Our experiments on three datasets of MRI T1/T2 and CT/MRI +demonstrate that DPM outperforms other state-of-the-art flow-based methods for +bi-modality image synthesis, achieving higher image quality with less +computation time. + +
+
+
+
+
+ + ☆ Generating Faithful and Salient Text from Multimodal Data + + +
+ While large multimodal models (LMMs) have obtained strong performance on many +multimodal tasks, they may still hallucinate while generating text. Their +performance on detecting salient features from visual data is also unclear. In +this paper, we develop a framework to generate faithful and salient text from +mixed-modal data, which includes images and structured data (represented as +knowledge graphs or tables). Specifically, we train a small vision critic model +to identify hallucinated and non-salient features from the image modality. The +critic model also generates a list of salient image features. This information +is used in the post-editing step to improve the generation quality. Experiments +on two datasets show that our framework improves LMMs' generation quality on +both faithfulness and saliency, outperforming recent techniques aimed at +reducing hallucination. + +
+
+
+
+
+ + ☆ FODA-PG for Enhanced Medical Imaging Narrative Generation: Adaptive + Differentiation of Normal and Abnormal Attributes + + +
+ Automatic Medical Imaging Narrative generation aims to alleviate the workload +of radiologists by producing accurate clinical descriptions directly from +radiological images. However, the subtle visual nuances and domain-specific +terminology in medical images pose significant challenges compared to generic +image captioning tasks. Existing approaches often neglect the vital distinction +between normal and abnormal findings, leading to suboptimal performance. In +this work, we propose FODA-PG, a novel Fine-grained Organ-Disease Adaptive +Partitioning Graph framework that addresses these limitations through +domain-adaptive learning. FODA-PG constructs a granular graphical +representation of radiological findings by separating disease-related +attributes into distinct "disease-specific" and "disease-free" categories based +on their clinical significance and location. This adaptive partitioning enables +our model to capture the nuanced differences between normal and pathological +states, mitigating the impact of data biases. By integrating this fine-grained +semantic knowledge into a powerful transformer-based architecture and providing +rigorous mathematical justifications for its effectiveness, FODA-PG generates +precise and clinically coherent reports with enhanced generalization +capabilities. Extensive experiments on the IU-Xray and MIMIC-CXR benchmarks +demonstrate the superiority of our approach over state-of-the-art methods, +highlighting the importance of domain adaptation in medical report generation. + +
+
+
+
+
+ + ♻ ☆ MSLIQA: Enhancing Learning Representations for Image Quality Assessment + through Multi-Scale Learning + + +
+ No-Reference Image Quality Assessment (NR-IQA) remains a challenging task due +to the diversity of distortions and the lack of large annotated datasets. Many +studies have attempted to tackle these challenges by developing more accurate +NR-IQA models, often employing complex and computationally expensive networks, +or by bridging the domain gap between various distortions to enhance +performance on test datasets. In our work, we improve the performance of a +generic lightweight NR-IQA model by introducing a novel augmentation strategy +that boosts its performance by almost 28%. This augmentation strategy enables +the network to better discriminate between different distortions in various +parts of the image by zooming in and out. Additionally, the inclusion of +test-time augmentation further enhances performance, making our lightweight +network's results comparable to the current state-of-the-art models, simply +through the use of augmentations. + +
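What zoom-style augmentation plus test-time augmentation might look like for a lightweight quality regressor is sketched below; the crop scales, input size, view count, and the `model` placeholder are assumptions, not the paper's exact recipe.

```python
import torch
from torchvision import transforms

# Zoom-style training augmentation: random crops at varying scales emulate
# "zooming in", while a plain resize keeps the zoomed-out global view.
zoom_in = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.3, 1.0)),
    transforms.ToTensor(),
])
zoom_out = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def predict_with_tta(model, pil_image, n_views=8):
    """Average quality predictions over several zoomed views (test-time
    augmentation). `model` is any regressor mapping (N, 3, 224, 224) -> (N, 1)."""
    views = [zoom_in(pil_image) for _ in range(n_views - 1)] + [zoom_out(pil_image)]
    batch = torch.stack(views)
    with torch.no_grad():
        scores = model(batch).squeeze(-1)
    return scores.mean().item()
```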
+
+
+
+
+ + ♻ ☆ LAR-IQA: A Lightweight, Accurate, and Robust No-Reference Image Quality + Assessment Model + + +
+ Recent advancements in the field of No-Reference Image Quality Assessment +(NR-IQA) using deep learning techniques demonstrate high performance across +multiple open-source datasets. However, such models are typically very large +and complex, making them poorly suited to real-world deployment, especially +on resource- and battery-constrained mobile devices. To address this +limitation, we propose a compact, lightweight NR-IQA model that achieves +state-of-the-art (SOTA) performance on the ECCV AIM UHD-IQA challenge validation +and test datasets while also being nearly 5.7 times faster than the fastest +SOTA model. Our model features a dual-branch architecture, with each branch +separately trained on synthetically and authentically distorted images, which +enhances the model's generalizability across different distortion types. To +improve robustness under diverse real-world visual conditions, we additionally +incorporate multiple color spaces during the training process. We also +demonstrate the higher accuracy of recently proposed Kolmogorov-Arnold Networks +(KANs) for final quality regression as compared to the conventional Multi-Layer +Perceptrons (MLPs). Our evaluation considering various open-source datasets +highlights the practical, high-accuracy, and robust performance of our proposed +lightweight model. Code: https://github.com/nasimjamshidi/LAR-IQA. + +
+
+
+
+
+ + ♻ ☆ HiPrompt: Tuning-free Higher-Resolution Generation with Hierarchical + MLLM Prompts + + +
+ The potential for higher-resolution image generation using pretrained +diffusion models is immense, yet these models often struggle with issues of +object repetition and structural artifacts especially when scaling to 4K +resolution and higher. We find that the problem is caused by a single prompt +for generation at multiple scales providing insufficient guidance. In response, +we propose HiPrompt, a new tuning-free solution that +tackles the above problems by introducing hierarchical prompts. The +hierarchical prompts offer both global and local guidance. Specifically, the +global guidance comes from the user input that describes the overall content, +while the local guidance utilizes patch-wise descriptions from MLLMs to +elaborately guide the regional structure and texture generation. Furthermore, +during the inverse denoising process, the generated noise is decomposed into +low- and high-frequency spatial components. These components are conditioned on +multiple prompt levels, including detailed patch-wise descriptions and broader +image-level prompts, facilitating prompt-guided denoising under hierarchical +semantic guidance. It further allows the generation to focus more on local +spatial regions and ensures the generated images maintain coherent local and +global semantics, structures, and textures with high definition. Extensive +experiments demonstrate that HiPrompt outperforms state-of-the-art works in +higher-resolution image generation, significantly reducing object repetition +and enhancing structural quality. + +
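The low-/high-frequency split of the generated noise can be sketched with a Fourier-domain mask; the cutoff below is illustrative, and HiPrompt's actual implementation may decompose the noise differently.

```python
import torch
import torch.fft as fft

def split_frequencies(latent, cutoff=0.25):
    """Split a (B, C, H, W) tensor into low- and high-frequency spatial
    components with a centered circular mask in Fourier space."""
    B, C, H, W = latent.shape
    spec = fft.fftshift(fft.fft2(latent), dim=(-2, -1))
    yy, xx = torch.meshgrid(
        torch.linspace(-1, 1, H), torch.linspace(-1, 1, W), indexing="ij")
    mask = ((yy ** 2 + xx ** 2).sqrt() <= cutoff).to(latent.dtype)
    low = fft.ifft2(fft.ifftshift(spec * mask, dim=(-2, -1))).real
    high = latent - low          # exact decomposition: low + high == latent
    return low, high

noise = torch.randn(1, 4, 64, 64)
low, high = split_frequencies(noise)
print(float((low ** 2).sum() / (noise ** 2).sum()))  # share of energy in the low band
```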
+
+ comment: https://liuxinyv.github.io/HiPrompt/ +
+
+
+
+
+ + ♻ ☆ Res-VMamba: Fine-Grained Food Category Visual Classification Using + Selective State Space Models with Deep Residual Learning + + +
+ Food classification is the foundation for developing food vision tasks and +plays a key role in the burgeoning field of computational nutrition. Due to the +complexity of food requiring fine-grained classification, recent academic +research mainly modifies Convolutional Neural Networks (CNNs) and/or Vision +Transformers (ViTs) to perform food category classification. However, to learn +fine-grained features, the CNN backbone needs additional structural design, +whereas ViT, containing the self-attention module, has increased computational +complexity. In recent months, a new Sequence State Space (S4) model, through a +Selection mechanism and computation with a Scan (S6), colloquially termed +Mamba, has demonstrated superior performance and computation efficiency +compared to the Transformer architecture. The VMamba model, which incorporates +the Mamba mechanism into image tasks (such as classification), currently +establishes the state-of-the-art (SOTA) on the ImageNet dataset. In this +research, we introduce an academically underestimated food dataset CNFOOD-241, +and pioneer the integration of a residual learning framework within the VMamba +model to concurrently harness both global and local state features inherent in +the original VMamba architectural design. The research results show that VMamba +surpasses current SOTA models in fine-grained and food classification. The +proposed Res-VMamba further improves the classification accuracy to 79.54% +without pretrained weights. Our findings elucidate that our proposed methodology +establishes a new benchmark for SOTA performance in food recognition on the +CNFOOD-241 dataset. The code can be obtained on GitHub: +https://github.com/ChiShengChen/ResVMamba. + +
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ 3D Gaussian Splatting for Large-scale 3D Surface Reconstruction from + Aerial Images + + +
+ Recently, 3D Gaussian Splatting (3DGS) has garnered significant attention. +However, the unstructured nature of 3DGS poses challenges for large-scale +surface reconstruction from aerial images. To address this gap, we propose the +first large-scale surface reconstruction method for multi-view stereo (MVS) +aerial images based on 3DGS, named Aerial Gaussian Splatting (AGS). Initially, +we introduce a data chunking method tailored for large-scale aerial imagery, +making the modern 3DGS technology feasible for surface reconstruction over +extensive scenes. Additionally, we integrate the Ray-Gaussian Intersection +method to obtain normal and depth information, facilitating geometric +constraints. Finally, we introduce a multi-view geometric consistency +constraint to enhance global geometric consistency and improve reconstruction +accuracy. Our experiments on multiple datasets demonstrate for the first time +that the GS-based technique can match traditional aerial MVS methods on +geometric accuracy, and beat state-of-the-art GS-based methods on geometry and +rendering quality. + +
+
+ comment: In the writing, some parts of the book were wrong and needed a large + revision +
+
+
+
+
+ + ♻ ☆ Training-Free Condition Video Diffusion Models for single frame + Spatial-Semantic Echocardiogram Synthesis MICCAI 2024 + + +
+ Conditional video diffusion models (CDM) have shown promising results for +video synthesis, potentially enabling the generation of realistic +echocardiograms to address the problem of data scarcity. However, current CDMs +require a paired segmentation map and echocardiogram dataset. We present a new +method called Free-Echo for generating realistic echocardiograms from a single +end-diastolic segmentation map without additional training data. Our method is +based on the 3D-Unet with Temporal Attention Layers model and is conditioned on +the segmentation map using a training-free conditioning method based on SDEdit. +We evaluate our model on two public echocardiogram datasets, CAMUS and +EchoNet-Dynamic. We show that our model can generate plausible echocardiograms +that are spatially aligned with the input segmentation map, achieving +performance comparable to training-based CDMs. Our work opens up new +possibilities for generating echocardiograms from a single segmentation map, +which can be used for data augmentation, domain adaptation, and other +applications in medical imaging. Our code is available at +https://github.com/gungui98/echo-free + +
+
+ comment: Accepted to MICCAI 2024 +
+
+
+
+
+ + ♻ ☆ Many-Worlds Inverse Rendering + + +
+ Discontinuous visibility changes remain a major bottleneck when optimizing +surfaces within a physically-based inverse renderer. Many previous works have +proposed sophisticated algorithms and data structures to sample visibility +silhouettes more efficiently. + Our work presents another solution: instead of differentiating a tentative +surface locally, we differentiate a volumetric perturbation of a surface. We +refer to this as a many-worlds representation because it models a non-interacting +superposition of conflicting explanations (worlds) of the input dataset. Each +world is optically isolated from others, leading to a new transport law that +distinguishes our method from prior work based on exponential random media. + The resulting Monte Carlo algorithm is simpler and more efficient than prior +methods. We demonstrate that our method promotes rapid convergence, both in +terms of the total iteration count and the cost per iteration. + +
+
+
+
+
+ + ♻ ☆ Open-Vocabulary Object Detectors: Robustness Challenges under + Distribution Shifts ECCV + + +
+ The challenge of Out-Of-Distribution (OOD) robustness remains a critical +hurdle towards deploying deep vision models. Vision-Language Models (VLMs) have +recently achieved groundbreaking results. VLM-based open-vocabulary object +detection extends the capabilities of traditional object detection frameworks, +enabling the recognition and classification of objects beyond predefined +categories. Investigating OOD robustness in recent open-vocabulary object +detection is essential to increase the trustworthiness of these models. This +study presents a comprehensive robustness evaluation of the zero-shot +capabilities of three recent open-vocabulary (OV) foundation object detection +models: OWL-ViT, YOLO World, and Grounding DINO. Experiments carried out on the +robustness benchmarks COCO-O, COCO-DC, and COCO-C, which encompass distribution +shifts due to information loss, corruption, adversarial attacks, and +geometrical deformation, highlight the robustness challenges of these models +and aim to foster research toward achieving robustness. Project page: +https://prakashchhipa.github.io/projects/ovod_robustness + +
+
+ comment: Accepted at 2024 European Conference on Computer Vision Workshops + (ECCVW). Project page - + https://prakashchhipa.github.io/projects/ovod_robustness +
+
+
+
+
+ + ♻ ☆ The Faiss library + + +
+ Vector databases typically manage large collections of embedding vectors. +Currently, AI applications are growing rapidly, and so is the number of +embeddings that need to be stored and indexed. The Faiss library is dedicated +to vector similarity search, a core functionality of vector databases. Faiss is +a toolkit of indexing methods and related primitives used to search, cluster, +compress and transform vectors. This paper describes the trade-off space of +vector search and the design principles of Faiss in terms of structure, +approach to optimization and interfacing. We benchmark key features of the +library and discuss a few selected applications to highlight its broad +applicability. + +
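For readers unfamiliar with the library, a minimal example of the search primitives described above: exact search with a flat index and approximate search with an inverted-file index. The data here is random toy data; the index classes and calls follow the standard Faiss Python API.

```python
import numpy as np
import faiss  # pip install faiss-cpu

d = 64                                            # embedding dimensionality
xb = np.random.rand(20000, d).astype("float32")   # database vectors
xq = np.random.rand(5, d).astype("float32")       # query vectors

# Exact (brute-force) L2 search.
flat = faiss.IndexFlatL2(d)
flat.add(xb)
D, I = flat.search(xq, 4)                         # distances / ids of 4 nearest neighbours

# Approximate search with an inverted-file index: cluster once, probe a few lists.
nlist = 100
quantizer = faiss.IndexFlatL2(d)
ivf = faiss.IndexIVFFlat(quantizer, d, nlist)
ivf.train(xb)                                     # k-means on the database vectors
ivf.add(xb)
ivf.nprobe = 8                                    # visit 8 of the 100 clusters per query
D2, I2 = ivf.search(xq, 4)
print(I[0], I2[0])
```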
+
+
+
+
+ + ♻ ☆ Zero-Shot Video Editing through Adaptive Sliding Score Distillation + + +
+ The rapidly evolving field of Text-to-Video generation (T2V) has catalyzed +renewed interest in controllable video editing research. While the application +of editing prompts to guide diffusion model denoising has gained prominence, +mirroring advancements in image editing, this noise-based inference process +inherently compromises the original video's integrity, resulting in unintended +over-editing and temporal discontinuities. To address these challenges, this +study proposes a novel paradigm of video-based score distillation, facilitating +direct manipulation of original video content. Specifically, distinguishing it +from image-based score distillation, we propose an Adaptive Sliding Score +Distillation strategy, which incorporates both global and local video guidance +to reduce the impact of editing errors. Combined with our proposed Image-based +Joint Guidance mechanism, it has the ability to mitigate the inherent +instability of the T2V model and single-step sampling. Additionally, we design +a Weighted Attention Fusion module to further preserve the key features of the +original video and avoid over-editing. Extensive experiments demonstrate that +these strategies effectively address existing challenges, achieving superior +performance compared to current state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ UniPortrait: A Unified Framework for Identity-Preserving Single- and + Multi-Human Image Personalization + + +
+ This paper presents UniPortrait, an innovative human image personalization +framework that unifies single- and multi-ID customization with high face +fidelity, extensive facial editability, free-form input description, and +diverse layout generation. UniPortrait consists of only two plug-and-play +modules: an ID embedding module and an ID routing module. The ID embedding +module extracts versatile editable facial features with a decoupling strategy +for each ID and embeds them into the context space of diffusion models. The ID +routing module then combines and distributes these embeddings adaptively to +their respective regions within the synthesized image, achieving the +customization of single and multiple IDs. With a carefully designed two-stage +training scheme, UniPortrait achieves superior performance in both single- and +multi-ID customization. Quantitative and qualitative experiments demonstrate +the advantages of our method over existing approaches as well as its good +scalability, e.g., the universal compatibility with existing generative control +tools. The project page is at +https://aigcdesigngroup.github.io/UniPortrait-Page/ . + +
+
+ comment: Tech report; Project page: + https://aigcdesigngroup.github.io/UniPortrait-Page/ +
+
+
+
+
+ + ♻ ☆ Geospecific View Generation -- Geometry-Context Aware High-resolution + Ground View Inference from Satellite Views + + +
+ Predicting realistic ground views from satellite imagery in urban scenes is a +challenging task due to the significant view gaps between satellite and +ground-view images. We propose a novel pipeline to tackle this challenge, by +generating geospecific views that maximally respect the weak geometry and +texture from multi-view satellite images. Different from existing approaches +that hallucinate images from cues such as partial semantics or geometry from +overhead satellite images, our method directly predicts ground-view images at +geolocation by using a comprehensive set of information from the satellite +image, resulting in ground-level images with a resolution boost at a factor of +ten or more. We leverage a novel building refinement method to reduce geometric +distortions in satellite data at ground level, which ensures the creation of +accurate conditions for view synthesis using diffusion networks. Moreover, we +propose a novel geospecific prior, which prompts distribution learning of +diffusion models to respect image samples that are closer to the geolocation of +the predicted images. We demonstrate our pipeline is the first to generate +close-to-real and geospecific ground views merely based on satellite images. + +
+
+ comment: 11 figures +
+
+
+
+
+ + ♻ ☆ iSeg: An Iterative Refinement-based Framework for Training-free + Segmentation + + +
+ Stable diffusion has demonstrated a strong ability to synthesize images from +given text descriptions, suggesting that it contains strong semantic cues for +grouping objects. Inspired by this, researchers have explored employing stable +diffusion for training-free segmentation. Most existing approaches either simply +employ the cross-attention map or refine it with the self-attention map to generate +segmentation masks. We believe that iterative refinement with the self-attention +map would lead to better results. However, we empirically demonstrate that such +a refinement is sub-optimal, likely because the self-attention map contains +irrelevant global information which hampers accurately refining the cross-attention +map over multiple iterations. To address this, we propose an iterative +refinement framework for training-free segmentation, named iSeg, with an +entropy-reduced self-attention module which utilizes a gradient descent scheme +to reduce the entropy of the self-attention map, thereby suppressing the weak +responses corresponding to irrelevant global information. Leveraging the +entropy-reduced self-attention module, our iSeg stably improves the refined +cross-attention map with iterative refinement. Further, we design a +category-enhanced cross-attention module to generate an accurate cross-attention +map, providing a better initial input for iterative refinement. Extensive +experiments across different datasets and diverse segmentation tasks reveal the +merits of the proposed contributions, leading to promising performance on diverse +segmentation tasks. For unsupervised semantic segmentation on Cityscapes, our +iSeg achieves an absolute gain of 3.8% in mIoU compared to the best +existing training-free approach in the literature. Moreover, our proposed iSeg can +support segmentation with different kinds of images and interactions. + +
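The entropy-reduction idea can be illustrated by a few gradient-descent steps on attention logits; the step size and iteration count below are illustrative, and this is only a sketch of the gist, not iSeg's exact module.

```python
import torch

def reduce_entropy(attn_logits, steps=10, lr=0.5):
    """Take a few gradient-descent steps that lower the row-wise entropy of
    softmax(attn_logits), sharpening the self-attention map."""
    logits = attn_logits.detach().clone().requires_grad_(True)
    for _ in range(steps):
        p = torch.softmax(logits, dim=-1)
        entropy = -(p * torch.log(p + 1e-8)).sum(dim=-1).mean()
        (grad,) = torch.autograd.grad(entropy, logits)
        logits = (logits - lr * grad).detach().requires_grad_(True)
    return torch.softmax(logits, dim=-1)

attn = torch.randn(1, 64, 64)             # a toy (heads x queries x keys) map
sharp = reduce_entropy(attn)
before = -(torch.softmax(attn, -1) * torch.log_softmax(attn, -1)).sum(-1).mean()
after = -(sharp * torch.log(sharp + 1e-8)).sum(-1).mean()
print(float(before), float(after))        # entropy decreases after the updates
```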
+
+ comment: Project Page: https://linsun449.github.io/iSeg/ Code: + https://github.com/linsun449/iseg.code +
+
+
+
+
+ + ♻ ☆ HSTR-Net: Reference Based Video Super-resolution with Dual Cameras + + +
+ High-spatio-temporal resolution (HSTR) video recording plays a crucial role +in enhancing various imagery tasks that require fine-detailed information. +State-of-the-art cameras provide this required high frame-rate and high spatial +resolution together, albeit at a high cost. To alleviate this issue, this paper +proposes a dual camera system for the generation of HSTR video using +reference-based super-resolution (RefSR). One camera captures high spatial +resolution low frame rate (HSLF) video while the other captures low spatial +resolution high frame rate (LSHF) video simultaneously for the same scene. A +novel deep learning architecture is proposed to fuse HSLF and LSHF video feeds +and synthesize HSTR video frames. The proposed model combines optical flow +estimation and (channel-wise and spatial) attention mechanisms to capture the +fine motion and complex dependencies between frames of the two video feeds. +Simulations show that the proposed model provides significant improvement over +existing reference-based SR techniques in terms of PSNR and SSIM metrics. The +method also exhibits sufficient frames per second (FPS) for aerial monitoring +when deployed on a power-constrained drone equipped with dual cameras. + +
+
+ comment: 15 pages, 8 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ LMFLOSS: A Hybrid Loss For Imbalanced Medical Image Classification + + +
+ With advances in digital technology, the classification of medical images has +become a crucial step for image-based clinical decision support systems. +Automatic medical image classification represents a pivotal domain where the +use of AI holds the potential to create a significant social impact. However, +several challenges act as obstacles to the development of practical and +effective solutions. One of these challenges is the prevalent class imbalance +problem in most medical imaging datasets. As a result, existing AI techniques, +particularly deep-learning-based methodologies, often underperform in such +scenarios. In this study, we propose a novel framework called Large Margin +aware Focal (LMF) loss to mitigate the class imbalance problem in medical +imaging. The LMF loss represents a linear combination of two loss functions +optimized by two hyperparameters. This framework harnesses the distinct +characteristics of both loss functions by enforcing wider margins for minority +classes while simultaneously emphasizing challenging samples found in the +datasets. We perform rigorous experiments on three neural network architectures +and with four medical imaging datasets. We provide empirical evidence that our +proposed framework consistently outperforms other baseline methods, showing an +improvement of 2%-9% in macro-f1 scores. Through class-wise analysis of f1 +scores, we also demonstrate how the proposed framework can significantly +improve performance for minority classes. The results of our experiments show +that our proposed framework can perform consistently well across different +architectures and datasets. Overall, our study demonstrates a simple and +effective approach to addressing the class imbalance problem in medical imaging +datasets. We hope our work will inspire new research toward a more generalized +approach to medical image classification. + +
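Based only on the description above (a weighted combination of a large-margin term and a focal term, tuned by two hyperparameters), a plausible PyTorch sketch could look as follows; the margin schedule and default values are assumptions rather than the paper's exact settings.

```python
import torch
import torch.nn.functional as F

class LMFLoss(torch.nn.Module):
    """Sketch: weighted sum of a focal loss and an LDAM-style large-margin
    cross-entropy, the two components described for the LMF loss."""
    def __init__(self, class_counts, alpha=1.0, beta=1.0, gamma=2.0, max_m=0.5, s=30.0):
        super().__init__()
        counts = torch.as_tensor(class_counts, dtype=torch.float)
        m = 1.0 / counts.pow(0.25)                     # larger margin for rarer classes
        self.register_buffer("margins", m * (max_m / m.max()))
        self.alpha, self.beta, self.gamma, self.s = alpha, beta, gamma, s

    def forward(self, logits, target):
        # Focal term: down-weight easy, well-classified samples.
        ce = F.cross_entropy(logits, target, reduction="none")
        pt = torch.exp(-ce)
        focal = ((1.0 - pt) ** self.gamma * ce).mean()
        # Large-margin term: subtract a class-dependent margin from the true-class logit.
        one_hot = F.one_hot(target, logits.size(1)).float()
        margin = self.margins[target].unsqueeze(1)
        ldam = F.cross_entropy(self.s * (logits - one_hot * margin), target)
        return self.alpha * focal + self.beta * ldam

loss_fn = LMFLoss(class_counts=[5000, 500, 50])        # imbalanced 3-class toy setup
logits = torch.randn(8, 3)
target = torch.randint(0, 3, (8,))
print(loss_fn(logits, target).item())
```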
+
+ comment: 21 pages, 4 figures, a detailed version of our previous submission + with additional findings +
+
+
+
+
+ + ♻ ☆ A Unified Representation Framework for the Evaluation of Optical Music + Recognition Systems + + +
+ Modern-day Optical Music Recognition (OMR) is a fairly fragmented field. Most +OMR approaches use datasets that are independent and incompatible with each +other, making it difficult to both combine them and compare recognition systems +built upon them. In this paper we identify the need for a common music +representation language and propose the Music Tree Notation (MTN) format, with +the idea to construct a common endpoint for OMR research that allows +coordination, reuse of technology and fair evaluation of community efforts. +This format represents music as a set of primitives that group together into +higher-abstraction nodes, a compromise between the expression of fully +graph-based and sequential notation formats. We have also developed a specific +set of OMR metrics and a typeset score dataset as a proof of concept of this +idea. + +
+
+ comment: 18 pages, 4 figures, 3 tables, submitted (under review) for the + International Journal in Document Analysis and Recognition +
+
+
+
+
+ + ♻ ☆ RSF-Conv: Rotation-and-Scale Equivariant Fourier Parameterized + Convolution for Retinal Vessel Segmentation + + +
+ Retinal vessel segmentation is of great clinical significance for the +diagnosis of many eye-related diseases, but it is still a formidable challenge +due to the intricate vascular morphology. With the skillful characterization of +the translation symmetry existing in retinal vessels, convolutional neural +networks (CNNs) have achieved great success in retinal vessel segmentation. +However, the rotation-and-scale symmetry, as a more widespread image prior in +retinal vessels, fails to be characterized by CNNs. Therefore, we propose a +rotation-and-scale equivariant Fourier parameterized convolution (RSF-Conv) +specifically for retinal vessel segmentation, and provide the corresponding +equivariance analysis. As a general module, RSF-Conv can be integrated into +existing networks in a plug-and-play manner while significantly reducing the +number of parameters. For instance, we replace the traditional convolution +filters in U-Net and Iter-Net with RSF-Convs, and faithfully conduct +comprehensive experiments. RSF-Conv+U-Net and RSF-Conv+Iter-Net not only have +slight advantages under in-domain evaluation, but more importantly, outperform +all comparison methods by a significant margin under out-of-domain evaluation. +It indicates the remarkable generalization of RSF-Conv, which holds greater +practical clinical significance for the prevalent cross-device and +cross-hospital challenges in clinical practice. To comprehensively demonstrate +the effectiveness of RSF-Conv, we also apply RSF-Conv+U-Net and +RSF-Conv+Iter-Net to retinal artery/vein classification and achieve promising +performance as well, indicating its clinical application potential. + +
+
+
+
+
+ + ♻ ☆ AdaNAT: Exploring Adaptive Policy for Token-Based Image Generation ECCV2024 + + +
+ Recent studies have demonstrated the effectiveness of token-based methods for +visual content generation. As a representative work, non-autoregressive +Transformers (NATs) are able to synthesize images with decent quality in a +small number of steps. However, NATs usually necessitate configuring a +complicated generation policy comprising multiple manually-designed scheduling +rules. These heuristic-driven rules are prone to sub-optimality and come with +the requirements of expert knowledge and labor-intensive efforts. Moreover, +their one-size-fits-all nature cannot flexibly adapt to the diverse +characteristics of each individual sample. To address these issues, we propose +AdaNAT, a learnable approach that automatically configures a suitable policy +tailored for every sample to be generated. Specifically, we formulate the +determination of generation policies as a Markov decision process. Under this +framework, a lightweight policy network for generation can be learned via +reinforcement learning. Importantly, we demonstrate that simple reward designs, +such as FID or pre-trained reward models, may not reliably guarantee the +desired quality or diversity of generated samples. Therefore, we propose an +adversarial reward design to guide the training of policy networks effectively. +Comprehensive experiments on four benchmark datasets, i.e., ImageNet-256 & 512, +MS-COCO, and CC3M, validate the effectiveness of AdaNAT. Code and pre-trained +models will be released at https://github.com/LeapLabTHU/AdaNAT. + +
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ♻ ☆ TaskCLIP: Extend Large Vision-Language Model for Task Oriented Object + Detection + + +
+ Task-oriented object detection aims to find objects suitable for +accomplishing specific tasks. As a challenging task, it requires simultaneous +visual data processing and reasoning under ambiguous semantics. Recent +solutions are mainly all-in-one models. However, the object detection backbones +are pre-trained without text supervision. Thus, to incorporate task +requirements, their intricate models undergo extensive learning on a highly +imbalanced and scarce dataset, resulting in capped performance, laborious +training, and poor generalizability. In contrast, we propose TaskCLIP, a more +natural two-stage design composed of general object detection and task-guided +object selection. Particularly for the latter, we resort to the recently +successful large Vision-Language Models (VLMs) as our backbone, which provides +rich semantic knowledge and a uniform embedding space for images and texts. +Nevertheless, the naive application of VLMs leads to sub-optimal quality, due +to the misalignment between embeddings of object images and their visual +attributes, which are mainly adjective phrases. To this end, we design a +transformer-based aligner after the pre-trained VLMs to re-calibrate both +embeddings. Finally, we employ a trainable score function to post-process the +VLM matching results for object selection. Experimental results demonstrate +that our TaskCLIP outperforms the state-of-the-art DETR-based model TOIST by +3.5% and only requires a single NVIDIA RTX 4090 for both training and +inference. + +
+
+
+
+
+ + ♻ ☆ Video alignment using unsupervised learning of local and global features + + +
+ In this paper, we tackle the problem of video alignment, the process of +matching the frames of a pair of videos containing similar actions. The main +challenge in video alignment is that accurate correspondence should be +established despite the differences in the execution processes and appearances +between the two videos. We introduce an unsupervised method for alignment that +uses global and local features of the frames. In particular, we introduce +effective features for each video frame by means of three machine vision tools: +person detection, pose estimation, and a VGG network. Then the features are +processed and combined to construct a multidimensional time series that +represents the video. The resulting time series are used to align videos of the +same actions using a novel version of dynamic time warping named Diagonalized +Dynamic Time Warping (DDTW). The main advantage of our approach is that no +training is required, which makes it applicable for any new type of action +without any need to collect training samples for it. Additionally, our approach +can be used for framewise labeling of action phases in a dataset with only a +few labeled videos. For evaluation, we considered video synchronization and +phase classification tasks on the Penn Action dataset and a subset of UCF101. +Also, for an effective evaluation of the video synchronization task, we present +a new metric called Enclosed Area Error (EAE). The results show that our method +outperforms previous state-of-the-art methods, such as TCC, and other +self-supervised and weakly supervised methods. + +
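The abstract does not spell out the DDTW recurrence, so the sketch below implements ordinary DTW restricted to a band around the (scaled) diagonal, which captures the "diagonalized" idea under that assumption; the feature sequences are random placeholders.

```python
import numpy as np

def banded_dtw(seq_a, seq_b, band=0.1):
    """DTW between two (T, D) feature sequences, with warping restricted to a
    band around the diagonal (a Sakoe-Chiba-style constraint)."""
    n, m = len(seq_a), len(seq_b)
    radius = max(1, int(band * max(n, m)))
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        j_center = int(round(i * m / n))
        for j in range(max(1, j_center - radius), min(m, j_center + radius) + 1):
            d = np.linalg.norm(seq_a[i - 1] - seq_b[j - 1])
            cost[i, j] = d + min(cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1])
    return cost[n, m]

a = np.random.rand(100, 32)   # per-frame features of video A
b = np.random.rand(120, 32)   # per-frame features of video B
print(banded_dtw(a, b))
```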
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ EgoPoser: Robust Real-Time Egocentric Pose Estimation from Sparse and + Intermittent Observations Everywhere ECCV 2024 + + +
+ Full-body egocentric pose estimation from head and hand poses alone has +become an active area of research to power articulate avatar representations on +headset-based platforms. However, existing methods over-rely on the indoor +motion-capture spaces in which datasets were recorded, while simultaneously +assuming continuous joint motion capture and uniform body dimensions. We +propose EgoPoser to overcome these limitations with four main contributions. 1) +EgoPoser robustly models body pose from intermittent hand position and +orientation tracking only when inside a headset's field of view. 2) We rethink +input representations for headset-based ego-pose estimation and introduce a +novel global motion decomposition method that predicts full-body pose +independent of global positions. 3) We enhance pose estimation by capturing +longer motion time series through an efficient SlowFast module design that +maintains computational efficiency. 4) EgoPoser generalizes across various body +shapes for different users. We experimentally evaluate our method and show that +it outperforms state-of-the-art methods both qualitatively and quantitatively +while maintaining a high inference speed of over 600fps. EgoPoser establishes a +robust baseline for future work where full-body pose estimation no longer needs +to rely on outside-in capture and can scale to large-scale and unseen +environments. + +
+
+ comment: Accepted by ECCV 2024, Code: https://siplab.org/projects/EgoPoser +
+
+
+
+
+ + ♻ ☆ A Survey on Benchmarks of Multimodal Large Language Models + + +
+ Multimodal Large Language Models (MLLMs) are gaining increasing popularity in +both academia and industry due to their remarkable performance in various +applications such as visual question answering, visual perception, +understanding, and reasoning. Over the past few years, significant efforts have +been made to examine MLLMs from multiple perspectives. This paper presents a +comprehensive review of 200 benchmarks and evaluations for MLLMs, focusing on +(1) perception and understanding, (2) cognition and reasoning, (3) specific +domains, (4) key capabilities, and (5) other modalities. Finally, we discuss the +limitations of the current evaluation methods for MLLMs and explore promising +future directions. Our key argument is that evaluation should be regarded as a +crucial discipline to support the development of MLLMs better. For more +details, please visit our GitHub repository: +https://github.com/swordlidev/Evaluation-Multimodal-LLMs-Survey. + +
+
+
+
+
+ + ♻ ☆ Is my Data in your AI Model? Membership Inference Test with Application + to Face Images + + +
+ This article introduces the Membership Inference Test (MINT), a novel +approach that aims to empirically assess if given data was used during the +training of AI/ML models. Specifically, we propose two MINT architectures +designed to learn the distinct activation patterns that emerge when an Audited +Model is exposed to data used during its training process. These architectures +are based on Multilayer Perceptrons (MLPs) and Convolutional Neural Networks +(CNNs). The experimental framework focuses on the challenging task of Face +Recognition, considering three state-of-the-art Face Recognition systems. +Experiments are carried out using six publicly available databases, comprising +over 22 million face images in total. Different experimental scenarios are +considered depending on the context of the AI model to test. Our proposed MINT +approach achieves promising results, with up to 90% accuracy, indicating the +potential to recognize if an AI model has been trained with specific data. The +proposed MINT approach can serve to enforce privacy and fairness in several AI +applications, e.g., revealing if sensitive or private data was used for +training or tuning Large Language Models (LLMs). + +
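The core MINT idea, training a small classifier on the audited model's internal activations to separate training members from non-members, can be sketched with synthetic placeholder activations; the real setup uses face-recognition activations and the paper's MLP/CNN architectures.

```python
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# Placeholder activation vectors from an audited model: rows are samples,
# columns are pooled activations; label 1 = seen during training, 0 = unseen.
rng = np.random.default_rng(0)
members = rng.normal(0.2, 1.0, size=(2000, 512))
non_members = rng.normal(0.0, 1.0, size=(2000, 512))
X = np.vstack([members, non_members])
y = np.concatenate([np.ones(2000), np.zeros(2000)])

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
mint = MLPClassifier(hidden_layer_sizes=(256, 64), max_iter=300)
mint.fit(X_tr, y_tr)
print("membership accuracy:", mint.score(X_te, y_te))
```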
+
+ comment: 12 pages including references and authors +
+
+
+
+
+ + ♻ ☆ Training-free Camera Control for Video Generation + + +
+ We propose a training-free and robust solution to offer camera movement +control for off-the-shelf video diffusion models. Unlike previous work, our +method does not require any supervised finetuning on camera-annotated datasets +or self-supervised training via data augmentation. Instead, it works plug-and-play +with most pretrained video diffusion models and generates camera-controllable +videos with a single image or text prompt as input. Our work is inspired by the +layout prior that intermediate latents hold over generated results: rearranging +the noisy pixels in the latents reallocates the output content as well. Since +camera movement can also be seen as a kind of pixel rearrangement caused by +perspective change, videos can be reorganized to follow specific camera motion +if their noisy latents change accordingly. Building on this, we propose our method +CamTrol, which enables robust camera control for video diffusion models. It is +achieved by a two-stage process. First, we model image layout rearrangement through +explicit camera movement in 3D point cloud space. Second, we generate videos with +camera motion using the layout prior of noisy latents formed by a series of +rearranged images. Extensive experiments demonstrate the robustness of our method +in controlling the camera motion of generated videos. Furthermore, we show that our +method can produce impressive results in generating 3D rotation videos with +dynamic content. Project page at https://lifedecoder.github.io/CamTrol/. + +
+
+
+
+
+ + ♻ ☆ Hyp2Nav: Hyperbolic Planning and Curiosity for Crowd Navigation IROS 2024 + + +
+ Autonomous robots are increasingly becoming a strong fixture in social +environments. Effective crowd navigation requires not only safe yet fast +planning, but should also enable interpretability and computational efficiency +for working in real-time on embedded devices. In this work, we advocate for +hyperbolic learning to enable crowd navigation and we introduce Hyp2Nav. +Different from conventional reinforcement learning-based crowd navigation +methods, Hyp2Nav leverages the intrinsic properties of hyperbolic geometry to +better encode the hierarchical nature of decision-making processes in +navigation tasks. We propose a hyperbolic policy model and a hyperbolic +curiosity module that results in effective social navigation, best success +rates, and returns across multiple simulation settings, using up to 6 times +fewer parameters than competitor state-of-the-art models. With our approach, it +becomes even possible to obtain policies that work in 2-dimensional embedding +spaces, opening up new possibilities for low-resource crowd navigation and +model interpretability. Insightfully, the internal hyperbolic representation of +Hyp2Nav correlates with how much attention the robot pays to the surrounding +crowds, e.g. due to multiple people occluding its pathway or to a few of them +showing colliding plans, rather than to its own planned route. The code is +available at https://github.com/GDam90/hyp2nav. + +
+
+ comment: Accepted as oral at IROS 2024 +
+
+
+
+
+ + ♻ ☆ Gaussian Splatting in Style + + +
+ 3D scene stylization extends the work of neural style transfer to 3D. A vital +challenge in this problem is to maintain the uniformity of the stylized +appearance across multiple views. A vast majority of the previous works achieve +this by training a 3D model for every stylized image and a set of multi-view +images. In contrast, we propose a novel architecture trained on a collection of +style images that, at test time, produces real time high-quality stylized novel +views. We choose the underlying 3D scene representation for our model as 3D +Gaussian splatting. We take the 3D Gaussians and process them using a +multi-resolution hash grid and a tiny MLP to obtain stylized views. The MLP is +conditioned on different style codes for generalization to different styles +during test time. The explicit nature of 3D Gaussians gives us inherent +advantages over NeRF-based methods, including geometric consistency and a fast +training and rendering regime. This enables our method to be useful for various +practical use cases, such as augmented or virtual reality. We demonstrate that +our method achieves state-of-the-art performance with superior visual quality +on various indoor and outdoor real-world data. + +
+
+ comment: GCPR 2024 +
+
+
+
+
+ + ♻ ☆ Barbie: Text to Barbie-Style 3D Avatars + + +
+ Recent advances in text-guided 3D avatar generation have made substantial +progress by distilling knowledge from diffusion models. Despite the plausible +generated appearance, existing methods cannot achieve fine-grained +disentanglement or high-fidelity modeling between inner body and outfit. In +this paper, we propose Barbie, a novel framework for generating 3D avatars that +can be dressed in diverse and high-quality Barbie-like garments and +accessories. Instead of relying on a holistic model, Barbie achieves +fine-grained disentanglement on avatars by semantic-aligned separated models +for human body and outfits. These disentangled 3D representations are then +optimized by different expert models to guarantee the domain-specific fidelity. +To balance geometry diversity and reasonableness, we propose a series of losses +for template-preserving and human-prior evolving. The final avatar is enhanced +by unified texture refinement for superior texture consistency. Extensive +experiments demonstrate that Barbie outperforms existing methods in both +dressed human and outfit generation, supporting flexible apparel combination +and animation. The code will be released for research purposes. Our project +page is: https://xiaokunsun.github.io/Barbie.github.io/. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Scattering-induced entropy boost for highly-compressed optical sensing + and encryption + + +
+ Image sensing often relies on a high-quality machine vision system with a +large field of view and high resolution. It requires fine imaging optics, +incurs high computational costs, and demands a large communication bandwidth +between image sensors and computing units. In this paper, we propose a novel +image-free sensing framework for resource-efficient image classification, where +the required number of measurements can be reduced by up to two orders of +magnitude. In the proposed framework for single-pixel detection, the optical +field for a target is first scattered by an optical diffuser and then +two-dimensionally modulated by a spatial light modulator. The optical diffuser +simultaneously serves as a compressor and an encryptor for the target +information, effectively narrowing the field of view and improving the system's +security. The one-dimensional sequence of intensity values, which is measured +with time-varying patterns on the spatial light modulator, is then used to +extract semantic information based on end-to-end deep learning. The proposed +sensing framework is shown to achieve over 95% accuracy at sampling rates of +1% and 5% for classification on the MNIST dataset and the recognition of +Chinese license plates, respectively, and the framework is up to 24% more +efficient than the approach without an optical diffuser. The proposed framework +represents a significant breakthrough in high-throughput machine intelligence +for scene analysis with low bandwidth, low costs, and strong encryption. + +
+
+
+
+
+ + ♻ ☆ Segment Change Model (SCM) for Unsupervised Change detection in VHR + Remote Sensing Images: a Case Study of Buildings + + +
+ The field of Remote Sensing (RS) widely employs Change Detection (CD) on +very-high-resolution (VHR) images. A majority of extant deep-learning-based +methods hinge on annotated samples to complete the CD process. Recently, the +emergence of Vision Foundation Models (VFMs) has enabled zero-shot predictions +in particular vision tasks. In this work, we propose an unsupervised CD method +named Segment Change Model (SCM), built upon the Segment Anything Model (SAM) +and Contrastive Language-Image Pre-training (CLIP). Our method recalibrates +features extracted at different scales and integrates them in a top-down manner +to enhance discriminative change edges. We further design an innovative +Piecewise Semantic Attention (PSA) scheme, which can offer semantic +representation without training, thereby minimizing the pseudo-change +phenomenon. In experiments on two public datasets, the proposed SCM increases +the mIoU from 46.09% to 53.67% on the LEVIR-CD dataset, and from 47.56% to +52.14% on the WHU-CD dataset. Our code is available at +https://github.com/StephenApX/UCD-SCM. + +
+
+ comment: Published in: IGARSS 2024 - 2024 IEEE International Geoscience and + Remote Sensing Symposium +
+
+
+
+
+ + ♻ ☆ Beware of Validation by Eye: Visual Validation of Linear Trends in + Scatterplots + + +
+ Visual validation of regression models in scatterplots is a common practice +for assessing model quality, yet its efficacy remains unquantified. We +conducted two empirical experiments to investigate individuals' ability to +visually validate linear regression models (linear trends) and to examine the +impact of common visualization designs on validation quality. The first +experiment showed that the level of accuracy for visual estimation of slope +(i.e., fitting a line to data) is higher than for visual validation of slope +(i.e., accepting a shown line). Notably, we found bias toward slopes that are +"too steep" in both cases. This led to the novel insight that participants +naturally assessed regression using orthogonal distances between the points and +the line (i.e., ODR regression) rather than the common vertical distances (OLS +regression). In the second experiment, we investigated whether incorporating +common designs for regression visualization (error lines, bounding boxes, and +confidence intervals) would improve visual validation. Even though error lines +reduced validation bias, results failed to show the desired improvements in +accuracy for any design. Overall, our findings suggest caution in using visual +model validation for linear trends in scatterplots. + +
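+
+ For readers unfamiliar with the OLS/ODR distinction that the finding above
+ rests on, the following sketch (illustrative only, not the authors' analysis
+ code) contrasts the two slope estimates on synthetic data:
+
+     import numpy as np
+
+     rng = np.random.default_rng(0)
+     x = rng.normal(size=200)
+     y = 0.8 * x + rng.normal(scale=0.5, size=200)
+
+     # OLS: minimizes vertical distances from the points to the line.
+     slope_ols = np.cov(x, y, bias=True)[0, 1] / np.var(x)
+
+     # ODR / total least squares: minimizes orthogonal distances, obtained from
+     # the leading right singular vector of the centred point cloud.
+     pts = np.column_stack([x - x.mean(), y - y.mean()])
+     _, _, vt = np.linalg.svd(pts, full_matrices=False)
+     slope_odr = vt[0, 1] / vt[0, 0]
+
+     # The ODR slope is at least as steep as the OLS slope, matching the
+     # "too steep" bias reported above.
+     print(slope_ols, slope_odr)
+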
+
+ comment: Preprint and Author Version of a Full Paper, accepted to the 2024 + IEEE Visualization Conference (VIS) +
+
+
+
+
+ + ♻ ☆ 3D Single-object Tracking in Point Clouds with High Temporal Variation ECCV24 + + +
+ The high temporal variation of the point clouds is the key challenge of 3D +single-object tracking (3D SOT). Existing approaches rely on the assumption +that the shape variation of the point clouds and the motion of the objects +across neighboring frames are smooth, failing to cope with high temporal +variation data. In this paper, we present a novel framework for 3D SOT in point +clouds with high temporal variation, called HVTrack. HVTrack proposes three +novel components to tackle the challenges in the high temporal variation +scenario: 1) A Relative-Pose-Aware Memory module to handle temporal point cloud +shape variations; 2) a Base-Expansion Feature Cross-Attention module to deal +with similar object distractions in expanded search areas; 3) a Contextual +Point Guided Self-Attention module for suppressing heavy background noise. We +construct a dataset with high temporal variation (KITTI-HV) by setting +different frame intervals for sampling in the KITTI dataset. On the KITTI-HV +with 5 frame intervals, our HVTrack surpasses the state-of-the-art tracker +CXTracker by 11.3%/15.7% in Success/Precision. + +
+
+ comment: Accepted by ECCV24 +
+
+
+
+
+ + ♻ ☆ DEVIAS: Learning Disentangled Video Representations of Action and Scene ECCV 2024 + + +
+ Video recognition models often learn scene-biased action representation due +to the spurious correlation between actions and scenes in the training data. +Such models show poor performance when the test data consists of videos with +unseen action-scene combinations. Although scene-debiased action recognition +models might address the issue, they often overlook valuable scene information +in the data. To address this challenge, we propose to learn DisEntangled VIdeo +representations of Action and Scene (DEVIAS), for more holistic video +understanding. We propose an encoder-decoder architecture to learn disentangled +action and scene representations with a single model. The architecture consists +of a disentangling encoder (DE), an action mask decoder (AMD), and a prediction +head. The key to achieving the disentanglement is employing both DE and AMD +during training time. The DE uses the slot attention mechanism to learn +disentangled action and scene representations. For further disentanglement, an +AMD learns to predict action masks, given an action slot. With the resulting +disentangled representations, we can achieve robust performance across diverse +scenarios, including both seen and unseen action-scene combinations. We +rigorously validate the proposed method on the UCF-101, Kinetics-400, and HVU +datasets for the seen, and the SCUBA, HAT, and HVU datasets for unseen +action-scene combination scenarios. Furthermore, DEVIAS provides flexibility to +adjust the emphasis on action or scene information depending on dataset +characteristics for downstream tasks. DEVIAS shows favorable performance in +various downstream tasks: Diving48, Something-Something-V2, UCF-101, and +ActivityNet. The code is available at https://github.com/KHU-VLL/DEVIAS. + +
+
+ comment: Accepted to ECCV 2024 (Oral). Project page : + https://khu-vll.github.io/DEVIAS/ +
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: Accepted at Journal of Machine Learning Research. This paper + integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete + story. In this paper, we improve the writing and organization, and also add + conceptual, empirical, and theoretical improvements over the previous work. + V2: small typo fixes/formatting improvements. V3: improvements from journal + revisions. V4: fix figures +
+
+
+
+
+ + ♻ ☆ DiscoNeRF: Class-Agnostic Object Field for 3D Object Discovery + + +
+ Neural Radiance Fields (NeRFs) have become a powerful tool for modeling 3D +scenes from multiple images. However, NeRFs remain difficult to segment into +semantically meaningful regions. Previous approaches to 3D segmentation of +NeRFs either require user interaction to isolate a single object, or they rely +on 2D semantic masks with a limited number of classes for supervision. As a +consequence, they generalize poorly to class-agnostic masks automatically +generated in real scenes. This is attributable to the ambiguity arising from +zero-shot segmentation, yielding inconsistent masks across views. In contrast, +we propose a method that is robust to inconsistent segmentations and +successfully decomposes the scene into a set of objects of any class. By +introducing a limited number of competing object slots against which masks are +matched, a meaningful object representation emerges that best explains the 2D +supervision and minimizes an additional regularization term. Our experiments +demonstrate the ability of our method to generate 3D panoptic segmentations on +complex scenes, and extract high-quality 3D assets from NeRFs that can then be +used in virtual 3D environments. + +
+
+
+
+
+ + ♻ ☆ UV-Mamba: A DCN-Enhanced State Space Model for Urban Village Boundary + Identification in High-Resolution Remote Sensing Images + + +
+ Owing to the diverse geographical environments, intricate landscapes, and +high-density settlements, the automatic identification of urban village +boundaries using remote sensing images is a highly challenging task. This paper +proposes a novel and efficient neural network model called UV-Mamba for +accurate boundary detection in high-resolution remote sensing images. UV-Mamba +mitigates the memory loss problem in long sequence modeling, which arises in +state space models (SSMs) as image size increases, by incorporating deformable +convolutions (DCN). Its architecture follows an encoder-decoder framework, +comprising an encoder with four deformable state space augmentation (DSSA) +blocks for efficient multi-level semantic extraction and a decoder that +integrates the extracted semantic information. We conducted experiments on the +Beijing and Xi'an datasets, and the results show that UV-Mamba achieves +state-of-the-art performance. Specifically, our model achieves 73.3% and 78.1% +IoU on the Beijing and Xi'an datasets, respectively, representing improvements +of 1.2% and 3.4% IoU over the previous best model, while also being 6x faster +in inference speed and 40x smaller in parameter count. Source code and +pre-trained models are available in the supplementary material. + +
+
+ comment: 5 pages, 4 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Invisible Gas Detection: An RGB-Thermal Cross Attention Network and A + New Benchmark + + +
+ The widespread use of various chemical gases in industrial processes +necessitates effective measures to prevent their leakage during transportation +and storage, given their high toxicity. Thermal infrared-based computer vision +detection techniques provide a straightforward approach to identify gas leakage +areas. However, the development of high-quality algorithms has been challenging +due to the low texture in thermal images and the lack of open-source datasets. +In this paper, we present the RGB-Thermal Cross Attention Network (RT-CAN), +which employs an RGB-assisted two-stream network architecture to integrate +texture information from RGB images and gas area information from thermal +images. Additionally, to facilitate the research of invisible gas detection, we +introduce Gas-DB, an extensive open-source gas detection database including +about 1.3K well-annotated RGB-thermal images with eight variant collection +scenes. Experimental results demonstrate that our method successfully leverages +the advantages of both modalities, achieving state-of-the-art (SOTA) +performance among RGB-thermal methods, surpassing single-stream SOTA models in +terms of accuracy, Intersection of Union (IoU), and F2 metrics by 4.86%, 5.65%, +and 4.88%, respectively. The code and data can be found at +https://github.com/logic112358/RT-CAN. + +
+
+
+
+
+ + ♻ ☆ Frame Interpolation with Consecutive Brownian Bridge Diffusion + + +
+ Recent work in Video Frame Interpolation (VFI) tries to formulate VFI as a +diffusion-based conditional image generation problem, synthesizing the +intermediate frame given a random noise and neighboring frames. Due to the +relatively high resolution of videos, Latent Diffusion Models (LDMs) are +employed as the conditional generation model, where the autoencoder compresses +images into latent representations for diffusion and then reconstructs images +from these latent representations. Such a formulation poses a crucial +challenge: VFI expects that the output is deterministically equal to the ground +truth intermediate frame, but LDMs randomly generate a diverse set of different +images when the model runs multiple times. The reason for the diverse +generation is that the cumulative variance (variance accumulated at each step +of generation) of generated latent representations in LDMs is large. This makes +the sampling trajectory random, resulting in diverse rather than deterministic +generations. To address this problem, we propose our unique solution: Frame +Interpolation with Consecutive Brownian Bridge Diffusion. Specifically, we +propose consecutive Brownian Bridge diffusion that takes a deterministic +initial value as input, resulting in a much smaller cumulative variance of +generated latent representations. Our experiments suggest that our method can +improve together with the improvement of the autoencoder and achieve +state-of-the-art performance in VFI, leaving strong potential for further +enhancement. + +
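+
+ The key property being exploited is that a bridge process is pinned at both
+ endpoints, so its variance t*(T - t)/T vanishes where it matters. Below is a
+ hedged sketch of a plain scalar Brownian bridge; the paper's consecutive
+ bridges operate on latent representations, so treat this only as intuition:
+
+     import numpy as np
+
+     def sample_brownian_bridge(x0, xT, T=1.0, steps=100, sigma=1.0, rng=None):
+         """Path pinned to x0 at t=0 and xT at t=T; variance sigma^2 * t*(T-t)/T."""
+         rng = np.random.default_rng() if rng is None else rng
+         t = np.linspace(0.0, T, steps + 1)
+         dW = rng.normal(scale=np.sqrt(T / steps), size=steps)
+         W = np.concatenate([[0.0], np.cumsum(dW)])      # standard Brownian motion
+         bridge = W - (t / T) * W[-1]                    # force the endpoint back to zero
+         return x0 + (t / T) * (xT - x0) + sigma * bridge
+
+     path = sample_brownian_bridge(x0=0.0, xT=1.0)
+     print(path[0], path[-1])   # both endpoints are deterministic: 0.0 and 1.0
+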
+
+ comment: corrected typo +
+
+
+
+
+ + ♻ ☆ Scaling Diffusion Transformers to 16 Billion Parameters + + +
+ In this paper, we present DiT-MoE, a sparse version of the diffusion +Transformer that is scalable and competitive with dense networks while +exhibiting highly optimized inference. DiT-MoE includes two simple designs: +shared expert routing and an expert-level balance loss, thereby capturing +common knowledge and reducing redundancy among the different routed experts. +When applied to conditional image generation, a deep analysis of expert +specialization yields some interesting observations: (i) expert selection shows +a preference for spatial position and denoising time step, while being +insensitive to class-conditional information; (ii) as the MoE layers go deeper, +the selection of experts gradually shifts from specific spatial positions to +dispersion and balance; (iii) expert specialization tends to be more +concentrated at early time steps and then becomes gradually more uniform in the +second half. We attribute this to the diffusion process, which first models +low-frequency spatial information and then high-frequency complex information. +Based on the above guidance, a series of DiT-MoE models experimentally achieves +performance on par with dense networks yet requires much less computational +load during inference. More encouragingly, we demonstrate the potential of +DiT-MoE with synthesized image data, scaling the diffusion model to 16.5B +parameters and attaining a new SoTA FID-50K score of 1.80 in 512$\times$512 +resolution settings. The project page: https://github.com/feizc/DiT-MoE. + +
+
+
+
+
+ + ♻ ☆ HyperKAN: Kolmogorov-Arnold Networks make Hyperspectral Image + Classificators Smarter + + +
+ In traditional neural network architectures, a multilayer perceptron (MLP) is +typically employed as a classification block following the feature extraction +stage. However, the Kolmogorov-Arnold Network (KAN) presents a promising +alternative to MLP, offering the potential to enhance prediction accuracy. In +this paper, we propose the replacement of linear and convolutional layers of +traditional networks with KAN-based counterparts. These modifications allowed +us to significantly increase the per-pixel classification accuracy for +hyperspectral remote-sensing images. We modified seven different neural network +architectures for hyperspectral image classification and observed a substantial +improvement in the classification accuracy across all the networks. The +architectures considered in the paper include baseline MLP, state-of-the-art 1D +(1DCNN) and 3D convolutional (two different 3DCNN, NM3DCNN), and transformer +(SSFTT) architectures, as well as newly proposed M1DCNN. The greatest effect +was achieved for convolutional networks working exclusively on spectral data, +and the best classification quality was achieved using a KAN-based transformer +architecture. All the experiments were conducted using seven openly available +hyperspectral datasets. Our code is available at +https://github.com/f-neumann77/HyperKAN. + +
+
+
+
+
+ + ♻ ☆ Spatial-frequency Dual-Domain Feature Fusion Network for Low-Light + Remote Sensing Image Enhancement + + +
+ Low-light remote sensing images generally feature high resolution and high +spatial complexity, with continuously distributed surface features in space. +This continuity in scenes leads to extensive long-range correlations in spatial +domains within remote sensing images. Convolutional Neural Networks, which rely +on local correlations for long-distance modeling, struggle to establish +long-range correlations in such images. On the other hand, transformer-based +methods that focus on global information face high computational complexities +when processing high-resolution remote sensing images. From another +perspective, Fourier transform can compute global information without +introducing a large number of parameters, enabling the network to more +efficiently capture the overall image structure and establish long-range +correlations. Therefore, we propose a Dual-Domain Feature Fusion Network (DFFN) +for low-light remote sensing image enhancement. Specifically, this challenging +task of low-light enhancement is divided into two more manageable sub-tasks: +the first phase learns amplitude information to restore image brightness, and +the second phase learns phase information to refine details. To facilitate +information exchange between the two phases, we designed an information fusion +affine block that combines data from different phases and scales. Additionally, +we have constructed two dark light remote sensing datasets to address the +current lack of datasets in dark light remote sensing image enhancement. +Extensive evaluations show that our method outperforms existing +state-of-the-art methods. The code is available at +https://github.com/iijjlk/DFFN. + +
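+
+ The two-phase split relies on the standard Fourier decomposition of an image
+ into amplitude (global brightness/energy) and phase (structure). A minimal
+ sketch of that split and its lossless recombination, with the enhancement
+ networks themselves omitted:
+
+     import numpy as np
+
+     img = np.random.rand(64, 64)        # stand-in for one low-light image channel
+
+     spec = np.fft.fft2(img)
+     amplitude = np.abs(spec)            # brightness/energy: target of the first phase
+     phase = np.angle(spec)              # structure and detail: target of the second phase
+
+     # Recombining the (possibly enhanced) components inverts the transform.
+     recon = np.real(np.fft.ifft2(amplitude * np.exp(1j * phase)))
+     print(np.allclose(recon, img))      # True: the decomposition itself is lossless
+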
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Orthogonal Adaptation for Modular Customization of Diffusion Models + + +
+ Customization techniques for text-to-image models have paved the way for a +wide range of previously unattainable applications, enabling the generation of +specific concepts across diverse contexts and styles. While existing methods +facilitate high-fidelity customization for individual concepts or a limited, +pre-defined set of them, they fall short of achieving scalability, where a +single model can seamlessly render countless concepts. In this paper, we +address a new problem called Modular Customization, with the goal of +efficiently merging customized models that were fine-tuned independently for +individual concepts. This allows the merged model to jointly synthesize +concepts in one image without compromising fidelity or incurring any additional +computational costs. To address this problem, we introduce Orthogonal +Adaptation, a method designed to encourage the customized models, which do not +have access to each other during fine-tuning, to have orthogonal residual +weights. This ensures that during inference time, the customized models can be +summed with minimal interference. Our proposed method is both simple and +versatile, applicable to nearly all optimizable weights in the model +architecture. Through an extensive set of quantitative and qualitative +evaluations, our method consistently outperforms relevant baselines in terms of +efficiency and identity preservation, demonstrating a significant leap toward +scalable customization of diffusion models. + +
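+
+ At a high level, the merging step described above amounts to summing residual
+ weights that were kept (near-)orthogonal during fine-tuning. The toy sketch
+ below only illustrates that summation and an interference check; it is not the
+ authors' implementation, and the random low-rank residuals merely mimic
+ near-orthogonality in high dimensions:
+
+     import torch
+
+     torch.manual_seed(0)
+     d = 64
+     W0 = torch.randn(d, d)                          # shared pretrained weight
+
+     # Residual updates from two independently customized concepts.
+     delta_a = torch.randn(d, 4) @ torch.randn(4, d) * 0.01
+     delta_b = torch.randn(d, 4) @ torch.randn(4, d) * 0.01
+
+     cos = (delta_a * delta_b).sum() / (delta_a.norm() * delta_b.norm())
+     print(f"cosine between residuals: {cos:.4f}")   # close to 0 => little interference
+
+     # Merging is a plain sum, so each concept's behaviour is largely preserved.
+     W_merged = W0 + delta_a + delta_b
+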
+
+ comment: Project page: https://ryanpo.com/ortha/; Hugging Face Demo: + https://huggingface.co/spaces/ujin-song/ortha +
+
+
+
+
+ + ♻ ☆ QuEST: Low-bit Diffusion Model Quantization via Efficient Selective + Finetuning + + +
+ The practical deployment of diffusion models still suffers from the high +memory and time overhead. While quantization paves a way for compression and +acceleration, existing methods unfortunately fail when the models are quantized +to low-bits. In this paper, we empirically unravel three properties in +quantized diffusion models that compromise the efficacy of current methods: +imbalanced activation distributions, imprecise temporal information, and +vulnerability to perturbations of specific modules. To alleviate the +intensified low-bit quantization difficulty stemming from the distribution +imbalance, we propose finetuning the quantized model to better adapt to the +activation distribution. Building on this idea, we identify two critical types +of quantized layers: those holding vital temporal information and those +sensitive to reduced bit-width, and finetune them to mitigate performance +degradation with efficiency. We empirically verify that our approach modifies +the activation distribution and provides meaningful temporal information, +facilitating easier and more accurate quantization. Our method is evaluated +over three high-resolution image generation tasks and achieves state-of-the-art +performance under various bit-width settings, as well as being the first method +to generate readable images on full 4-bit (i.e. W4A4) Stable Diffusion. Code is +available \href{https://github.com/hatchetProject/QuEST}{here}. + +
+
+ comment: Code available at https://github.com/hatchetProject/QuEST +
+
+
+
+
+ + ♻ ☆ RMT-BVQA: Recurrent Memory Transformer-based Blind Video Quality + Assessment for Enhanced Video Content ECCV 2024 + + +
+ With recent advances in deep learning, numerous algorithms have been +developed to enhance video quality, reduce visual artifacts, and improve +perceptual quality. However, little research has been reported on the quality +assessment of enhanced content - the evaluation of enhancement methods is often +based on quality metrics that were designed for compression applications. In +this paper, we propose a novel blind deep video quality assessment (VQA) method +specifically for enhanced video content. It employs a new Recurrent Memory +Transformer (RMT) based network architecture to obtain video quality +representations, which is optimized through a novel content-quality-aware +contrastive learning strategy based on a new database containing 13K training +patches with enhanced content. The extracted quality representations are then +combined through linear regression to generate video-level quality indices. The +proposed method, RMT-BVQA, has been evaluated on the VDPVE (VQA Dataset for +Perceptual Video Enhancement) database through a five-fold cross validation. +The results show its superior correlation performance when compared to ten +existing no-reference quality metrics. + +
+
+ comment: This paper has been accepted by the ECCV 2024 AIM Advances in Image + Manipulation workshop +
+
+
+
+
+
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ A Survey on Knowledge Organization Systems of Research Fields: Resources + and Challenges + + +
+ Knowledge Organization Systems (KOSs), such as term lists, thesauri, +taxonomies, and ontologies, play a fundamental role in categorising, managing, +and retrieving information. In the academic domain, KOSs are often adopted for +representing research areas and their relationships, primarily aiming to +classify research articles, academic courses, patents, books, scientific +venues, domain experts, grants, software, experiment materials, and several +other relevant products and agents. These structured representations of +research areas, widely embraced by many academic fields, have proven effective +in empowering AI-based systems to i) enhance retrievability of relevant +documents, ii) enable advanced analytic solutions to quantify the impact of +academic research, and iii) analyse and forecast research dynamics. This paper +aims to present a comprehensive survey of the current KOS for academic +disciplines. We analysed and compared 45 KOSs according to five main +dimensions: scope, structure, curation, usage, and links to other KOSs. Our +results reveal a very heterogeneous scenario in terms of scope, scale, quality, +and usage, highlighting the need for more integrated solutions for representing +research knowledge across academic fields. We conclude by discussing the main +challenges and the most promising future directions. + +
+
+
+
+
+ + ☆ How Fair is Your Diffusion Recommender Model? + + +
+ Diffusion-based recommender systems have recently proven to outperform +traditional generative recommendation approaches, such as variational +autoencoders and generative adversarial networks. Nevertheless, the machine +learning literature has raised several concerns regarding the possibility that +diffusion models, while learning the distribution of data samples, may +inadvertently carry information bias and lead to unfair outcomes. In light of +this aspect, and considering the relevance that fairness has held in +recommendations over the last few decades, we conduct one of the first fairness +investigations in the literature on DiffRec, a pioneer approach in +diffusion-based recommendation. First, we propose an experimental setting +involving DiffRec (and its variant L-DiffRec) along with nine state-of-the-art +recommendation models, two popular recommendation datasets from the +fairness-aware literature, and six metrics accounting for accuracy and +consumer/provider fairness. Then, we perform a twofold analysis, one assessing +models' performance under accuracy and recommendation fairness separately, and +the other identifying if and to what extent such metrics can strike a +performance trade-off. Experimental results from both studies confirm the +initial unfairness warnings but pave the way for how to address them in future +research directions. + +
+
+
+
+
+ + ☆ Enhancing Sequential Music Recommendation with Personalized Popularity + Awareness RecSys'24 + + +
+ In the realm of music recommendation, sequential recommender systems have +shown promise in capturing the dynamic nature of music consumption. +Nevertheless, traditional Transformer-based models, such as SASRec and +BERT4Rec, while effective, encounter challenges due to the unique +characteristics of music listening habits. In fact, existing models struggle to +create a coherent listening experience due to rapidly evolving preferences. +Moreover, music consumption is characterized by a prevalence of repeated +listening, i.e., users frequently return to their favourite tracks, an +important signal that could be framed as individual or personalized popularity. + This paper addresses these challenges by introducing a novel approach that +incorporates personalized popularity information into sequential +recommendation. By combining user-item popularity scores with model-generated +scores, our method effectively balances the exploration of new music with the +satisfaction of user preferences. Experimental results demonstrate that a +Personalized Most Popular recommender, a method solely based on user-specific +popularity, outperforms existing state-of-the-art models. Furthermore, +augmenting Transformer-based models with personalized popularity awareness +yields superior performance, showing improvements ranging from 25.2% to 69.8%. +The code for this paper is available at +https://github.com/sisinflab/personalized-popularity-awareness. + +
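+
+ As a hedged sketch of the combination step (the exact scoring and weighting
+ rules are assumptions here, not taken from the released code), blending a
+ user's personal play counts with model-generated scores might look like this:
+
+     from collections import Counter
+
+     def blend_scores(model_scores, user_history, alpha=0.5):
+         """Mix model relevance scores with a user's personal listening counts."""
+         plays = Counter(user_history)
+         total = sum(plays.values()) or 1
+         blended = {}
+         for track, score in model_scores.items():
+             personal_pop = plays[track] / total     # how often this user replayed the track
+             blended[track] = alpha * personal_pop + (1 - alpha) * score
+         return sorted(blended, key=blended.get, reverse=True)
+
+     model_scores = {"track_a": 0.9, "track_b": 0.4, "track_c": 0.2}
+     user_history = ["track_c", "track_c", "track_b", "track_c"]
+     print(blend_scores(model_scores, user_history))  # repeated listens push track_c up
+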
+
+ comment: Accepted by RecSys'24 as an LBR paper +
+
+
+
+
+ + ☆ WarpAdam: A new Adam optimizer based on Meta-Learning approach + + +
+ Optimal selection of optimization algorithms is crucial for training deep +learning models. The Adam optimizer has gained significant attention due to its +efficiency and wide applicability. However, to enhance the adaptability of +optimizers across diverse datasets, we propose an innovative optimization +strategy by integrating the 'warped gradient descent' concept from Meta +Learning into the Adam optimizer. In the conventional Adam optimizer, gradients +are utilized to compute estimates of gradient mean and variance, subsequently +updating model parameters. Our approach introduces a learnable distortion +matrix, denoted as P, which is employed for linearly transforming gradients. +This transformation slightly adjusts gradients during each iteration, enabling +the optimizer to better adapt to distinct dataset characteristics. By learning +an appropriate distortion matrix P, our method aims to adaptively adjust +gradient information across different data distributions, thereby enhancing +optimization performance. Our research showcases the potential of this novel +approach through theoretical insights and empirical evaluations. Experimental +results across various tasks and datasets validate the superiority, in terms of +adaptability, of our optimizer that integrates the 'warped gradient descent' +concept. Furthermore, we explore effective strategies for training the +adaptation matrix P and identify scenarios where this method can yield optimal +results. In summary, this study introduces an innovative approach that merges +the 'warped gradient descent' concept from Meta Learning with the Adam +optimizer. By introducing a learnable distortion matrix P within the optimizer, +we aim to enhance the model's generalization capability across diverse data +distributions, thus opening up new possibilities in the field of deep learning +optimization. + +
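+
+ To make the description concrete, here is a hedged sketch of an Adam-style
+ update in which a matrix P linearly transforms the gradient before the moment
+ estimates are formed; exactly where the paper inserts P and how P itself is
+ trained are not specified in the abstract, so this is an illustrative reading
+ only:
+
+     import numpy as np
+
+     class WarpedAdamSketch:
+         """Adam on a flat parameter vector, with gradients warped by a matrix P."""
+
+         def __init__(self, dim, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
+             self.P = np.eye(dim)       # learnable distortion matrix (identity = plain Adam)
+             self.m = np.zeros(dim)
+             self.v = np.zeros(dim)
+             self.t = 0
+             self.lr, self.betas, self.eps = lr, betas, eps
+
+         def step(self, params, grad):
+             self.t += 1
+             b1, b2 = self.betas
+             g = self.P @ grad                      # warp the raw gradient
+             self.m = b1 * self.m + (1 - b1) * g
+             self.v = b2 * self.v + (1 - b2) * g ** 2
+             m_hat = self.m / (1 - b1 ** self.t)    # bias correction, as in standard Adam
+             v_hat = self.v / (1 - b2 ** self.t)
+             return params - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
+
+     opt = WarpedAdamSketch(dim=3)
+     print(opt.step(np.zeros(3), np.array([0.1, -0.2, 0.3])))
+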
+
+
+
+
+ + ☆ Refining Wikidata Taxonomy using Large Language Models + + +
+ Due to its collaborative nature, Wikidata is known to have a complex +taxonomy, with recurrent issues like the ambiguity between instances and +classes, the inaccuracy of some taxonomic paths, the presence of cycles, and +the high level of redundancy across classes. Manual efforts to clean up this +taxonomy are time-consuming and prone to errors or subjective decisions. We +present WiKC, a new version of Wikidata taxonomy cleaned automatically using a +combination of Large Language Models (LLMs) and graph mining techniques. +Operations on the taxonomy, such as cutting links or merging classes, are +performed with the help of zero-shot prompting on an open-source LLM. The +quality of the refined taxonomy is evaluated from both intrinsic and extrinsic +perspectives, on a task of entity typing for the latter, showing the practical +interest of WiKC. + +
+
+ comment: ACM International Conference on Information and Knowledge Management, + Oct 2024, Boise, Idaho, United States +
+
+
+
+
+ + ☆ Preserving Individuality while Following the Crowd: Understanding the + Role of User Taste and Crowd Wisdom in Online Product Rating Prediction + + +
+ Numerous algorithms have been developed for online product rating prediction, +but the specific influence of user and product information in determining the +final prediction score remains largely unexplored. Existing research often +relies on narrowly defined data settings, which overlooks real-world challenges +such as the cold-start problem, cross-category information utilization, and +scalability and deployment issues. To delve deeper into these aspects, and +particularly to uncover the roles of individual user taste and collective +wisdom, we propose a unique and practical approach that emphasizes historical +ratings at both the user and product levels, encapsulated using a continuously +updated dynamic tree representation. This representation effectively captures +the temporal dynamics of users and products, leverages user information across +product categories, and provides a natural solution to the cold-start problem. +Furthermore, we have developed an efficient data processing strategy that makes +this approach highly scalable and easily deployable. Comprehensive experiments +in real industry settings demonstrate the effectiveness of our approach. +Notably, our findings reveal that individual taste dominates over collective +wisdom in online product rating prediction, a perspective that contrasts with +the commonly observed wisdom of the crowd phenomenon in other domains. This +dominance of individual user taste is consistent across various model types, +including the boosting tree model, recurrent neural network (RNN), and +transformer-based architectures. This observation holds true across the overall +population, within individual product categories, and in cold-start scenarios. +Our findings underscore the significance of individual user tastes in the +context of online product rating prediction and the robustness of our approach +across different model architectures. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ A Unified Framework for Cross-Domain Recommendation + + +
+ In addressing the persistent challenges of data-sparsity and cold-start +issues in domain-expert recommender systems, Cross-Domain Recommendation (CDR) +emerges as a promising methodology. CDR aims at enhancing prediction +performance in the target domain by leveraging interaction knowledge from +related source domains, particularly through users or items that span across +multiple domains (e.g., Short-Video and Living-Room). For academic research +purposes, there are a number of distinct aspects to guide CDR method design, +including the auxiliary domain number, domain-overlapped element, user-item +interaction types, and downstream tasks. With so many different CDR combination +scenario settings, the proposed scenario-expert approaches are tailored to +address a specific vertical CDR scenario, and often lack the capacity to adapt +to multiple horizontal scenarios. In an effort to coherently adapt to various +scenarios, and drawing inspiration from the concept of domain-invariant +transfer learning, we extend the former SOTA model UniCDR in five different +aspects, yielding UniCDR+. Our work was successfully deployed on the Kuaishou +Living-Room RecSys. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Zero-Shot Topic Classification of Column Headers: Leveraging LLMs for + Metadata Enrichment + + +
+ Traditional dataset retrieval systems rely on metadata for indexing, rather +than on the underlying data values. However, high-quality metadata creation and +enrichment often require manual annotations, which is a labour-intensive and +challenging process to automate. In this study, we propose a method to support +metadata enrichment using topic annotations generated by three Large Language +Models (LLMs): ChatGPT-3.5, GoogleBard, and GoogleGemini. Our analysis focuses +on classifying column headers based on domain-specific topics from the +Consortium of European Social Science Data Archives (CESSDA), a Linked Data +controlled vocabulary. Our approach operates in a zero-shot setting, +integrating the controlled topic vocabulary directly within the input prompt. +This integration serves as a Large Context Windows approach, with the aim of +improving the results of the topic classification task. + We evaluated the performance of the LLMs in terms of internal consistency, +inter-machine alignment, and agreement with human classification. Additionally, +we investigate the impact of contextual information (i.e., dataset description) +on the classification outcomes. Our findings suggest that ChatGPT and +GoogleGemini outperform GoogleBard in terms of internal consistency as well as +LLM-human-agreement. Interestingly, we found that contextual information had no +significant impact on LLM performance. + This work proposes a novel approach that leverages LLMs for topic +classification of column headers using a controlled vocabulary, presenting a +practical application of LLMs and Large Context Windows within the Semantic Web +domain. This approach has the potential to facilitate automated metadata +enrichment, thereby enhancing dataset retrieval and the Findability, +Accessibility, Interoperability, and Reusability (FAIR) of research data on the +Web. + +
+
+
+
+
+ + ♻ ☆ RAG based Question-Answering for Contextual Response Prediction System CIKM'24 + + +
+ Large Language Models (LLMs) have shown versatility in various Natural +Language Processing (NLP) tasks, including their potential as effective +question-answering systems. However, to provide precise and relevant +information in response to specific customer queries in industry settings, LLMs +require access to a comprehensive knowledge base to avoid hallucinations. +Retrieval Augmented Generation (RAG) emerges as a promising technique to +address this challenge. Yet, developing an accurate question-answering +framework for real-world applications using RAG entails several challenges: 1) +data availability issues, 2) evaluating the quality of generated content, and +3) the costly nature of human evaluation. In this paper, we introduce an +end-to-end framework that employs LLMs with RAG capabilities for industry use +cases. Given a customer query, the proposed system retrieves relevant knowledge +documents and leverages them, along with previous chat history, to generate +response suggestions for customer service agents in the contact centers of a +major retail company. Through comprehensive automated and human evaluations, we +show that this solution outperforms the current BERT-based algorithms in +accuracy and relevance. Our findings suggest that RAG-based LLMs can be an +excellent support to human customer service representatives by lightening their +workload. + +
+
+ comment: Accepted at the 1st Workshop on GenAI and RAG Systems for Enterprise, + CIKM'24. 6 pages +
+
+
+
+
+ + ♻ ☆ Hybrid Semantic Search: Unveiling User Intent Beyond Keywords + + +
+ This paper addresses the limitations of traditional keyword-based search in +understanding user intent and introduces a novel hybrid search approach that +leverages the strengths of non-semantic search engines, Large Language Models +(LLMs), and embedding models. The proposed system integrates keyword matching, +semantic vector embeddings, and LLM-generated structured queries to deliver +highly relevant and contextually appropriate search results. By combining these +complementary methods, the hybrid approach effectively captures both explicit +and implicit user intent. The paper further explores techniques to optimize +query execution for faster response times and demonstrates the effectiveness of +this hybrid search model in producing comprehensive and accurate search +outcomes. + +
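+
+ As a hedged sketch of the fusion idea (the concrete scoring functions, the
+ weighting, and the LLM query-structuring step are assumptions, not details
+ from the paper), keyword and embedding signals can be blended per document:
+
+     import numpy as np
+
+     def hybrid_rank(query_terms, query_vec, docs, alpha=0.4):
+         """Rank docs by a weighted mix of keyword overlap and embedding similarity."""
+         ranked = []
+         for doc_id, (text, vec) in docs.items():
+             tokens = text.lower().split()
+             keyword = sum(t in tokens for t in query_terms) / max(len(query_terms), 1)
+             semantic = float(np.dot(query_vec, vec) /
+                              (np.linalg.norm(query_vec) * np.linalg.norm(vec) + 1e-9))
+             ranked.append((alpha * keyword + (1 - alpha) * semantic, doc_id))
+         return [doc_id for _, doc_id in sorted(ranked, reverse=True)]
+
+     docs = {
+         "d1": ("cheap flights to tokyo", np.array([0.9, 0.1])),
+         "d2": ("budget travel guide japan", np.array([0.8, 0.3])),
+     }
+     print(hybrid_rank(["cheap", "tokyo"], np.array([0.85, 0.2]), docs))
+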
+
+
+
+
+ + ♻ ☆ Hi-Gen: Generative Retrieval For Large-Scale Personalized E-commerce + Search ICDM 2024 + + +
+ Leveraging generative retrieval (GR) techniques to enhance search systems is +an emerging methodology that has shown promising results in recent years. In +GR, a text-to-text model maps string queries directly to relevant document +identifiers (docIDs), dramatically simplifying the retrieval process. However, +when applying most GR models in large-scale E-commerce for personalized item +search, we must face two key problems in encoding and decoding. (1) Existing +docID generation methods ignore the encoding of efficiency information, which +is critical in E-commerce. (2) The positional information is important in +decoding docIDs, while prior studies have not adequately discriminated the +significance of positional information or well exploited the inherent +interrelation among these positions. To overcome these problems, we introduce +an efficient Hierarchical encoding-decoding Generative retrieval method +(Hi-Gen) for large-scale personalized E-commerce search systems. Specifically, +we first design a representation learning model using metric learning to learn +discriminative feature representations of items to capture semantic relevance +and efficiency information. Then, we propose a category-guided hierarchical +clustering scheme that makes full use of the semantic and efficiency +information of items to facilitate docID generation. Finally, we design a +position-aware loss to discriminate the importance of positions and mine the +inherent interrelation between different tokens at the same position. This loss +boosts the performance of the language model used in the decoding stage. +Besides, we propose two variants of Hi-Gen (Hi-Gen-I2I and Hi-Gen-Cluster) to +support online real-time large-scale recall in the online serving process. +Hi-Gen gets 3.30% and 4.62% improvements over SOTA for Recall@1 on the public +and industry datasets, respectively. + +
+
+ comment: Accepted by ICDM 2024 +
+
+
+
+
+ + ♻ ☆ MedPromptExtract (Medical Data Extraction Tool): Anonymization and + Hi-fidelity Automated data extraction using NLP and prompt engineering + + +
+ Introduction: The labour-intensive nature of data extraction from sources +like discharge summaries (DS) poses significant obstacles to the digitisation +of medical records, particularly for low- and middle-income countries (LMICs). +In this paper, we present MedPromptExtract, a completely automated method to +efficiently extract data from DS while maintaining confidentiality. Methods: +The source of data was Discharge Summaries (DS) from Kokilaben Dhirubhai Ambani +Hospital (KDAH) of patients having Acute Kidney Injury (AKI). A pre-existing +tool, EIGEN, which leverages semi-supervised learning techniques for +high-fidelity information extraction, was used to anonymize the DS, and Natural +Language Processing (NLP) was used to extract data from regular fields. We used +prompt engineering and a Large Language Model (LLM) to extract custom clinical +information from free-flowing text describing the patient's stay in the +hospital. Twelve features associated with the occurrence of AKI were extracted. +The LLM responses were validated against clinicians' annotations. Results: The +MedPromptExtract tool first subjected the DS to the anonymization pipeline, +which took three seconds per summary. Successful anonymization was verified by +clinicians; thereafter, the NLP pipeline extracted structured text from the +anonymized PDFs at the rate of 0.2 seconds per summary with 100% accuracy. +Finally, the DS were analysed by the LLM pipeline using Gemini Pro for the +twelve features. Accuracy metrics were calculated by comparing model responses +to clinicians' annotations, with seven features achieving AUCs above 0.9, +indicating high fidelity of the extraction process. Conclusion: +MedPromptExtract serves as an automated, adaptable tool for efficient data +extraction from medical records with a dynamic user interface. Keywords: +Digitizing Medical Records, Automated Anonymisation, Information Retrieval, +Large Language Models, Prompt Engineering + +
+
+
+
+
+ + ♻ ☆ FiNER-ORD: Financial Named Entity Recognition Open Research Dataset + + +
+ Over the last two decades, the development of the CoNLL-2003 named entity +recognition (NER) dataset has helped enhance the capabilities of deep learning +and natural language processing (NLP). The finance domain, characterized by its +unique semantic and lexical variations for the same entities, presents specific +challenges to the NER task; thus, a domain-specific customized dataset is +crucial for advancing research in this field. In our work, we develop the first +high-quality English Financial NER Open Research Dataset (FiNER-ORD). We +benchmark multiple pre-trained language models (PLMs) and large-language models +(LLMs) on FiNER-ORD. We believe our proposed FiNER-ORD dataset will open future +opportunities to use FiNER-ORD as a benchmark for financial domain-specific NER +and NLP tasks. Our dataset, models, and code are publicly available on GitHub +and Hugging Face under CC BY-NC 4.0 license. + +
+
+
+
+
+ + ♻ ☆ GraphEx: A Graph-based Extraction Method for Advertiser Keyphrase + Recommendation + + +
+ Online sellers and advertisers are recommended keyphrases for their listed +products, which they bid on to enhance their sales. One popular paradigm that +generates such recommendations is Extreme Multi-Label Classification (XMC), +which involves tagging/mapping keyphrases to items. We outline the limitations +of using traditional item-query based tagging or mapping techniques for +keyphrase recommendations on E-Commerce platforms. We introduce GraphEx, an +innovative graph-based approach that recommends keyphrases to sellers using +extraction of token permutations from item titles. Additionally, we demonstrate +that relying on traditional metrics such as precision/recall can be misleading +in practical applications, thereby necessitating a combination of metrics to +evaluate performance in real-world scenarios. These metrics are designed to +assess the relevance of keyphrases to items and the potential for buyer +outreach. GraphEx outperforms production models at eBay, achieving the +objectives mentioned above. It supports near real-time inferencing in +resource-constrained production environments and scales effectively for +billions of items. + +
+
+
+
+
+
+
+
+ + Machine Learning 104 + +
+
+
+ + ☆ Accelerating Training with Neuron Interaction and Nowcasting Networks + + +
+ Neural network training can be accelerated when a learnable update rule is +used in lieu of classic adaptive optimizers (e.g. Adam). However, learnable +update rules can be costly and unstable to train and use. A simpler recently +proposed approach to accelerate training is to use Adam for most of the +optimization steps and periodically, only every few steps, nowcast (predict +future) parameters. We improve this approach by Neuron interaction and +Nowcasting (NiNo) networks. NiNo leverages neuron connectivity and graph neural +networks to more accurately nowcast parameters by learning in a supervised way +from a set of training trajectories over multiple tasks. We show that in some +networks, such as Transformers, neuron connectivity is non-trivial. By +accurately modeling neuron connectivity, we allow NiNo to accelerate Adam +training by up to 50% in vision and language tasks. + +
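+
+ The schedule being improved is easy to state in code. The hedged sketch below
+ alternates plain Adam steps with a periodic parameter nowcast, using naive
+ linear extrapolation as a stand-in for the learned NiNo graph network (which
+ is the paper's actual contribution), on a made-up regression task:
+
+     import torch
+
+     model = torch.nn.Linear(10, 1)
+     opt = torch.optim.Adam(model.parameters(), lr=1e-3)
+     k, snapshots = 100, []
+
+     for step in range(1000):
+         x = torch.randn(32, 10)
+         loss = ((model(x) - x.sum(dim=1, keepdim=True)) ** 2).mean()
+         opt.zero_grad()
+         loss.backward()
+         opt.step()
+
+         if (step + 1) % k == 0:
+             snapshots.append([p.detach().clone() for p in model.parameters()])
+             if len(snapshots) >= 2:
+                 # Nowcast: jump ahead along the recent parameter trajectory.
+                 # NiNo replaces this crude extrapolation with a graph network
+                 # that models neuron connectivity; the periodic schedule is the same.
+                 with torch.no_grad():
+                     for p, prev, cur in zip(model.parameters(),
+                                             snapshots[-2], snapshots[-1]):
+                         p.copy_(cur + (cur - prev))
+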
+
+ comment: code https://github.com/SamsungSAILMontreal/nino +
+
+
+
+
+ + ☆ Theory, Analysis, and Best Practices for Sigmoid Self-Attention + + +
+ Attention is a key part of the transformer architecture. It is a +sequence-to-sequence mapping that transforms each sequence element into a +weighted sum of values. The weights are typically obtained as the softmax of +dot products between keys and queries. Recent work has explored alternatives to +softmax attention in transformers, such as ReLU and sigmoid activations. In +this work, we revisit sigmoid attention and conduct an in-depth theoretical and +empirical analysis. Theoretically, we prove that transformers with sigmoid +attention are universal function approximators and benefit from improved +regularity compared to softmax attention. Through detailed empirical analysis, +we identify stabilization of large initial attention norms during the early +stages of training as a crucial factor for the successful training of models +with sigmoid attention, outperforming prior attempts. We also introduce +FLASHSIGMOID, a hardware-aware and memory-efficient implementation of sigmoid +attention yielding a 17% inference kernel speed-up over FLASHATTENTION2 on H100 +GPUs. Experiments across language, vision, and speech show that properly +normalized sigmoid attention matches the strong performance of softmax +attention on a wide range of domains and scales, which previous attempts at +sigmoid attention were unable to fully achieve. Our work unifies prior art and +establishes best practices for sigmoid attention as a drop-in softmax +replacement in transformers. + +
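+
+ The core change relative to softmax attention is small enough to state
+ directly. The hedged sketch below applies an elementwise sigmoid to the scaled
+ scores with a constant -log(n) bias (one stabilisation choice tied to sequence
+ length); it is not the FLASHSIGMOID kernel itself:
+
+     import math
+     import torch
+
+     def sigmoid_attention(q, k, v):
+         """q, k, v: (batch, seq, dim). Sigmoid replaces the softmax row-normalisation."""
+         n, d = q.shape[-2], q.shape[-1]
+         scores = q @ k.transpose(-2, -1) / math.sqrt(d)
+         weights = torch.sigmoid(scores - math.log(n))   # no normalisation across keys
+         return weights @ v
+
+     q = k = v = torch.randn(2, 8, 16)
+     print(sigmoid_attention(q, k, v).shape)             # torch.Size([2, 8, 16])
+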
+
+
+
+
+ + ☆ VILA-U: a Unified Foundation Model Integrating Visual Understanding and + Generation + + +
+ VILA-U is a Unified foundation model that integrates Video, Image, Language +understanding and generation. Traditional visual language models (VLMs) use +separate modules for understanding and generating visual content, which can +lead to misalignment and increased complexity. In contrast, VILA-U employs a +single autoregressive next-token prediction framework for both tasks, +eliminating the need for additional components like diffusion models. This +approach not only simplifies the model but also achieves near state-of-the-art +performance in visual language understanding and generation. The success of +VILA-U is attributed to two main factors: a unified vision tower that aligns +discrete visual tokens with textual inputs during pretraining, enhancing visual +perception, and the finding that autoregressive image generation can achieve +quality similar to diffusion models when trained on a high-quality dataset. +This allows VILA-U to perform comparably to more complex models using a fully +token-based autoregressive framework. + +
+
+ comment: 11 pages, 7 figures, 8 tables +
+
+
+
+
+ + ☆ Hybrid Spiking Neural Networks for Low-Power Intra-Cortical + Brain-Machine Interfaces + + +
+ Intra-cortical brain-machine interfaces (iBMIs) have the potential to +dramatically improve the lives of people with paraplegia by restoring their +ability to perform daily activities. However, current iBMIs suffer from +scalability and mobility limitations due to bulky hardware and wiring. Wireless +iBMIs offer a solution but are constrained by a limited data rate. To overcome +this challenge, we are investigating hybrid spiking neural networks for +embedded neural decoding in wireless iBMIs. The networks consist of a temporal +convolution-based compression followed by recurrent processing and a final +interpolation back to the original sequence length. As recurrent units, we +explore gated recurrent units (GRUs), leaky integrate-and-fire (LIF) neurons, +and a combination of both - spiking GRUs (sGRUs) and analyze their differences +in terms of accuracy, footprint, and activation sparsity. To that end, we train +decoders on the "Nonhuman Primate Reaching with Multichannel Sensorimotor +Cortex Electrophysiology" dataset and evaluate it using the NeuroBench +framework, targeting both tracks of the IEEE BioCAS Grand Challenge on Neural +Decoding. Our approach achieves high accuracy in predicting velocities of +primate reaching movements from multichannel primary motor cortex recordings +while maintaining a low number of synaptic operations, surpassing the current +baseline models in the NeuroBench framework. This work highlights the potential +of hybrid neural networks to facilitate wireless iBMIs with high decoding +precision and a substantial increase in the number of monitored neurons, paving +the way toward more advanced neuroprosthetic technologies. + +
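+
+ For readers unfamiliar with the recurrent units being compared, a minimal
+ sketch of a leaky integrate-and-fire (LIF) layer is given below; the actual
+ decoder combines such units with GRUs and temporal convolutions, so this only
+ illustrates the spiking component:
+
+     import numpy as np
+
+     def lif_layer(inputs, beta=0.9, threshold=1.0):
+         """inputs: (timesteps, neurons). Returns binary spikes of the same shape."""
+         v = np.zeros(inputs.shape[1])                  # membrane potentials
+         spikes = np.zeros_like(inputs)
+         for t, x in enumerate(inputs):
+             v = beta * v + x                           # leaky integration of input current
+             fired = v >= threshold
+             spikes[t] = fired
+             v = np.where(fired, v - threshold, v)      # soft reset after a spike
+         return spikes
+
+     x = np.random.rand(20, 4) * 0.4
+     print(lif_layer(x).sum(axis=0))                    # spike counts per neuron stay sparse
+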
+
+ comment: This work has been accepted at the 2024 IEEE Biomedical Circuits and + Systems Conference +
+
+
+
+
+ + ☆ RLPF: Reinforcement Learning from Prediction Feedback for User + Summarization with LLMs + + +
+ LLM-powered personalization agent systems employ Large Language Models (LLMs) +to predict users' behavior from their past activities. However, their +effectiveness often hinges on the ability to effectively leverage extensive, +long user historical data, which is challenging due to the inherent noise and +length of such data. Existing pretrained LLMs may generate summaries that are +concise but lack the necessary context for downstream tasks, hindering their +utility in personalization systems. To address these challenges, we introduce +Reinforcement Learning from Prediction Feedback (RLPF). RLPF fine-tunes LLMs to +generate concise, human-readable user summaries that are optimized for +downstream task performance. By maximizing the usefulness of the generated +summaries, RLPF effectively distills extensive user history data while +preserving essential information for downstream tasks. Our empirical evaluation +demonstrates significant improvements in both extrinsic downstream task utility +and intrinsic summary quality, surpassing baseline methods by up to 22% on +downstream task performance and achieving an up to 84.59% win rate on +Factuality, Abstractiveness, and Readability. RLPF also achieves a remarkable +74% reduction in context length while improving performance on 16 out of 19 +unseen tasks and/or datasets, showcasing its generalizability. This approach +offers a promising solution for enhancing LLM personalization by effectively +transforming long, noisy user histories into informative and human-readable +representations. + +
+
+
+
+
+ + ☆ Approximating Metric Magnitude of Point Sets + + +
+ Metric magnitude is a measure of the "size" of point clouds with many +desirable geometric properties. It has been adapted to various mathematical +contexts, and recent work suggests that it can enhance machine learning and +optimization algorithms. However, its usability is limited by the computational +cost when the dataset is large or when the computation must be carried out +repeatedly (e.g. in model training). In this paper, we study the magnitude +computation problem, and show efficient ways of approximating it. We show that +it can be cast as a convex optimization problem, but not as a submodular +optimization. The paper describes two new algorithms - an iterative +approximation algorithm that converges fast and is accurate, and a subset +selection method that makes the computation even faster. It has previously been +proposed that the magnitude of model sequences generated during stochastic +gradient descent is correlated with the generalization gap. Extending this +result using our more scalable algorithms shows that longer sequences in fact +bear higher correlations. We also describe new applications of magnitude in +machine learning - as an effective regularizer for neural network training, and +as a novel clustering criterion. + +
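+
+ For context, the exact quantity being approximated comes from inverting a
+ similarity matrix built from pairwise distances; the sketch below computes it
+ directly, which is the cubic-cost step that the paper's iterative and
+ subset-selection algorithms are designed to avoid:
+
+     import numpy as np
+     from scipy.spatial.distance import cdist
+
+     def magnitude(points, scale=1.0):
+         """Exact magnitude of a point cloud: 1^T Z^{-1} 1 with Z_ij = exp(-scale * d_ij)."""
+         Z = np.exp(-scale * cdist(points, points))       # similarity matrix
+         w = np.linalg.solve(Z, np.ones(len(points)))     # weighting vector (avoids forming Z^{-1})
+         return w.sum()                                   # O(n^3) in the naive form
+
+     pts = np.random.default_rng(0).normal(size=(100, 3))
+     print(magnitude(pts))
+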
+
+
+
+
+ + ☆ Exploiting the Data Gap: Utilizing Non-ignorable Missingness to + Manipulate Model Learning + + +
+ Missing data is commonly encountered in practice, and when the missingness is +non-ignorable, effective remediation depends on knowledge of the missingness +mechanism. Learning the underlying missingness mechanism from the data is not +possible in general, so adversaries can exploit this fact by maliciously +engineering non-ignorable missingness mechanisms. Such Adversarial Missingness +(AM) attacks have only recently been motivated and introduced, and then +successfully tailored to mislead causal structure learning algorithms into +hiding specific cause-and-effect relationships. However, existing AM attacks +assume the modeler (victim) uses full-information maximum likelihood methods to +handle the missing data, and are of limited applicability when the modeler uses +different remediation strategies. In this work we focus on associational +learning in the context of AM attacks. We consider (i) complete case analysis, +(ii) mean imputation, and (iii) regression-based imputation as alternative +strategies used by the modeler. Instead of combinatorially searching for +missing entries, we propose a novel probabilistic approximation by deriving the +asymptotic forms of these methods used for handling the missing entries. We +then formulate the learning of the adversarial missingness mechanism as a +bi-level optimization problem. Experiments on generalized linear models show +that AM attacks can be used to change the p-values of features from significant +to insignificant in real datasets, such as the California-housing dataset, +while using relatively moderate amounts of missingness (<20%). Additionally, we +assess the robustness of our attacks against defense strategies based on data +valuation. + +
+
+
+
+
+ + ☆ Quantum Kernel Methods under Scrutiny: A Benchmarking Study + + +
+ Since the entry of kernel theory into the field of quantum machine learning,
+quantum kernel methods (QKMs) have gained increasing attention with regard to
+both probing promising applications and delivering intriguing research
+insights. Two common approaches for computing the underlying Gram matrix have
+emerged: fidelity quantum kernels (FQKs) and projected quantum kernels (PQKs).
+Benchmarking these methods is crucial to gain robust insights and to understand
+their practical utility. In this work, we present a comprehensive large-scale
+study examining QKMs based on FQKs and PQKs across a manifold of design
+choices. Our investigation encompasses both classification and regression tasks
+for five dataset families and 64 datasets, systematically comparing the use of
+FQKs and PQKs in quantum support vector machines and kernel ridge regression. This
+resulted in over 20,000 models that were trained and optimized using a
+state-of-the-art hyperparameter search to ensure robust and comprehensive
+insights. We delve into the influence of hyperparameters on model performance
+scores and support our findings through rigorous correlation analyses. As part
+of this analysis, we also closely inspect two data encoding strategies. Moreover, we provide an
+in-depth analysis addressing the design freedom of PQKs and explore the
+underlying principles responsible for learning. Our goal is not to identify the
+best-performing model for a specific task but to uncover the mechanisms that
+lead to effective QKMs and reveal universal patterns.
+
+
+ comment: 19 pages main text including 12 figures, appendix 25 pages with 31 + figures +
+
+
+
+
+ + ☆ Gaussian-Mixture-Model Q-Functions for Reinforcement Learning by + Riemannian Optimization + + +
+ This paper establishes a novel role for Gaussian-mixture models (GMMs) as
+functional approximators of Q-function losses in reinforcement learning (RL).
+Unlike the existing RL literature, where GMMs play their typical role as
+estimates of probability density functions, here GMMs approximate Q-function
+losses. The new Q-function approximators, coined GMM-QFs, are incorporated into
+Bellman residuals to promote a Riemannian-optimization task as a novel
+policy-evaluation step in standard policy-iteration schemes. The paper
+demonstrates how the hyperparameters (means and covariance matrices) of the
+Gaussian kernels are learned from the data, thus opening the door of RL to the
+powerful toolbox of Riemannian optimization. Numerical tests show that with no
+use of training data, the proposed design outperforms state-of-the-art methods,
+even deep Q-networks which use training data, on benchmark RL tasks.
+
+
+
+
+
+ + ☆ Evaluating Fairness in Transaction Fraud Models: Fairness Metrics, Bias + Audits, and Challenges + + +
+ Ensuring fairness in transaction fraud detection models is vital due to the +potential harms and legal implications of biased decision-making. Despite +extensive research on algorithmic fairness, there is a notable gap in the study +of bias in fraud detection models, mainly due to the field's unique challenges. +These challenges include the need for fairness metrics that account for fraud +data's imbalanced nature and the tradeoff between fraud protection and service +quality. To address this gap, we present a comprehensive fairness evaluation of +transaction fraud models using public synthetic datasets, marking the first +algorithmic bias audit in this domain. Our findings reveal three critical +insights: (1) Certain fairness metrics expose significant bias only after +normalization, highlighting the impact of class imbalance. (2) Bias is +significant in both service quality-related parity metrics and fraud +protection-related parity metrics. (3) The fairness through unawareness +approach, which involved removing sensitive attributes such as gender, does not +improve bias mitigation within these datasets, likely due to the presence of +correlated proxies. We also discuss socio-technical fairness-related challenges +in transaction fraud models. These insights underscore the need for a nuanced +approach to fairness in fraud detection, balancing protection and service +quality, and moving beyond simple bias mitigation strategies. Future work must +focus on refining fairness metrics and developing methods tailored to the +unique complexities of the transaction fraud domain. + +
+
+
+
+
+ + ☆ Provable Hyperparameter Tuning for Structured Pfaffian Settings + + +
+ Data-driven algorithm design automatically adapts algorithms to specific
+application domains, achieving better performance. In the context of
+parameterized algorithms, this approach involves tuning the algorithm
+parameters using problem instances drawn from the problem distribution of the
+target application domain. While empirical evidence supports the effectiveness
+of data-driven algorithm design, providing theoretical guarantees for several
+parameterized families remains challenging. This is due to the intricate
+behaviors of their corresponding utility functions, which typically admit
+piece-wise and discontinuous structures. In this work, we present refined
+frameworks for providing learning guarantees for parameterized data-driven
+algorithm design problems in both distributional and online learning settings.
+For the distributional learning setting, we introduce the Pfaffian GJ
+framework, an extension of the classical GJ framework, capable of providing
+learning guarantees for function classes for which the computation involves
+Pfaffian functions. Unlike the GJ framework, which is limited to function
+classes with computation characterized by rational functions, our proposed
+framework can deal with function classes involving Pfaffian functions, which
+are much more general and widely applicable. We then show that for many
+parameterized algorithms of interest, their utility function possesses a
+refined piece-wise structure, which automatically translates to learning
+guarantees using our proposed framework. For the online learning setting, we
+provide a new tool for verifying the dispersion property of a sequence of loss
+functions. This sufficient condition allows no-regret learning for sequences of
+piece-wise structured loss functions where the piece-wise structure involves
+Pfaffian transition boundaries.
+
+
+
+
+
+ + ☆ Leveraging Machine Learning for Official Statistics: A Statistical + Manifesto + + +
+ It is important for official statistics production to apply ML with +statistical rigor, as it presents both opportunities and challenges. Although +machine learning has enjoyed rapid technological advances in recent years, its +application does not possess the methodological robustness necessary to produce +high quality statistical results. In order to account for all sources of error +in machine learning models, the Total Machine Learning Error (TMLE) is +presented as a framework analogous to the Total Survey Error Model used in +survey methodology. As a means of ensuring that ML models are both internally +valid as well as externally valid, the TMLE model addresses issues such as +representativeness and measurement errors. There are several case studies +presented, illustrating the importance of applying more rigor to the +application of machine learning in official statistics. + +
+
+ comment: 29 pages, 4 figures, 1 table. To appear in the proceedings of the + conference on Foundations and Advances of Machine Learning in Official + Statistics, which was held in Wiesbaden, from 3rd to 5th April, 2024 +
+
+
+
+
+ + ☆ A naive aggregation algorithm for improving generalization in a class of + learning problems + + +
+ In this brief paper, we present a naive aggregation algorithm for a typical
+learning problem in the expert advice setting, in which the task of improving
+generalization, i.e., model validation, is embedded in the learning process as
+a sequential decision-making problem. In particular, we consider a class of
+learning problems involving point estimation for modeling high-dimensional nonlinear
+functions, where a group of experts update their parameter estimates using the
+discrete-time version of gradient systems, with a small additive noise term,
+guided by the corresponding subsample datasets obtained from the original
+dataset. Here, our main objective is to provide conditions under which such an
+algorithm will sequentially determine a set of mixing distribution strategies
+used for aggregating the experts' estimates that ultimately lead to an
+optimal parameter estimate, i.e., a consensus solution for all experts,
+which is better than any individual expert's estimate in terms of improved
+generalization or learning performance. Finally, as part of this work, we
+present some numerical results for a typical nonlinear regression
+problem.
+
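+ A toy sketch (an illustration of the general idea, not the paper's algorithm or guarantees) for linear regression: each expert runs noisy gradient descent on its own subsample, a mixing distribution over experts is updated from a held-out validation loss via exponential weighting, and the final estimate is the weighted consensus.
+import numpy as np
+
+def aggregate_experts(X, y, n_experts=10, epochs=50, lr=0.01, noise=1e-3, eta=5.0, seed=0):
+    rng = np.random.default_rng(seed)
+    n, d = X.shape
+    val = rng.choice(n, size=n // 5, replace=False)          # validation subset
+    thetas = [np.zeros(d) for _ in range(n_experts)]
+    weights = np.ones(n_experts) / n_experts                 # mixing distribution
+    for _ in range(epochs):
+        losses = np.empty(n_experts)
+        for k in range(n_experts):
+            idx = rng.choice(n, size=n // 2, replace=False)  # expert's subsample
+            grad = X[idx].T @ (X[idx] @ thetas[k] - y[idx]) / len(idx)
+            thetas[k] -= lr * grad + noise * rng.normal(size=d)  # noisy gradient step
+            losses[k] = np.mean((X[val] @ thetas[k] - y[val]) ** 2)
+        weights *= np.exp(-eta * losses)                     # exponential-weights update
+        weights /= weights.sum()
+    return sum(w * t for w, t in zip(weights, thetas))       # consensus estimate
+
+rng = np.random.default_rng(1)
+X = rng.normal(size=(500, 5)); theta_true = rng.normal(size=5)
+y = X @ theta_true + 0.1 * rng.normal(size=500)
+print(np.linalg.norm(aggregate_experts(X, y) - theta_true))
+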
+
+ comment: Brief paper, with 7 pages, 1 figure +
+
+
+
+
+ + ☆ AGR: Age Group fairness Reward for Bias Mitigation in LLMs + + +
+ LLMs can exhibit age biases, resulting in unequal treatment of individuals
+across age groups. While much research has addressed racial and gender biases,
+age bias remains little explored. The scarcity of instruction-tuning and
+preference datasets for age bias hampers its detection and measurement, and
+existing fine-tuning methods seldom address age-related fairness. In this
+paper, we construct age bias preference datasets and instruction-tuning
+datasets for RLHF. We introduce AGR, an age fairness reward to reduce
+differences in the response quality of LLMs across different age groups.
+Extensive experiments demonstrate that this reward significantly improves
+response accuracy and reduces performance disparities across age groups. Our
+source code and datasets are available at the anonymous
+\href{https://anonymous.4open.science/r/FairRLHF-D445/readme.md}{link}.
+
+
+ comment: The first two authors contributed equally to this work. Corresponding + to Zhiqiang Wang. ACKNOWLEDGMENT: we would like to thank the computing + resources support from the State Key Laboratory of New Computer Software + Technologies at Nanjing University +
+
+
+
+
+ + ☆ A high-accuracy multi-model mixing retrosynthetic method + + +
+ The field of computer-aided synthesis planning (CASP) has seen rapid +advancements in recent years, achieving significant progress across various +algorithmic benchmarks. However, chemists often encounter numerous infeasible +reactions when using CASP in practice. This article delves into common errors +associated with CASP and introduces a product prediction model aimed at +enhancing the accuracy of single-step models. While the product prediction +model reduces the number of single-step reactions, it integrates multiple +single-step models to maintain the overall reaction count and increase reaction +diversity. Based on manual analysis and large-scale testing, the product +prediction model, combined with the multi-model ensemble approach, has been +proven to offer higher feasibility and greater diversity. + +
+
+
+
+
+ + ☆ Amortized Bayesian Workflow (Extended Abstract) + + +
+ Bayesian inference often faces a trade-off between computational speed and +sampling accuracy. We propose an adaptive workflow that integrates rapid +amortized inference with gold-standard MCMC techniques to achieve both speed +and accuracy when performing inference on many observed datasets. Our approach +uses principled diagnostics to guide the choice of inference method for each +dataset, moving along the Pareto front from fast amortized sampling to slower +but guaranteed-accurate MCMC when necessary. By reusing computations across +steps, our workflow creates synergies between amortized and MCMC-based +inference. We demonstrate the effectiveness of this integrated approach on a +generalized extreme value task with 1000 observed data sets, showing 90x time +efficiency gains while maintaining high posterior quality. + +
+
+ comment: Extended Abstract +
+
+
+
+
+ + ☆ Active learning for regression in engineering populations: A + risk-informed approach + + +
+ Regression is a fundamental prediction task common in data-centric
+engineering applications that involves learning mappings between continuous
+variables. In many engineering applications (e.g.\ structural health
+monitoring), feature-label pairs used to learn such mappings are of limited
+availability, which hinders the effectiveness of traditional supervised machine
+learning approaches. The current paper proposes a methodology for overcoming
+the issue of data scarcity by combining active learning with hierarchical
+Bayesian modelling.
+ Active learning is an approach for preferentially acquiring feature-label
+pairs in a resource-efficient manner. In particular, the current work adopts a
+risk-informed approach that leverages contextual information associated with
+regression-based engineering decision-making tasks (e.g.\ inspection and
+maintenance). Hierarchical Bayesian modelling allows multiple related regression
+tasks to be learned over a population, capturing local and global effects. The
+information sharing facilitated by this modelling approach means that
+information acquired for one engineering system can improve predictive
+performance across the population.
+ The proposed methodology is demonstrated using an experimental case study.
+Specifically, multiple regressions are performed over a population of machining
+tools, where the quantity of interest is the surface roughness of the
+workpieces. An inspection and maintenance decision process is defined using
+these regression tasks, which is in turn used to construct the active-learning
+algorithm. The novel methodology proposed is benchmarked against an uninformed
+approach to label acquisition and independent modelling of the regression
+tasks. It is shown that the proposed approach has superior performance in terms
+of expected cost -- maintaining predictive performance while reducing the
+number of inspections required.
+
+
+ comment: 19 pages, 12 figures, 3 tables, submitted to Data-Centric Engineering +
+
+
+
+
+ + ☆ Faster Sampling from Log-Concave Densities over Polytopes via Efficient + Linear Solvers ICLR 2024 + + +
+ We consider the problem of sampling from a log-concave distribution
+$\pi(\theta) \propto e^{-f(\theta)}$ constrained to a polytope $K:=\{\theta \in
+\mathbb{R}^d: A\theta \leq b\}$, where $A\in \mathbb{R}^{m\times d}$ and $b \in
+\mathbb{R}^m$. The fastest-known algorithm \cite{mangoubi2022faster} for the
+setting when $f$ is $O(1)$-Lipschitz or $O(1)$-smooth runs in roughly $O(md
+\times md^{\omega -1})$ arithmetic operations, where the $md^{\omega -1}$ term
+arises because each Markov chain step requires computing a matrix inversion and
+determinant (here $\omega \approx 2.37$ is the matrix multiplication constant).
+We present a nearly-optimal implementation of this Markov chain with per-step
+complexity which is roughly the number of non-zero entries of $A$ while the
+number of Markov chain steps remains the same. The key technical ingredients
+are 1) to show that the matrices that arise in this Dikin walk change slowly,
+2) to deploy efficient linear solvers that can leverage this slow change to
+speed up matrix inversion by using information computed in previous steps, and
+3) to speed up the computation of the determinantal term in the Metropolis
+filter step via a randomized Taylor series-based estimator.
+
+
+ comment: The conference version of this paper appears in ICLR 2024 +
+
+
+
+
+ + ☆ Enhancing Uncertainty Quantification in Drug Discovery with Censored + Regression Labels + + +
+ In the early stages of drug discovery, decisions regarding which experiments +to pursue can be influenced by computational models. These decisions are +critical due to the time-consuming and expensive nature of the experiments. +Therefore, it is becoming essential to accurately quantify the uncertainty in +machine learning predictions, such that resources can be used optimally and +trust in the models improves. While computational methods for drug discovery +often suffer from limited data and sparse experimental observations, additional +information can exist in the form of censored labels that provide thresholds +rather than precise values of observations. However, the standard approaches +that quantify uncertainty in machine learning cannot fully utilize censored +labels. In this work, we adapt ensemble-based, Bayesian, and Gaussian models +with tools to learn from censored labels by using the Tobit model from survival +analysis. Our results demonstrate that despite the partial information +available in censored labels, they are essential to accurately and reliably +model the real pharmaceutical setting. + +
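+ For concreteness, a small sketch (assuming a Gaussian noise model; not the paper's exact formulation) of the Tobit-style negative log-likelihood that lets a regressor learn from right-censored labels: exact observations contribute the usual Gaussian log-density, while censored ones contribute the log probability of exceeding the reported threshold.
+import numpy as np
+from scipy.stats import norm
+
+def tobit_nll(mu, sigma, y, right_censored):
+    # mu: model predictions; y: observed values or censoring thresholds;
+    # right_censored[i] = True means we only know the true value exceeds y[i].
+    exact = ~right_censored
+    ll = norm.logpdf(y[exact], loc=mu[exact], scale=sigma).sum()
+    ll += norm.logsf(y[right_censored], loc=mu[right_censored], scale=sigma).sum()
+    return -ll
+
+mu = np.array([1.0, 2.0, 3.0])
+y = np.array([1.2, 2.5, 2.5])                  # third label is a threshold, not a value
+mask = np.array([False, False, True])
+print(tobit_nll(mu, sigma=0.5, y=y, right_censored=mask))
+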
+
+
+
+
+ + ☆ A Unified Approach to Inferring Chemical Compounds with the Desired + Aqueous Solubility + + +
+ Aqueous solubility (AS) is a key physicochemical property that plays a crucial
+role in drug discovery and material design. We report a novel unified approach
+to predict and infer chemical compounds with the desired AS based on simple
+deterministic graph-theoretic descriptors, multiple linear regression (MLR) and
+mixed integer linear programming (MILP). Selected descriptors based on a
+forward stepwise procedure enabled the simplest regression model, MLR, to
+achieve notably good prediction accuracy compared to existing
+approaches, with accuracy in the range [0.7191, 0.9377] across 29 diverse
+datasets. By simulating these descriptors and learning models as MILPs, we
+inferred mathematically exact and optimal compounds with the desired AS,
+prescribed structures, and up to 50 non-hydrogen atoms within a reasonable time
+range of [6, 1204] seconds. These findings indicate a strong correlation between
+the simple graph-theoretic descriptors and the AS of compounds, potentially
+leading to a deeper understanding of their AS without relying on widely used
+complicated chemical descriptors and complex machine learning models that are
+computationally expensive, and therefore difficult to use for inference. An
+implementation of the proposed approach is available at
+https://github.com/ku-dml/mol-infer/tree/master/AqSol.
+
+
+
+
+
+ + ☆ CoxKAN: Kolmogorov-Arnold Networks for Interpretable, High-Performance + Survival Analysis + + +
+ Survival analysis is a branch of statistics used for modeling the time until
+a specific event occurs and is widely used in medicine, engineering, finance,
+and many other fields. When choosing survival models, there is typically a
+trade-off between performance and interpretability, where the highest
+performance is achieved by black-box models based on deep learning. This is a
+major problem in fields such as medicine where practitioners are reluctant to
+blindly trust black-box models to make important patient decisions.
+Kolmogorov-Arnold Networks (KANs) were recently proposed as an interpretable
+and accurate alternative to multi-layer perceptrons (MLPs). We introduce
+CoxKAN, a Cox proportional hazards Kolmogorov-Arnold Network for interpretable,
+high-performance survival analysis. We evaluate the proposed CoxKAN on 4
+synthetic datasets and 9 real medical datasets. The synthetic experiments
+demonstrate that CoxKAN accurately recovers interpretable symbolic formulae for
+the hazard function, and effectively performs automatic feature selection.
+Evaluation on the 9 real datasets shows that CoxKAN consistently outperforms the
+Cox proportional hazards model and achieves performance that is superior or
+comparable to that of tuned MLPs. Furthermore, we find that CoxKAN identifies
+complex interactions between predictor variables that would be extremely
+difficult to recognise using existing survival methods, and automatically finds
+symbolic formulae which uncover the precise effect of important biomarkers on
+patient risk.
+
+
+
+
+
+ + ☆ AttentionX: Exploiting Consensus Discrepancy In Attention from A + Distributed Optimization Perspective + + +
+ In this paper, we extend the standard Attention in the transformer by exploiting
+the consensus discrepancy from a distributed optimization perspective, referred
+to as AttentionX. It is noted that the primal-dual method of multipliers (PDMM)
+\cite{Zhang16PDMM}, like the popular ADMM algorithm \cite{Boyd11ADMM}, is
+designed to iteratively solve a broad class of
+distributed optimization problems over a peer-to-peer (P2P) network, where
+neighbouring nodes gradually reach consensus as specified by predefined linear
+edge-constraints in the optimization process. In particular, at each iteration
+of PDMM, each node in a network first performs information-gathering from
+neighbours and then performs local information-fusion. From a high-level point
+of view, the $KQ$-softmax-based weighted summation of $V$-representations in
+Attention corresponds to information-gathering from neighbours, while the
+feature-processing via the feed-forward network (FFN) in transformer
+corresponds to local information fusion. PDMM exploits the Lagrangian
+multipliers to capture the historical consensus discrepancy in the form of
+residual errors of the linear edge-constraints, which plays a crucial role in
+the algorithm's convergence. Inspired by PDMM, we propose AttentionX to
+incorporate the consensus discrepancy in the output update-expression of the
+standard Attention. The consensus discrepancy in AttentionX refers to the
+difference between the weighted summation of $V$-representations and the scaled
+$V$-representations themselves. Experiments on ViT and nanoGPT show promising
+performance.
+
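+ A minimal sketch (the exact update rule and scaling used in the paper may differ) of adding such a consensus-discrepancy term to scaled dot-product attention, where the discrepancy is the gap between the gathered representation A V and a scaled copy of V itself.
+import torch
+import torch.nn.functional as F
+
+def attention_x(Q, K, V, alpha=0.5, gamma=1.0):
+    # Standard attention plus a "consensus discrepancy" term; how the discrepancy
+    # re-enters the output (here: additive with weight gamma) is an assumption.
+    d = Q.shape[-1]
+    A = F.softmax(Q @ K.transpose(-2, -1) / d ** 0.5, dim=-1)
+    gathered = A @ V                        # information-gathering from "neighbours"
+    discrepancy = gathered - alpha * V      # gap to the node's own scaled representation
+    return gathered + gamma * discrepancy
+
+Q = K = V = torch.randn(2, 16, 64)          # (batch, tokens, d)
+out = attention_x(Q, K, V)
+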
+
+
+
+
+ + ☆ Hermes: Memory-Efficient Pipeline Inference for Large Models on Edge + Devices + + +
+ The application of Transformer-based large models has achieved numerous
+successes in recent years. However, the exponential growth in the parameters of
+large models introduces formidable memory challenges for edge deployment. Prior
+works to address this challenge mainly focus on optimizing the model structure
+and adopting memory swapping methods. However, the former reduces the inference
+accuracy, and the latter raises the inference latency. This paper introduces
+PIPELOAD, a novel memory-efficient pipeline execution mechanism. It reduces
+memory usage by incorporating dynamic memory management and minimizes inference
+latency by employing parallel model loading. Based on the PIPELOAD mechanism, we
+present Hermes, a framework optimized for large model inference on edge
+devices. We evaluate Hermes on Transformer-based models of different sizes. Our
+experiments illustrate that Hermes achieves up to a 4.24x increase in inference
+speed and 86.7% lower memory consumption than the state-of-the-art pipeline
+mechanism for BERT and ViT models, and a 2.58x increase in inference speed and 90.3%
+lower memory consumption for GPT-style models.
+
+
+ comment: Accepted by the 42nd IEEE International Conference on Computer Design + (ICCD 2024) +
+
+
+
+
+ + ☆ WarpAdam: A new Adam optimizer based on Meta-Learning approach + + +
+ Optimal selection of optimization algorithms is crucial for training deep
+learning models. The Adam optimizer has gained significant attention due to its
+efficiency and wide applicability. However, to enhance the adaptability of
+optimizers across diverse datasets, we propose an innovative optimization
+strategy by integrating the 'warped gradient descent' concept from Meta Learning
+into the Adam optimizer. In the conventional Adam optimizer, gradients are
+utilized to compute estimates of gradient mean and variance, subsequently
+updating model parameters. Our approach introduces a learnable distortion
+matrix, denoted as P, which is employed for linearly transforming gradients.
+This transformation slightly adjusts gradients during each iteration, enabling
+the optimizer to better adapt to distinct dataset characteristics. By learning
+an appropriate distortion matrix P, our method aims to adaptively adjust
+gradient information across different data distributions, thereby enhancing
+optimization performance. Our research showcases the potential of this novel
+approach through theoretical insights and empirical evaluations. Experimental
+results across various tasks and datasets validate the superiority of our
+optimizer that integrates the 'warped gradient descent' concept in terms of
+adaptability. Furthermore, we explore effective strategies for training the
+adaptation matrix P and identify scenarios where this method can yield optimal
+results. In summary, this study introduces an innovative approach that merges
+the 'warped gradient descent' concept from Meta Learning with the Adam
+optimizer. By introducing a learnable distortion matrix P within the optimizer,
+we aim to enhance the model's generalization capability across diverse data
+distributions, thus opening up new possibilities in the field of deep learning
+optimization.
+
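+ A compact sketch (one reading of the idea, not the authors' code; how P itself is meta-trained is not shown) of an Adam-style update in which a learnable matrix P linearly warps the raw gradient before the moment estimates are formed.
+import torch
+
+def warped_adam_step(param, grad, P, m, v, t, lr=1e-3, b1=0.9, b2=0.999, eps=1e-8):
+    # P is a learnable (d x d) distortion matrix applied to the flat gradient.
+    g = P @ grad
+    m.mul_(b1).add_(g, alpha=1 - b1)           # first-moment estimate
+    v.mul_(b2).addcmul_(g, g, value=1 - b2)    # second-moment estimate
+    m_hat = m / (1 - b1 ** t)
+    v_hat = v / (1 - b2 ** t)
+    with torch.no_grad():
+        param.add_(-lr * m_hat / (v_hat.sqrt() + eps))
+
+d = 8
+param = torch.zeros(d, requires_grad=True)
+P = torch.eye(d)                               # would itself be learned (meta-objective not shown)
+m, v = torch.zeros(d), torch.zeros(d)
+warped_adam_step(param, torch.randn(d), P, m, v, t=1)
+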
+
+
+
+
+ + ☆ Unmasking Covert Intrusions: Detection of Fault-Masking Cyberattacks on + Differential Protection Systems + + +
+ Line Current Differential Relays (LCDRs) are high-speed relays progressively +used to protect critical transmission lines. However, LCDRs are vulnerable to +cyberattacks. Fault-Masking Attacks (FMAs) are stealthy cyberattacks performed +by manipulating the remote measurements of the targeted LCDR to disguise faults +on the protected line. Hence, they remain undetected by this LCDR. In this +paper, we propose a two-module framework to detect FMAs. The first module is a +Mismatch Index (MI) developed from the protected transmission line's equivalent +physical model. The MI is triggered only if there is a significant mismatch in +the LCDR's local and remote measurements while the LCDR itself is untriggered, +which indicates an FMA. After the MI is triggered, the second module, a neural +network-based classifier, promptly confirms that the triggering event is a +physical fault that lies on the line protected by the LCDR before declaring the +occurrence of an FMA. The proposed framework is tested using the IEEE 39-bus +benchmark system. Our simulation results confirm that the proposed framework +can accurately detect FMAs on LCDRs and is not affected by normal system +disturbances, variations, or measurement noise. Our experimental results using +OPAL-RT's real-time simulator confirm the proposed solution's real-time +performance capability. + +
+
+ comment: Accepted to IEEE Transactions on Systems, Man, and Cybernetics: + Systems. \c{opyright} 2024 IEEE +
+
+
+
+
+ + ☆ Calibration of Network Confidence for Unsupervised Domain Adaptation + Using Estimated Accuracy + + +
+ This study addresses the problem of calibrating network confidence while +adapting a model that was originally trained on a source domain to a target +domain using unlabeled samples from the target domain. The absence of labels +from the target domain makes it impossible to directly calibrate the adapted +network on the target domain. To tackle this challenge, we introduce a +calibration procedure that relies on estimating the network's accuracy on the +target domain. The network accuracy is first computed on the labeled source +data and then is modified to represent the actual accuracy of the model on the +target domain. The proposed algorithm calibrates the prediction confidence +directly in the target domain by minimizing the disparity between the estimated +accuracy and the computed confidence. The experimental results show that our +method significantly outperforms existing methods, which rely on importance +weighting, across several standard datasets. + +
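+ One way to read the calibration objective (a sketch under a simple temperature-scaling parameterization; the paper's procedure for estimating target-domain accuracy is not reproduced here): choose a temperature so that the average max-softmax confidence on unlabeled target data matches the estimated accuracy.
+import numpy as np
+from scipy.optimize import minimize_scalar
+
+def calibrate_temperature(logits, estimated_accuracy):
+    # Minimize the gap between mean confidence and the estimated target accuracy.
+    def gap(T):
+        z = logits / T
+        p = np.exp(z - z.max(axis=1, keepdims=True))
+        p /= p.sum(axis=1, keepdims=True)
+        return (p.max(axis=1).mean() - estimated_accuracy) ** 2
+    res = minimize_scalar(gap, bounds=(0.05, 20.0), method="bounded")
+    return res.x
+
+logits = np.random.default_rng(0).normal(size=(1000, 10)) * 3.0
+print(calibrate_temperature(logits, estimated_accuracy=0.62))
+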
+
+
+
+
+ + ☆ Advancing Multi-Organ Disease Care: A Hierarchical Multi-Agent + Reinforcement Learning Framework + + +
+ Multi-organ diseases present significant challenges due to their simultaneous
+impact on multiple organ systems, necessitating complex and adaptive treatment
+strategies. Despite recent advancements in AI-powered healthcare decision
+support systems, existing solutions are limited to individual organ systems.
+They often ignore the intricate dependencies between organ systems and thereby
+fail to provide holistic treatment recommendations that are useful in
+practice. We propose a novel hierarchical multi-agent reinforcement learning
+(HMARL) framework to address these challenges. This framework uses dedicated
+agents for each organ system and models inter-organ dynamics through explicit
+inter-agent communication channels, enabling coordinated treatment strategies across
+organs. Furthermore, we introduce a dual-layer state representation technique
+to contextualize patient conditions at various hierarchical levels, enhancing
+treatment accuracy and relevance. Through extensive qualitative and
+quantitative evaluations in managing sepsis (a complex multi-organ disease),
+our approach demonstrates its ability to learn effective treatment policies
+that significantly improve patient survival rates. This framework marks a
+substantial advancement in clinical decision support systems, pioneering a
+comprehensive approach for multi-organ treatment recommendations.
+
+
+
+
+
+ + ☆ Fast Forwarding Low-Rank Training + + +
+ Parameter efficient finetuning methods like low-rank adaptation (LoRA) aim to +reduce the computational costs of finetuning pretrained Language Models (LMs). +Enabled by these low-rank settings, we propose an even more efficient +optimization strategy: Fast Forward, a simple and effective approach to +accelerate large segments of training. In a Fast Forward stage, we repeat the +most recent optimizer step until the loss stops improving on a tiny validation +set. By alternating between regular optimization steps and Fast Forward stages, +Fast Forward provides up to an 87\% reduction in FLOPs and up to an 81\% +reduction in train time over standard SGD with Adam. We validate Fast Forward +by finetuning various models on different tasks and demonstrate that it speeds +up training without compromising model performance. Additionally, we analyze +when and how to apply Fast Forward. + +
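+ An illustrative sketch (stopping rule and repeat budget are assumptions) of the Fast Forward stage described above: keep re-applying the latest parameter update direction while the loss on a tiny validation set keeps improving, then undo the last repeat and hand control back to the regular optimizer.
+import torch
+
+def fast_forward(params, last_deltas, tiny_val_loss, max_repeats=100):
+    # last_deltas[i] = p_i(after last optimizer step) - p_i(before), precomputed.
+    best = tiny_val_loss()
+    for _ in range(max_repeats):
+        with torch.no_grad():
+            for p, d in zip(params, last_deltas):
+                p.add_(d)                      # repeat the most recent step
+        loss = tiny_val_loss()
+        if loss >= best:                       # stopped improving: undo and exit
+            with torch.no_grad():
+                for p, d in zip(params, last_deltas):
+                    p.sub_(d)
+            break
+        best = loss
+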
+
+
+
+
+ + ☆ Towards Privacy-Preserving Relational Data Synthesis via Probabilistic + Relational Models + + +
+ Probabilistic relational models provide a well-established formalism to +combine first-order logic and probabilistic models, thereby allowing to +represent relationships between objects in a relational domain. At the same +time, the field of artificial intelligence requires increasingly large amounts +of relational training data for various machine learning tasks. Collecting +real-world data, however, is often challenging due to privacy concerns, data +protection regulations, high costs, and so on. To mitigate these challenges, +the generation of synthetic data is a promising approach. In this paper, we +solve the problem of generating synthetic relational data via probabilistic +relational models. In particular, we propose a fully-fledged pipeline to go +from relational database to probabilistic relational model, which can then be +used to sample new synthetic relational data points from its underlying +probability distribution. As part of our proposed pipeline, we introduce a +learning algorithm to construct a probabilistic relational model from a given +relational database. + +
+
+ comment: Accepted to the Proceedings of the 47th German Conference on + Artificial Intelligence (KI 2024) +
+
+
+
+
+ + ☆ Reassessing the Validity of Spurious Correlations Benchmarks + + +
+ Neural networks can fail when the data contains spurious correlations. To +understand this phenomenon, researchers have proposed numerous spurious +correlations benchmarks upon which to evaluate mitigation methods. However, we +observe that these benchmarks exhibit substantial disagreement, with the best +methods on one benchmark performing poorly on another. We explore this +disagreement, and examine benchmark validity by defining three desiderata that +a benchmark should satisfy in order to meaningfully evaluate methods. Our +results have implications for both benchmarks and mitigations: we find that +certain benchmarks are not meaningful measures of method performance, and that +several methods are not sufficiently robust for widespread use. We present a +simple recipe for practitioners to choose methods using the most similar +benchmark to their given problem. + +
+
+
+
+
+ + ☆ Residual Stream Analysis with Multi-Layer SAEs + + +
+ Sparse autoencoders (SAEs) are a promising approach to interpreting the +internal representations of transformer language models. However, standard SAEs +are trained separately on each transformer layer, making it difficult to use +them to study how information flows across layers. To solve this problem, we +introduce the multi-layer SAE (MLSAE): a single SAE trained on the residual +stream activation vectors from every transformer layer simultaneously. The +residual stream is usually understood as preserving information across layers, +so we expected to, and did, find individual SAE features that are active at +multiple layers. Interestingly, while a single SAE feature is active at +different layers for different prompts, for a single prompt, we find that a +single feature is far more likely to be active at a single layer. For larger +underlying models, we find that the cosine similarities between adjacent layers +in the residual stream are higher, so we expect more features to be active at +multiple layers. These results show that MLSAEs are a promising method to study +information flow in transformers. We release our code to train and analyze +MLSAEs at https://github.com/tim-lawson/mlsae. + +
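+ A minimal sketch (with an L1 sparsity penalty as a stand-in; the released code at the repository above is the authoritative version) of a single sparse autoencoder fit to residual-stream activations pooled from every layer, rather than one SAE per layer.
+import torch
+import torch.nn as nn
+
+class MultiLayerSAE(nn.Module):
+    def __init__(self, d_model, d_hidden, l1=1e-3):
+        super().__init__()
+        self.enc = nn.Linear(d_model, d_hidden)
+        self.dec = nn.Linear(d_hidden, d_model)
+        self.l1 = l1
+
+    def forward(self, x):
+        # x: activations from *all* layers stacked along the batch dimension
+        z = torch.relu(self.enc(x))
+        x_hat = self.dec(z)
+        loss = ((x_hat - x) ** 2).mean() + self.l1 * z.abs().mean()
+        return z, loss
+
+# resid[l] holds residual-stream vectors for layer l, shape (tokens, d_model)
+resid = [torch.randn(128, 512) for _ in range(12)]
+sae = MultiLayerSAE(512, 4096)
+_, loss = sae(torch.cat(resid, dim=0))
+loss.backward()
+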
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ☆ The Prevalence of Neural Collapse in Neural Multivariate Regression + + +
+ Recently it has been observed that neural networks exhibit Neural Collapse +(NC) during the final stage of training for the classification problem. We +empirically show that multivariate regression, as employed in imitation +learning and other applications, exhibits Neural Regression Collapse (NRC), a +new form of neural collapse: (NRC1) The last-layer feature vectors collapse to +the subspace spanned by the $n$ principal components of the feature vectors, +where $n$ is the dimension of the targets (for univariate regression, $n=1$); +(NRC2) The last-layer feature vectors also collapse to the subspace spanned by +the last-layer weight vectors; (NRC3) The Gram matrix for the weight vectors +converges to a specific functional form that depends on the covariance matrix +of the targets. After empirically establishing the prevalence of (NRC1)-(NRC3) +for a variety of datasets and network architectures, we provide an explanation +of these phenomena by modeling the regression task in the context of the +Unconstrained Feature Model (UFM), in which the last layer feature vectors are +treated as free variables when minimizing the loss function. We show that when +the regularization parameters in the UFM model are strictly positive, then +(NRC1)-(NRC3) also emerge as solutions in the UFM optimization problem. We also +show that if the regularization parameters are equal to zero, then there is no +collapse. To our knowledge, this is the first empirical and theoretical study +of neural collapse in the context of regression. This extension is significant +not only because it broadens the applicability of neural collapse to a new +category of problems but also because it suggests that the phenomena of neural +collapse could be a universal behavior in deep learning. + +
+
+
+
+
+ + ☆ CISCA and CytoDArk0: a Cell Instance Segmentation and Classification + method for histo(patho)logical image Analyses and a new, open, Nissl-stained + dataset for brain cytoarchitecture studies + + +
+ Delineating and classifying individual cells in microscopy tissue images is a +complex task, yet it is a pivotal endeavor in various medical and biological +investigations. We propose a new deep learning framework (CISCA) for automatic +cell instance segmentation and classification in histological slices to support +detailed morphological and structural analysis or straightforward cell counting +in digital pathology workflows and brain cytoarchitecture studies. At the core +of CISCA lies a network architecture featuring a lightweight U-Net with three +heads in the decoder. The first head classifies pixels into boundaries between +neighboring cells, cell bodies, and background, while the second head regresses +four distance maps along four directions. The network outputs from the first +and second heads are integrated through a tailored post-processing step, which +ultimately yields the segmentation of individual cells. A third head enables +simultaneous classification of cells into relevant classes, if required. We +showcase the effectiveness of our method using four datasets, including CoNIC, +PanNuke, and MoNuSeg, which are publicly available H\&E datasets. Additionally, +we introduce CytoDArk0, a novel dataset consisting of Nissl-stained images of +the cortex, cerebellum, and hippocampus from mammals belonging to the orders +Cetartiodactyla and Primates. We evaluate CISCA in comparison to other +state-of-the-art methods, demonstrating CISCA's robustness and accuracy in +segmenting and classifying cells across diverse tissue types, magnifications, +and staining techniques. + +
+
+
+
+
+ + ☆ Towards Measuring Sell Side Outcomes in Buy Side Marketplace Experiments + using In-Experiment Bipartite Graph KDD 2024 + + +
+ In this study, we evaluate causal inference estimators for online controlled +bipartite graph experiments in a real marketplace setting. Our novel +contribution is constructing a bipartite graph using in-experiment data, rather +than relying on prior knowledge or historical data, the common approach in the +literature published to date. We build the bipartite graph from various +interactions between buyers and sellers in the marketplace, establishing a +novel research direction at the intersection of bipartite experiments and +mediation analysis. This approach is crucial for modern marketplaces aiming to +evaluate seller-side causal effects in buyer-side experiments, or vice versa. +We demonstrate our method using historical buyer-side experiments conducted at +Vinted, the largest second-hand marketplace in Europe with over 80M users. + +
+
+ comment: 5 pages, 3 figures, this work was presented at the KDD 2024 + Conference Undergraduate Consortium +
+
+
+
+
+ + ☆ Can OpenSource beat ChatGPT? -- A Comparative Study of Large Language + Models for Text-to-Code Generation + + +
+ In recent years, large language models (LLMs) have emerged as powerful tools
+with potential applications in various fields, including software engineering.
+Within the scope of this research, we evaluate five different state-of-the-art
+LLMs - Bard, BingChat, ChatGPT, Llama2, and Code Llama - concerning their
+capabilities for text-to-code generation. In an empirical study, we feed
+prompts with textual descriptions of coding problems sourced from the
+programming website LeetCode to the models with the task of creating solutions
+in Python. Subsequently, the quality of the generated outputs is assessed using
+the testing functionalities of LeetCode. The results indicate large differences
+in performance between the investigated models. ChatGPT can handle these
+typical programming challenges by far the most effectively, surpassing even
+code-specialized models like Code Llama. To gain further insights, we measure
+the runtime as well as the memory usage of the generated outputs and compare
+them to the other code submissions on LeetCode. A detailed error analysis,
+encompassing a comparison of the differences concerning correct indentation and
+form of the generated code as well as an assignment of the incorrectly solved
+tasks to certain error categories, allows us to obtain a more nuanced picture of
+the results and potential for improvement. The results also show a clear
+pattern of increasingly incorrect generated code when the models face a
+lot of context in the form of longer prompts.
+
+
+ comment: Conference Paper accepted at the 9th SwissText Conference (2024) +
+
+
+
+
+ + ☆ CUQ-GNN: Committee-based Graph Uncertainty Quantification using + Posterior Networks ECML + + +
+ In this work, we study the influence of domain-specific characteristics when
+defining a meaningful notion of predictive uncertainty on graph data.
+Previously, the so-called Graph Posterior Network (GPN) model has been proposed
+to quantify uncertainty in node classification tasks. Given a graph, it uses
+Normalizing Flows (NFs) to estimate class densities for each node independently
+and converts those densities into Dirichlet pseudo-counts, which are then
+dispersed through the graph using the personalized PageRank algorithm. The
+architecture of GPNs is motivated by a set of three axioms on the properties of
+its uncertainty estimates. We show that those axioms are not always satisfied
+in practice and therefore propose the family of Committee-based Uncertainty
+Quantification Graph Neural Networks (CUQ-GNNs), which combine standard Graph
+Neural Networks with the NF-based uncertainty estimation of Posterior Networks
+(PostNets). This approach adapts more flexibly to domain-specific demands on
+the properties of uncertainty estimates. We compare CUQ-GNN against GPN and
+other uncertainty quantification approaches on common node classification
+benchmarks and show that it is effective at producing useful uncertainty
+estimates.
+
+
+ comment: 17 pages, 4 figures, 1 table. Accepted at ECML PKDD 2024. arXiv admin + note: substantial text overlap with arXiv:2406.04041 +
+
+
+
+
+ + ☆ An efficient hp-Variational PINNs framework for incompressible + Navier-Stokes equations + + +
+ Physics-informed neural networks (PINNs) are able to solve partial +differential equations (PDEs) by incorporating the residuals of the PDEs into +their loss functions. Variational Physics-Informed Neural Networks (VPINNs) and +hp-VPINNs use the variational form of the PDE residuals in their loss function. +Although hp-VPINNs have shown promise over traditional PINNs, they suffer from +higher training times and lack a framework capable of handling complex +geometries, which limits their application to more complex PDEs. As such, +hp-VPINNs have not been applied in solving the Navier-Stokes equations, amongst +other problems in CFD, thus far. FastVPINNs was introduced to address these +challenges by incorporating tensor-based loss computations, significantly +improving the training efficiency. Moreover, by using the bilinear +transformation, the FastVPINNs framework was able to solve PDEs on complex +geometries. In the present work, we extend the FastVPINNs framework to +vector-valued problems, with a particular focus on solving the incompressible +Navier-Stokes equations for two-dimensional forward and inverse problems, +including problems such as the lid-driven cavity flow, the Kovasznay flow, and +flow past a backward-facing step for Reynolds numbers up to 200. Our results +demonstrate a 2x improvement in training time while maintaining the same order +of accuracy compared to PINNs algorithms documented in the literature. We +further showcase the framework's efficiency in solving inverse problems for the +incompressible Navier-Stokes equations by accurately identifying the Reynolds +number of the underlying flow. Additionally, the framework's ability to handle +complex geometries highlights its potential for broader applications in +computational fluid dynamics. This implementation opens new avenues for +research on hp-VPINNs, potentially extending their applicability to more +complex problems. + +
+
+ comment: 18 pages, 13 tables and 20 figures +
+
+
+
+
+ + ☆ Half-VAE: An Encoder-Free VAE to Bypass Explicit Inverse Mapping + + +
+ Inference and inverse problems are closely related concepts, both +fundamentally involving the deduction of unknown causes or parameters from +observed data. Bayesian inference, a powerful class of methods, is often +employed to solve a variety of problems, including those related to causal +inference. Variational inference, a subset of Bayesian inference, is primarily +used to efficiently approximate complex posterior distributions. Variational +Autoencoders (VAEs), which combine variational inference with deep learning, +have become widely applied across various domains. This study explores the +potential of VAEs for solving inverse problems, such as Independent Component +Analysis (ICA), without relying on an explicit inverse mapping process. Unlike +other VAE-based ICA methods, this approach discards the encoder in the VAE +architecture, directly setting the latent variables as trainable parameters. In +other words, the latent variables are no longer outputs of the encoder but are +instead optimized directly through the objective function to converge to +appropriate values. We find that, with a suitable prior setup, the latent +variables, represented by trainable parameters, can exhibit mutually +independent properties as the parameters converge, all without the need for an +encoding process. This approach, referred to as the Half-VAE, bypasses the +inverse mapping process by eliminating the encoder. This study demonstrates the +feasibility of using the Half-VAE to solve ICA without the need for an explicit +inverse mapping process. + +
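+ A bare-bones sketch (the prior and objective the authors use for ICA may differ) of the encoder-free setup described above: the per-sample latent variables are ordinary trainable parameters optimized jointly with the decoder, with a reparameterized sample and a KL term to a factorized prior standing in for the missing encoder.
+import torch
+import torch.nn as nn
+
+class HalfVAE(nn.Module):
+    def __init__(self, n_points, latent_dim, data_dim):
+        super().__init__()
+        # Latents are trainable parameters, one (mu, log_var) pair per data point.
+        self.mu = nn.Parameter(torch.zeros(n_points, latent_dim))
+        self.log_var = nn.Parameter(torch.zeros(n_points, latent_dim))
+        self.decoder = nn.Sequential(nn.Linear(latent_dim, 64), nn.ReLU(),
+                                     nn.Linear(64, data_dim))
+
+    def forward(self, idx, x):
+        std = torch.exp(0.5 * self.log_var[idx])
+        z = self.mu[idx] + std * torch.randn_like(std)       # reparameterization
+        rec_loss = ((self.decoder(z) - x) ** 2).mean()
+        # KL to a factorized standard Gaussian prior (placeholder; an
+        # independence-encouraging prior would be used for ICA).
+        kl = -0.5 * (1 + self.log_var[idx] - self.mu[idx] ** 2
+                     - self.log_var[idx].exp()).mean()
+        return rec_loss + kl
+
+model = HalfVAE(n_points=100, latent_dim=4, data_dim=16)
+loss = model(torch.arange(8), torch.randn(8, 16))
+loss.backward()
+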
+
+
+
+
+ + ☆ Active-Passive Federated Learning for Vertically Partitioned Multi-view + Data + + +
+ Vertical federated learning is a natural and elegant approach to integrating
+multi-view data vertically partitioned across devices (clients) while
+preserving their privacy. Beyond model training, existing methods
+require the collaboration of all clients for model inference. However,
+model inference is typically maintained as a service for a long time, while such
+collaboration, especially when the clients belong to different organizations,
+is unpredictable in real-world scenarios owing to, for example, cancellation of contracts or
+network unavailability, resulting in inference failures. To address this
+issue, we make a first attempt and propose a flexible Active-Passive Federated
+learning (APFed) framework. Specifically, the active client is the initiator of
+a learning task and is responsible for building the complete model, while the passive
+clients only serve as assistants. Once the model is built, the active client can
+perform inference independently. In addition, we instantiate the APFed framework into
+two classification methods, employing the reconstruction loss and the
+contrastive loss on the passive clients, respectively. The two methods
+are tested in a set of experiments and achieve the desired results, validating
+their effectiveness.
+
+
+
+
+
+ + ☆ Can LLMs Generate Novel Research Ideas? A Large-Scale Human Study with + 100+ NLP Researchers + + +
+ Recent advancements in large language models (LLMs) have sparked optimism +about their potential to accelerate scientific discovery, with a growing number +of works proposing research agents that autonomously generate and validate new +ideas. Despite this, no evaluations have shown that LLM systems can take the +very first step of producing novel, expert-level ideas, let alone perform the +entire research process. We address this by establishing an experimental design +that evaluates research idea generation while controlling for confounders and +performs the first head-to-head comparison between expert NLP researchers and +an LLM ideation agent. By recruiting over 100 NLP researchers to write novel +ideas and blind reviews of both LLM and human ideas, we obtain the first +statistically significant conclusion on current LLM capabilities for research +ideation: we find LLM-generated ideas are judged as more novel (p < 0.05) than +human expert ideas while being judged slightly weaker on feasibility. Studying +our agent baselines closely, we identify open problems in building and +evaluating research agents, including failures of LLM self-evaluation and their +lack of diversity in generation. Finally, we acknowledge that human judgements +of novelty can be difficult, even by experts, and propose an end-to-end study +design which recruits researchers to execute these ideas into full projects, +enabling us to study whether these novelty and feasibility judgements result in +meaningful differences in research outcome. + +
+
+ comment: main paper is 20 pages +
+
+
+
+
+ + ☆ MixNet: Joining Force of Classical and Modern Approaches Toward the + Comprehensive Pipeline in Motor Imagery EEG Classification + + +
+ Recent advances in deep learning (DL) have significantly impacted motor +imagery (MI)-based brain-computer interface (BCI) systems, enhancing the +decoding of electroencephalography (EEG) signals. However, most studies +struggle to identify discriminative patterns across subjects during MI tasks, +limiting MI classification performance. In this article, we propose MixNet, a +novel classification framework designed to overcome this limitation by +utilizing spectral-spatial signals from MI data, along with a multitask +learning architecture named MIN2Net, for classification. Here, the +spectral-spatial signals are generated using the filter-bank common spatial +patterns (FBCSPs) method on MI data. Since the multitask learning architecture +is used for the classification task, the learning in each task may exhibit +different generalization rates and potential overfitting across tasks. To +address this issue, we implement adaptive gradient blending, simultaneously +regulating multiple loss weights and adjusting the learning pace for each task +based on its generalization/overfitting tendencies. Experimental results on six +benchmark data sets of different data sizes demonstrate that MixNet +consistently outperforms all state-of-the-art algorithms in subject-dependent +and -independent settings. Finally, the low-density EEG MI classification +results show that MixNet outperforms all state-of-the-art algorithms, offering +promising implications for Internet of Thing (IoT) applications, such as +lightweight and portable EEG wearable devices based on low-density montages. + +
+
+ comment: Supplementary materials and source codes are available on-line at + https://github.com/Max-Phairot-A/MixNet +
+
+
+
+
+ + ☆ The Role of Graph Topology in the Performance of Biomedical Knowledge + Graph Completion Models + + +
+ Knowledge Graph Completion has been increasingly adopted as a useful method
+for several tasks in biomedical research, like drug repurposing or drug-target
+identification. To that end, a variety of datasets and Knowledge Graph
+Embedding models have been proposed over the years. However, little is known
+about the properties that render a dataset useful for a given task and, even
+though theoretical properties of Knowledge Graph Embedding models are well
+understood, their practical utility in this field remains controversial. We
+conduct a comprehensive investigation into the topological properties of
+publicly available biomedical Knowledge Graphs and establish links to the
+accuracy observed in real-world applications. By releasing all model
+predictions and a new suite of analysis tools, we invite the community to build
+upon our work and continue improving the understanding of these crucial
+applications.
+
+
+
+
+
+ + ☆ Ultra-imbalanced classification guided by statistical information + + +
+ Imbalanced data are frequently encountered in real-world classification
+tasks. Previous works on imbalanced learning mostly focused on learning with a
+minority class of few samples. However, the notion of imbalance also applies to
+cases where the minority class contains abundant samples, which is usually the
+case for industrial applications like fraud detection in the area of financial
+risk management. In this paper, we take a population-level approach to
+imbalanced learning by proposing a new formulation called
+\emph{ultra-imbalanced classification} (UIC). Under UIC, loss functions behave
+differently even if an infinite amount of training data is available. To
+understand the intrinsic difficulty of UIC problems, we borrow ideas from
+information theory and establish a framework to compare different loss
+functions through the lens of statistical information. A novel learning
+objective termed Tunable Boosting Loss is developed, which is provably resistant
+to data imbalance under UIC and empirically effective, as
+verified by extensive experimental studies on both public and industrial
+datasets.
+
+
+
+
+
+ + ☆ UI-JEPA: Towards Active Perception of User Intent through Onscreen User + Activity + + +
+ Generating user intent from a sequence of user interface (UI) actions is a
+core challenge in comprehensive UI understanding. Recent advancements in
+multimodal large language models (MLLMs) have led to substantial progress in
+this area, but their demands for extensive model parameters and computing power,
+together with their high latency, make them impractical for scenarios requiring lightweight,
+on-device solutions with low latency or heightened privacy. Additionally, the
+lack of high-quality datasets has hindered the development of such lightweight
+models. To address these challenges, we propose UI-JEPA, a novel framework that
+employs masking strategies to learn abstract UI embeddings from unlabeled data
+through self-supervised learning, combined with an LLM decoder fine-tuned for
+user intent prediction. We also introduce two new UI-grounded multimodal
+datasets, "Intent in the Wild" (IIW) and "Intent in the Tame" (IIT), designed
+for few-shot and zero-shot UI understanding tasks. IIW consists of 1.7K videos
+across 219 intent categories, while IIT contains 914 videos across 10
+categories. We establish the first baselines for these datasets, showing that
+representations learned using a JEPA-style objective, combined with an LLM
+decoder, can achieve user intent predictions that match the performance of
+state-of-the-art large MLLMs, but with significantly reduced annotation and
+deployment resources. Measured by intent similarity scores, UI-JEPA outperforms
+GPT-4 Turbo and Claude 3.5 Sonnet by 10.0% and 7.2% respectively, averaged
+across two datasets. Notably, UI-JEPA achieves this performance with a 50.5x
+reduction in computational cost and a 6.6x improvement in latency in the IIW
+dataset. These results underscore the effectiveness of UI-JEPA, highlighting
+its potential for lightweight, high-performance UI understanding.
+
+
+
+
+
+ + ☆ Study of Brain Network in Alzheimers Disease Using Wavelet-Based Graph + Theory Method + + +
+ Alzheimer's disease (AD) is a neurodegenerative disorder marked by memory +loss and cognitive decline, making early detection vital for timely +intervention. However, early diagnosis is challenging due to the heterogeneous +presentation of symptoms. Resting-state fMRI (rs-fMRI) captures spontaneous +brain activity and functional connectivity, which are known to be disrupted in +AD and mild cognitive impairment (MCI). Traditional methods, such as Pearson's +correlation, have been used to calculate association matrices, but these +approaches often overlook the dynamic and non-stationary nature of brain +activity. In this study, we introduce a novel method that integrates discrete +wavelet transform (DWT) and graph theory to model the dynamic behavior of brain +networks. By decomposing rs-fMRI signals using DWT, our approach captures the +time-frequency representation of brain activity, allowing for a more nuanced +analysis of the underlying network dynamics. Graph theory provides a robust +mathematical framework to analyze these complex networks, while machine +learning is employed to automate the discrimination of different stages of AD +based on learned patterns from different frequency bands. We applied our method +to a dataset of rs-fMRI images from the Alzheimer's Disease Neuroimaging +Initiative (ADNI) database, demonstrating its potential as an early diagnostic +tool for AD and for monitoring disease progression. Our statistical analysis +identifies specific brain regions and connections that are affected in AD and +MCI, at different frequency bands, offering deeper insights into the disease's +impact on brain function. + +
+
+
+
+
+ + ☆ Online Residual Learning from Offline Experts for Pedestrian Tracking + + +
+ In this paper, we consider the problem of predicting unknown targets from +data. We propose Online Residual Learning (ORL), a method that combines online +adaptation with offline-trained predictions. At a lower level, we employ +multiple offline predictions generated before or at the beginning of the +prediction horizon. We augment every offline prediction by learning their +respective residual error concerning the true target state online, using the +recursive least squares algorithm. At a higher level, we treat the augmented +lower-level predictors as experts, adopting the Prediction with Expert Advice +framework. We utilize an adaptive softmax weighting scheme to form an aggregate +prediction and provide guarantees for ORL in terms of regret. We employ ORL to +boost performance in the setting of online pedestrian trajectory prediction. +Based on data from the Stanford Drone Dataset, we show that ORL can demonstrate +best-of-both-worlds performance. + +
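+ A small sketch (feature choice and forgetting factor are assumptions) of the lower level of ORL as described above: each offline predictor is augmented with a recursive-least-squares model of its own residual error; the augmented predictors can then be mixed with an exponential-weights expert scheme (not shown).
+import numpy as np
+
+class ResidualRLS:
+    # Recursive least squares on the residual of one offline predictor.
+    def __init__(self, dim, lam=0.99):
+        self.theta = np.zeros(dim)
+        self.P = np.eye(dim) * 1e3
+        self.lam = lam                                   # forgetting factor
+
+    def predict(self, phi, offline_pred):
+        return offline_pred + phi @ self.theta           # offline prediction + learned residual
+
+    def update(self, phi, offline_pred, target):
+        err = target - self.predict(phi, offline_pred)   # residual error w.r.t. true state
+        Pphi = self.P @ phi
+        k = Pphi / (self.lam + phi @ Pphi)
+        self.theta += k * err
+        self.P = (self.P - np.outer(k, Pphi)) / self.lam
+
+rls = ResidualRLS(dim=4)
+phi = np.random.default_rng(0).normal(size=4)
+rls.update(phi, offline_pred=1.0, target=1.3)
+print(rls.predict(phi, offline_pred=1.0))
+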
+
+ comment: Accepted to CDC 2024 +
+
+
+
+
+ + ☆ FEM-based Neural Networks for Solving Incompressible Fluid Flows and + Related Inverse Problems + + +
+ The numerical simulation and optimization of technical systems described by +partial differential equations is expensive, especially in multi-query +scenarios in which the underlying equations have to be solved for different +parameters. A comparatively new approach in this context is to combine the good +approximation properties of neural networks (for parameter dependence) with the +classical finite element method (for discretization). However, instead of +considering the solution mapping of the PDE from the parameter space into the +FEM-discretized solution space as a purely data-driven regression problem, +so-called physically informed regression problems have proven to be useful. In +these, the equation residual is minimized during the training of the neural +network, i.e. the neural network "learns" the physics underlying the problem. +In this paper, we extend this approach to saddle-point and non-linear fluid +dynamics problems, respectively, namely stationary Stokes and stationary +Navier-Stokes equations. In particular, we propose a modification of the +existing approach: Instead of minimizing the plain vanilla equation residual +during training, we minimize the equation residual modified by a +preconditioner. By analogy with the linear case, this also improves the +condition in the present non-linear case. Our numerical examples demonstrate +that this approach significantly reduces the training effort and greatly +increases accuracy and generalizability. Finally, we show the application of +the resulting parameterized model to a related inverse problem. + +
+
+
+
+
+ + ☆ D4: Text-guided diffusion model-based domain adaptive data augmentation + for vineyard shoot detection + + +
+ In an agricultural field, plant phenotyping using object detection models is +gaining attention. However, collecting the training data necessary to create +generic and high-precision models is extremely challenging due to the +difficulty of annotation and the diversity of domains. Furthermore, it is +difficult to transfer training data across different crops, and although +machine learning models effective for specific environments, conditions, or +crops have been developed, they cannot be widely applied in actual fields. In +this study, we propose a generative data augmentation method (D4) for vineyard +shoot detection. D4 uses a pre-trained text-guided diffusion model based on a +large number of original images culled from video data collected by unmanned +ground vehicles or other means, and a small number of annotated datasets. The +proposed method generates new annotated images with background information +adapted to the target domain while retaining annotation information necessary +for object detection. In addition, D4 overcomes the lack of training data in +agriculture, including the difficulty of annotation and diversity of domains. +We confirmed that this generative data augmentation method improved the mean +average precision by up to 28.65% for the BBox detection task and the average +precision by up to 13.73% for the keypoint detection task for vineyard shoot +detection. Our generative data augmentation method D4 is expected to +simultaneously solve the cost and domain diversity issues of training data +generation in agriculture and improve the generalization performance of +detection models. + +
+
+
+
+
+ + ☆ Heterogeneity-Aware Cooperative Federated Edge Learning with Adaptive + Computation and Communication Compression + + +
+ Motivated by the drawbacks of cloud-based federated learning (FL), +cooperative federated edge learning (CFEL) has been proposed to improve +efficiency for FL over mobile edge networks, where multiple edge servers +collaboratively coordinate the distributed model training across a large number +of edge devices. However, CFEL faces critical challenges arising from dynamic +and heterogeneous device properties, which slow down the convergence and +increase resource consumption. This paper proposes a heterogeneity-aware CFEL +scheme called \textit{Heterogeneity-Aware Cooperative Edge-based Federated +Averaging} (HCEF) that aims to maximize the model accuracy while minimizing the +training time and energy consumption via adaptive computation and communication +compression in CFEL. By theoretically analyzing how local update frequency and +gradient compression affect the convergence error bound in CFEL, we develop an +efficient online control algorithm for HCEF to dynamically determine local +update frequencies and compression ratios for heterogeneous devices. +Experimental results show that compared with prior schemes, the proposed HCEF +scheme can maintain higher model accuracy while reducing training latency and +improving energy efficiency simultaneously. + +
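As one concrete example of the kind of communication compression an adaptive scheme could tune per device and per round, the sketch below implements plain top-k gradient sparsification with a configurable compression ratio. This is only an illustrative operator, not HCEF's actual compression or online control algorithm.

```python
# Sketch: top-k gradient sparsification with a per-device compression ratio,
# one simple operator an adaptive scheme could tune each round (illustrative).
import numpy as np

def topk_compress(grad, ratio):
    # Keep only the largest-magnitude fraction `ratio` of gradient entries.
    flat = grad.ravel()
    k = max(1, int(ratio * flat.size))
    idx = np.argpartition(np.abs(flat), -k)[-k:]
    return idx, flat[idx], grad.shape

def topk_decompress(idx, values, shape):
    out = np.zeros(int(np.prod(shape)))
    out[idx] = values
    return out.reshape(shape)

rng = np.random.default_rng(0)
grad = rng.standard_normal((256, 64))
idx, vals, shape = topk_compress(grad, ratio=0.05)          # 5% of entries transmitted
recovered = topk_decompress(idx, vals, shape)
print("kept entries:", vals.size,
      "relative error:", np.linalg.norm(grad - recovered) / np.linalg.norm(grad))
```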
+
+ comment: 20 pages, 7 figures +
+
+
+
+
+ + ☆ Over-parameterized regression methods and their application to + semi-supervised learning + + +
+ The minimum norm least squares is an estimation strategy for the
+over-parameterized case and, in machine learning, is known as a helpful tool
+for understanding the nature of deep learning. In this paper, to apply it in
+the context of non-parametric regression problems, we establish several
+methods based on thresholding of SVD (singular value decomposition)
+components, which we refer to as SVD regression methods. We consider several
+variants: singular-value-based thresholding, hard thresholding with
+cross-validation, universal thresholding, and bridge thresholding. Information
+on output samples is not utilized in the first method, while it is utilized in
+the others. We then apply these methods to semi-supervised learning, in which
+unlabeled input samples are incorporated into the kernel functions of a
+regressor. The experimental results on real data show that, depending on the
+dataset, the SVD regression methods are superior to a naive ridge regression
+method. Unfortunately, there was no clear advantage for the methods utilizing
+information on output samples. Furthermore, depending on the dataset,
+incorporating unlabeled input samples into the kernels is found to have
+certain advantages.
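A minimal sketch of one of the simpler variants described above: minimum-norm least squares computed through the SVD, with small singular values hard-thresholded before inversion. The relative threshold and the synthetic over-parameterized problem are illustrative assumptions; the cross-validated, universal, and bridge thresholding variants are not shown.

```python
# Sketch: minimum-norm least squares via SVD with hard thresholding of small
# singular values (one SVD-regression variant; threshold value is illustrative).
import numpy as np

def svd_regression(X, y, threshold=1e-2):
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    keep = s > threshold * s[0]                  # hard-threshold relative to the largest singular value
    s_inv = np.where(keep, 1.0 / s, 0.0)         # pseudo-inverse of the kept components only
    return Vt.T @ (s_inv * (U.T @ y))            # coefficient vector

rng = np.random.default_rng(0)
n, p = 50, 200                                    # over-parameterized: p > n
X = rng.standard_normal((n, p))
beta_true = np.zeros(p); beta_true[:5] = 1.0
y = X @ beta_true + 0.1 * rng.standard_normal(n)

beta_hat = svd_regression(X, y)
print("train residual:", np.linalg.norm(X @ beta_hat - y))
```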
+
+
+
+
+ + ☆ Goal-Reaching Policy Learning from Non-Expert Observations via Effective + Subgoal Guidance + + +
+ In this work, we address the challenging problem of long-horizon +goal-reaching policy learning from non-expert, action-free observation data. +Unlike fully labeled expert data, our data is more accessible and avoids the +costly process of action labeling. Additionally, compared to online learning, +which often involves aimless exploration, our data provides useful guidance for +more efficient exploration. To achieve our goal, we propose a novel subgoal +guidance learning strategy. The motivation behind this strategy is that +long-horizon goals offer limited guidance for efficient exploration and +accurate state transition. We develop a diffusion strategy-based high-level +policy to generate reasonable subgoals as waypoints, preferring states that +more easily lead to the final goal. Additionally, we learn state-goal value +functions to encourage efficient subgoal reaching. These two components +naturally integrate into the off-policy actor-critic framework, enabling +efficient goal attainment through informative exploration. We evaluate our +method on complex robotic navigation and manipulation tasks, demonstrating a +significant performance advantage over existing methods. Our ablation study +further shows that our method is robust to observation data with various +corruptions. + +
+
+ comment: Accepted to CoRL 2024 +
+
+
+
+
+ + ☆ An Efficient and Generalizable Symbolic Regression Method for Time + Series Analysis + + +
+ Time series analysis and prediction methods currently excel in quantitative +analysis, offering accurate future predictions and diverse statistical +indicators, but generally falling short in elucidating the underlying evolution +patterns of time series. To gain a more comprehensive understanding and provide +insightful explanations, we utilize symbolic regression techniques to derive +explicit expressions for the non-linear dynamics in the evolution of time +series variables. However, these techniques face challenges in computational +efficiency and generalizability across diverse real-world time series data. To +overcome these challenges, we propose \textbf{N}eural-\textbf{E}nhanced +\textbf{Mo}nte-Carlo \textbf{T}ree \textbf{S}earch (NEMoTS) for time series. +NEMoTS leverages the exploration-exploitation balance of Monte-Carlo Tree +Search (MCTS), significantly reducing the search space in symbolic regression +and improving expression quality. Furthermore, by integrating neural networks +with MCTS, NEMoTS not only capitalizes on their superior fitting capabilities +to concentrate on more pertinent operations post-search space reduction, but +also replaces the complex and time-consuming simulation process, thereby +substantially improving computational efficiency and generalizability in time +series analysis. NEMoTS offers an efficient and comprehensive approach to time +series analysis. Experiments with three real-world datasets demonstrate +NEMoTS's significant superiority in performance, efficiency, reliability, and +interpretability, making it well-suited for large-scale real-world time series +data. + +
+
+
+
+
+ + ☆ Entry-Specific Matrix Estimation under Arbitrary Sampling Patterns + through the Lens of Network Flows + + +
+ Matrix completion tackles the task of predicting missing values in a low-rank +matrix based on a sparse set of observed entries. It is often assumed that the +observation pattern is generated uniformly at random or has a very specific +structure tuned to a given algorithm. There is still a gap in our understanding +when it comes to arbitrary sampling patterns. Given an arbitrary sampling +pattern, we introduce a matrix completion algorithm based on network flows in +the bipartite graph induced by the observation pattern. For additive matrices, +the particular flow we used is the electrical flow and we establish error upper +bounds customized to each entry as a function of the observation set, along +with matching minimax lower bounds. Our results show that the minimax squared +error for recovery of a particular entry in the matrix is proportional to the +effective resistance of the corresponding edge in the graph. Furthermore, we +show that our estimator is equivalent to the least squares estimator. We apply +our estimator to the two-way fixed effects model and show that it enables us to +accurately infer individual causal effects and the unit-specific and +time-specific confounders. For rank-$1$ matrices, we use edge-disjoint paths to +form an estimator that achieves minimax optimal estimation when the sampling is +sufficiently dense. Our discovery introduces a new family of estimators +parametrized by network flows, which provide a fine-grained and intuitive +understanding of the impact of the given sampling pattern on the relative +difficulty of estimation at an entry-specific level. This graph-based approach +allows us to quantify the inherent complexity of matrix completion for +individual entries, rather than relying solely on global measures of +performance. + +
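To make the effective-resistance connection concrete, the sketch below builds the bipartite graph induced by an arbitrary sampling pattern (row nodes on one side, column nodes on the other) and computes the effective resistance of each observed edge from the Laplacian pseudoinverse. It only illustrates the quantity appearing in the entry-wise error bounds, not the paper's estimator itself; the sampling pattern is synthetic.

```python
# Sketch: effective resistance of each observation edge in the bipartite graph
# induced by the sampling pattern (row nodes vs. column nodes); synthetic pattern.
import numpy as np

def effective_resistance(mask):
    n_rows, n_cols = mask.shape
    n = n_rows + n_cols
    A = np.zeros((n, n))
    rows, cols = np.nonzero(mask)
    A[rows, n_rows + cols] = 1.0
    A[n_rows + cols, rows] = 1.0
    L = np.diag(A.sum(axis=1)) - A                 # graph Laplacian
    L_pinv = np.linalg.pinv(L)
    R = np.zeros_like(mask, dtype=float)
    for i, j in zip(rows, cols):
        u, v = i, n_rows + j
        R[i, j] = L_pinv[u, u] + L_pinv[v, v] - 2 * L_pinv[u, v]
    return R

rng = np.random.default_rng(0)
mask = rng.random((6, 5)) < 0.5                    # arbitrary sampling pattern
R = effective_resistance(mask)
print(np.round(R, 2))                              # larger resistance ~ harder-to-estimate entry
```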
+
+
+
+
+ + ☆ Bi-modality Images Transfer with a Discrete Process Matching Method + + +
+ Recently, medical image synthesis has gained increasing popularity, along
+with the rapid development of generative models. Medical image synthesis aims
+to generate an unacquired image modality, often from other observed data
+modalities. Synthesized images can be used for clinical diagnostic assistance,
+data augmentation for model training and validation, or image quality
+improvement. Meanwhile, flow-based models are among the successful generative
+models owing to their ability to generate realistic and high-quality synthetic
+images. However, most flow-based models must compute many ordinary
+differential equation (ODE) evolution steps during the transfer process, so
+their performance is significantly limited by the heavy computation time of
+the many time iterations. In this paper, we propose a novel flow-based model,
+namely Discrete Process Matching (DPM), to accomplish bi-modality image
+transfer tasks. Unlike other flow-matching-based models, we propose to utilize
+both forward and backward ODE flows and to enhance consistency on the
+intermediate images of a few discrete time steps, resulting in a transfer
+process with far fewer iteration steps while maintaining high-quality
+generation for both modalities. Our experiments on three datasets of MRI T1/T2
+and CT/MRI demonstrate that DPM outperforms other state-of-the-art flow-based
+methods for bi-modality image synthesis, achieving higher image quality at a
+lower computation cost.
+
+
+
+
+ + ☆ Average Causal Effect Estimation in DAGs with Hidden Variables: + Extensions of Back-Door and Front-Door Criteria + + +
+ The identification theory for causal effects in directed acyclic graphs +(DAGs) with hidden variables is well-developed, but methods for estimating and +inferring functionals beyond the g-formula remain limited. Previous studies +have proposed semiparametric estimators for identifiable functionals in a broad +class of DAGs with hidden variables. While demonstrating double robustness in +some models, existing estimators face challenges, particularly with density +estimation and numerical integration for continuous variables, and their +estimates may fall outside the parameter space of the target estimand. Their +asymptotic properties are also underexplored, especially when using flexible +statistical and machine learning models for nuisance estimation. This study +addresses these challenges by introducing novel one-step corrected plug-in and +targeted minimum loss-based estimators of causal effects for a class of DAGs +that extend classical back-door and front-door criteria (known as the treatment +primal fixability criterion in prior literature). These estimators leverage +machine learning to minimize modeling assumptions while ensuring key +statistical properties such as asymptotic linearity, double robustness, +efficiency, and staying within the bounds of the target parameter space. We +establish conditions for nuisance functional estimates in terms of L2(P)-norms +to achieve root-n consistent causal effect estimates. To facilitate practical +application, we have developed the flexCausal package in R. + +
+
+
+
+
+ + ☆ Algorithmic Collusion Without Threats + + +
+ There has been substantial recent concern that pricing algorithms might
+learn to ``collude.'' Supra-competitive prices can emerge as a Nash
+equilibrium of repeated pricing games, in which sellers play strategies that
+threaten to punish competitors who refuse to support high prices, and these
+strategies can be learned automatically. In fact, a standard economic
+intuition is that supra-competitive prices emerge from either the use of
+threats or a failure of one party to optimize their payoff. Is this intuition
+correct? Would preventing threats in algorithmic decision-making prevent
+supra-competitive prices when sellers are optimizing for their own revenue?
+No. We show that supra-competitive prices can emerge even when both players
+are using algorithms which do not encode threats and which optimize for their
+own revenue. We study sequential pricing games in which a first mover deploys
+an algorithm and then a second mover optimizes within the resulting
+environment. We show that if the first mover deploys any algorithm with a
+no-regret guarantee, and the second mover even approximately optimizes within
+this now static environment, monopoly-like prices arise. The result holds for
+any no-regret learning algorithm deployed by the first mover and for any
+pricing policy of the second mover that obtains profit at least as high as
+random pricing would -- and hence the result applies even when the second
+mover is optimizing only within a space of non-responsive pricing
+distributions which are incapable of encoding threats. In fact, there exists a
+set of strategies, neither of which explicitly encodes threats, that forms a
+Nash equilibrium of the simultaneous pricing game in algorithm space and leads
+to near-monopoly prices. This suggests that the definition of ``algorithmic
+collusion'' may need to be expanded to include strategies without explicitly
+encoded threats.
+
+
+
+
+ + ☆ Epistemic Uncertainty and Observation Noise with the Neural Tangent + Kernel + + +
+ Recent work has shown that training wide neural networks with gradient
+descent is formally equivalent to computing the mean of the posterior
+distribution in a Gaussian Process (GP) with the Neural Tangent Kernel (NTK)
+as the prior covariance and zero aleatoric noise (Jacot et al., 2018). In this
+paper, we extend this framework in two ways. First, we show how to deal with
+non-zero aleatoric noise. Second, we derive an estimator for the posterior
+covariance, giving us a handle on epistemic uncertainty. Our proposed approach
+integrates seamlessly with standard training pipelines, as it involves
+training a small number of additional predictors using gradient descent on a
+mean squared error loss. We demonstrate the proof-of-concept of our method
+through empirical evaluation on synthetic regression.
+
+ comment: 11 pages including appendix +
+
+
+
+
+ + ☆ The Veracity Problem: Detecting False Information and its Propagation on + Online Social Media Networks + + +
+ Detecting false information on social media is critical in mitigating its
+negative societal impacts. To reduce the propagation of false information,
+automated detection provides scalable, unbiased, and cost-effective methods.
+However, we identify three research areas which, once addressed, would improve
+detection. First, current AI-based solutions often provide a uni-dimensional
+analysis of a complex, multi-dimensional issue, with solutions differing based
+on the features used. Furthermore, these methods do not account for the
+temporal and dynamic changes observed within the document's life cycle.
+Second, there has been little research on the detection of coordinated
+information campaigns and on understanding the intent of the actors and the
+campaign. Third, there is a lack of consideration of cross-platform analysis,
+with existing datasets focusing on a single platform, such as X, and detection
+models designed for a specific platform.
+ This work aims to develop methods for effective detection of false
+information and its propagation. To this end, we first propose an ensemble,
+multi-faceted framework that leverages multiple aspects of false information.
+Second, we propose a method to identify actors and their intent when they work
+in coordination to manipulate a narrative. Third, we aim to analyse the impact
+of cross-platform interactions on the propagation of false information via the
+creation of a new dataset.
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Modeling, Inference, and Prediction in Mobility-Based Compartmental + Models for Epidemiology + + +
+ Classical compartmental models in epidemiology often assume a homogeneous +population for simplicity, which neglects the inherent heterogeneity among +individuals. This assumption frequently leads to inaccurate predictions when +applied to real-world data. For example, evidence has shown that classical +models overestimate the final pandemic size in the H1N1-2009 and COVID-19 +outbreaks. To address this issue, we introduce individual mobility as a key +factor in disease transmission and control. We characterize disease dynamics +using mobility distribution functions for each compartment and propose a +mobility-based compartmental model that incorporates population heterogeneity. +Our results demonstrate that, for the same basic reproduction number, our +mobility-based model predicts a smaller final pandemic size compared to the +classical models, effectively addressing the common overestimation problem. +Additionally, we infer mobility distributions from the time series of the +infected population. We provide sufficient conditions for uniquely identifying +the mobility distribution from a dataset and propose a machine-learning-based +approach to learn mobility from both synthesized and real-world data. + +
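The toy sketch below shows one simple way heterogeneous mobility can enter a compartmental model: susceptibles are split into mobility classes whose force of infection scales with their mobility. The class values, population shares, rates, and functional form are illustrative assumptions rather than the paper's model; the sketch only conveys why heterogeneity changes the final pandemic size.

```python
# Toy sketch: SIR-type dynamics where susceptibles are split into mobility
# classes, so more-mobile groups are infected faster (illustrative form only).
import numpy as np

mobility = np.array([0.5, 1.0, 2.0])          # relative mobility of three classes
shares = np.array([0.3, 0.5, 0.2])            # population share of each class
beta, gamma, dt, steps = 0.3, 0.1, 0.1, 2000

S = shares.copy()                              # susceptible fraction per class
I, R = 0.001, 0.0                              # aggregate infected / recovered fractions
S[0] -= 0.001

for _ in range(steps):
    new_inf = beta * I * mobility * S          # class-specific force of infection
    S = S - dt * new_inf
    I = I + dt * (new_inf.sum() - gamma * I)
    R = R + dt * gamma * I

print("final pandemic size:", round(1 - S.sum(), 3))
```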
+
+ comment: 19 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Re-evaluating Retrosynthesis Algorithms with Syntheseus + + +
+ Automated Synthesis Planning has recently re-emerged as a research area at +the intersection of chemistry and machine learning. Despite the appearance of +steady progress, we argue that imperfect benchmarks and inconsistent +comparisons mask systematic shortcomings of existing techniques, and +unnecessarily hamper progress. To remedy this, we present a synthesis planning +library with an extensive benchmarking framework, called syntheseus, which +promotes best practice by default, enabling consistent meaningful evaluation of +single-step models and multi-step planning algorithms. We demonstrate the +capabilities of syntheseus by re-evaluating several previous retrosynthesis +algorithms, and find that the ranking of state-of-the-art models changes in +controlled evaluation experiments. We end with guidance for future works in +this area, and call the community to engage in the discussion on how to improve +benchmarks for synthesis planning. + +
+
+ comment: Accepted for publication in Faraday Discussions +
+
+
+
+
+ + ♻ ☆ Deep Limit Model-free Prediction in Regression + + +
+ In this paper, we provide a novel Model-free approach based on Deep Neural +Network (DNN) to accomplish point prediction and prediction interval under a +general regression setting. Usually, people rely on parametric or +non-parametric models to bridge dependent and independent variables (Y and X). +However, this classical method relies heavily on the correct model +specification. Even for the non-parametric approach, some additive form is +often assumed. A newly proposed Model-free prediction principle sheds light on +a prediction procedure without any model assumption. Previous work regarding +this principle has shown better performance than other standard alternatives. +Recently, DNN, one of the machine learning methods, has received increasing +attention due to its great performance in practice. Guided by the Model-free +prediction idea, we attempt to apply a fully connected forward DNN to map X and +some appropriate reference random variable Z to Y. The targeted DNN is trained +by minimizing a specially designed loss function so that the randomness of Y +conditional on X is outsourced to Z through the trained DNN. Our method is more +stable and accurate compared to other DNN-based counterparts, especially for +optimal point predictions. With a specific prediction procedure, our prediction +interval can capture the estimation variability so that it can render a better +coverage rate for finite sample cases. The superior performance of our method +is verified by simulation and empirical studies. + +
+
+
+
+
+ + ♻ ☆ TacoGFN: Target-conditioned GFlowNet for Structure-based Drug Design + + +
+ Searching the vast chemical space for drug-like molecules that bind with a +protein pocket is a challenging task in drug discovery. Recently, +structure-based generative models have been introduced which promise to be more +efficient by learning to generate molecules for any given protein structure. +However, since they learn the distribution of a limited protein-ligand complex +dataset, structure-based methods do not yet outperform optimization-based +methods that generate binding molecules for just one pocket. To overcome +limitations on data while leveraging learning across protein targets, we choose +to model the reward distribution conditioned on pocket structure, instead of +the training data distribution. We design TacoGFN, a novel GFlowNet-based +approach for structure-based drug design, which can generate molecules +conditioned on any protein pocket structure with probabilities proportional to +its affinity and property rewards. In the generative setting for +CrossDocked2020 benchmark, TacoGFN attains a state-of-the-art success rate of +$56.0\%$ and $-8.44$ kcal/mol in median Vina Dock score while improving the +generation time by multiple orders of magnitude. Fine-tuning TacoGFN further +improves the median Vina Dock score to $-10.93$ kcal/mol and the success rate +to $88.8\%$, outperforming all optimization-based methods. + +
+
+ comment: Published in Transactions on Machine Learning Research (TMLR), + 09/2024 +
+
+
+
+
+ + ♻ ☆ NUMOSIM: A Synthetic Mobility Dataset with Anomaly Detection Benchmarks + + +
+ Collecting real-world mobility data is challenging. It is often fraught with +privacy concerns, logistical difficulties, and inherent biases. Moreover, +accurately annotating anomalies in large-scale data is nearly impossible, as it +demands meticulous effort to distinguish subtle and complex patterns. These +challenges significantly impede progress in geospatial anomaly detection +research by restricting access to reliable data and complicating the rigorous +evaluation, comparison, and benchmarking of methodologies. To address these +limitations, we introduce a synthetic mobility dataset, NUMOSIM, that provides +a controlled, ethical, and diverse environment for benchmarking anomaly +detection techniques. NUMOSIM simulates a wide array of realistic mobility +scenarios, encompassing both typical and anomalous behaviours, generated +through advanced deep learning models trained on real mobility data. This +approach allows NUMOSIM to accurately replicate the complexities of real-world +movement patterns while strategically injecting anomalies to challenge and +evaluate detection algorithms based on how effectively they capture the +interplay between demographic, geospatial, and temporal factors. Our goal is to +advance geospatial mobility analysis by offering a realistic benchmark for +improving anomaly detection and mobility modeling techniques. To support this, +we provide open access to the NUMOSIM dataset, along with comprehensive +documentation, evaluation metrics, and benchmark results. + +
+
+
+
+
+ + ♻ ☆ Are LLM-based methods good enough for detecting unfair terms of service? + + +
+ Countless terms of service (ToS) are being signed every day by users all
+over the world while interacting with all kinds of apps and websites. More
+often than not, these online contracts spanning double-digit pages are signed
+blindly by users who simply want immediate access to the desired service. What
+would normally require a consultation with a legal team has now become a
+mundane activity consisting of a few clicks where users potentially sign away
+their rights, for instance in terms of their data privacy, to countless online
+entities/companies. Large language models (LLMs) are good at parsing long
+text-based documents, and could potentially be adopted to help users when
+dealing with dubious clauses in ToS and their underlying privacy policies. To
+investigate the utility of existing models for this task, we first build a
+dataset consisting of 12 questions applied individually to a set of privacy
+policies crawled from popular websites. Thereafter, a series of open-source as
+well as commercial chatbots, such as ChatGPT, are queried on each question,
+with the answers being compared to a given ground truth. Our results show that
+some open-source models are able to provide a higher accuracy compared to some
+commercial models. However, the best performance is recorded from a commercial
+chatbot (ChatGPT4). Overall, all models perform only slightly better than
+random at this task. Consequently, their performance needs to be significantly
+improved before they can be adopted at large for this purpose.
+
+
+
+
+ + ♻ ☆ Universal randomised signatures for generative time series modelling + + +
+ Randomised signature has been proposed as a flexible and easily implementable +alternative to the well-established path signature. In this article, we employ +randomised signature to introduce a generative model for financial time series +data in the spirit of reservoir computing. Specifically, we propose a novel +Wasserstein-type distance based on discrete-time randomised signatures. This +metric on the space of probability measures captures the distance between +(conditional) distributions. Its use is justified by our novel universal +approximation results for randomised signatures on the space of continuous +functions taking the underlying path as an input. We then use our metric as the +loss function in a non-adversarial generator model for synthetic time series +data based on a reservoir neural stochastic differential equation. We compare +the results of our model to benchmarks from the existing literature. + +
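A minimal sketch of a discrete-time randomised signature, computed reservoir-style: fixed random affine maps, one per path coordinate, drive a nonlinear state update with the path increments. The dimensions, scaling, and tanh activation are illustrative assumptions, not the paper's exact construction or its Wasserstein-type loss.

```python
# Sketch: discrete-time randomised signature of a path, computed reservoir-style
# with fixed random projections (dimensions and scaling are illustrative).
import numpy as np

rng = np.random.default_rng(0)
path = np.cumsum(rng.standard_normal((100, 2)), axis=0)   # a 2-dimensional path
d_sig, d_path = 64, path.shape[1]

A = rng.standard_normal((d_path, d_sig, d_sig)) / np.sqrt(d_sig)   # one matrix per path coordinate
b = rng.standard_normal((d_path, d_sig))

def randomised_signature(path):
    s = np.zeros(d_sig)
    for t in range(1, len(path)):
        dx = path[t] - path[t - 1]
        # state update driven by the path increment through random affine maps + tanh
        s = s + sum(np.tanh(A[k] @ s + b[k]) * dx[k] for k in range(d_path))
    return s

sig = randomised_signature(path)
print(sig[:5].round(3))
```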
+
+ comment: 33 pages +
+
+
+
+
+ + ♻ ☆ The Faiss library + + +
+ Vector databases typically manage large collections of embedding vectors. +Currently, AI applications are growing rapidly, and so is the number of +embeddings that need to be stored and indexed. The Faiss library is dedicated +to vector similarity search, a core functionality of vector databases. Faiss is +a toolkit of indexing methods and related primitives used to search, cluster, +compress and transform vectors. This paper describes the trade-off space of +vector search and the design principles of Faiss in terms of structure, +approach to optimization and interfacing. We benchmark key features of the +library and discuss a few selected applications to highlight its broad +applicability. + +
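For context, a minimal Faiss usage example with the exact (flat) L2 index is shown below; the vectors are random placeholders. The library offers many other index types (e.g. IVF, HNSW, product quantization) that follow the same add/search pattern.

```python
# Minimal Faiss usage: build an exact (flat) L2 index, add vectors, run a k-NN search.
import numpy as np
import faiss

d = 64                                                  # embedding dimensionality
xb = np.random.random((10000, d)).astype("float32")     # database vectors (placeholder)
xq = np.random.random((5, d)).astype("float32")         # query vectors (placeholder)

index = faiss.IndexFlatL2(d)                            # exact-search baseline index
index.add(xb)                                           # index the database
distances, ids = index.search(xq, 4)                    # 4 nearest neighbours per query
print(ids)
```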
+
+
+
+
+ + ♻ ☆ Deep learning modelling of manufacturing and build variations on + multi-stage axial compressors aerodynamics + + +
+ Applications of deep learning to physical simulations such as Computational
+Fluid Dynamics have recently experienced a surge in interest, and their
+viability has been demonstrated in different domains. However, due to the
+highly complex, turbulent and three-dimensional flows, they have not yet been
+proven usable for turbomachinery applications. Multi-stage axial compressors
+for gas turbine applications represent a remarkably challenging case, due to
+the high-dimensionality of the regression of the flow-field from geometrical
+and operational variables. This paper demonstrates the development and
+application of a deep learning framework for predictions of the flow field and
+aerodynamic performance of multi-stage axial compressors. A physics-based
+dimensionality reduction unlocks the potential for flow-field predictions, as
+it re-formulates the regression problem from an un-structured to a structured
+one, as well as reducing the number of degrees of freedom. Compared to
+traditional "black-box" surrogate models, it provides explainability to the
+predictions of overall performance by identifying the corresponding
+aerodynamic drivers. This is applied to model the effect of manufacturing and
+build variations, as the associated performance scatter is known to have a
+significant impact on $CO_2$ emissions, therefore posing a challenge of great
+industrial and environmental relevance. The proposed architecture is proven to
+achieve an accuracy comparable to that of the CFD benchmark, in real-time, for
+an industrially relevant application. The deployed model is readily integrated
+within the manufacturing and build process of gas turbines, thus providing the
+opportunity to analytically assess the impact on performance with actionable
+and explainable data.
+
+
+
+
+ + ♻ ☆ Standing on the shoulders of giants + + +
+ Although fundamental to the advancement of Machine Learning, the classic +evaluation metrics extracted from the confusion matrix, such as precision and +F1, are limited. Such metrics only offer a quantitative view of the models' +performance, without considering the complexity of the data or the quality of +the hit. To overcome these limitations, recent research has introduced the use +of psychometric metrics such as Item Response Theory (IRT), which allows an +assessment at the level of latent characteristics of instances. This work +investigates how IRT concepts can enrich a confusion matrix in order to +identify which model is the most appropriate among options with similar +performance. In the study carried out, IRT does not replace, but complements +classical metrics by offering a new layer of evaluation and observation of the +fine behavior of models in specific instances. It was also observed that there +is 97% confidence that the score from the IRT has different contributions from +66% of the classical metrics analyzed. + +
+
+ comment: 15 pages, 8 figures, 3 tables, submitted for the BRACIS'24 conference +
+
+
+
+
+ + ♻ ☆ Extension of Recurrent Kernels to different Reservoir Computing + topologies + + +
+ Reservoir Computing (RC) has become popular in recent years due to its fast
+and efficient computational capabilities. Standard RC has been shown to be
+equivalent in the asymptotic limit to Recurrent Kernels, which helps in
+analyzing its expressive power. However, many well-established RC paradigms,
+such as Leaky RC, Sparse RC, and Deep RC, are yet to be analyzed in such a
+way. This study aims to fill this gap by providing an empirical analysis of
+the equivalence of specific RC architectures with their corresponding
+Recurrent Kernel formulation. We conduct a convergence study by varying the
+activation function implemented in each architecture. Our study also sheds
+light on the role of sparse connections in RC architectures and proposes an
+optimal sparsity level that depends on the reservoir size. Furthermore, our
+systematic analysis shows that in Deep RC models, convergence is better
+achieved with successive reservoirs of decreasing sizes.
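For reference, the sketch below implements a leaky, sparse echo-state reservoir update, i.e. the kind of finite-size architecture whose Recurrent Kernel limit the study analyses. Reservoir size, leak rate, sparsity level, and spectral-radius scaling are illustrative assumptions.

```python
# Sketch: leaky, sparse echo-state (reservoir) state update, the finite-size
# counterpart of a recurrent kernel; all hyperparameters are illustrative.
import numpy as np

rng = np.random.default_rng(0)
n_res, n_in, leak, sparsity = 300, 1, 0.3, 0.9

W = rng.standard_normal((n_res, n_res))
W[rng.random((n_res, n_res)) < sparsity] = 0.0              # sparse reservoir connections
W *= 0.9 / np.max(np.abs(np.linalg.eigvals(W)))             # scale spectral radius below 1
W_in = rng.standard_normal((n_res, n_in))

def run_reservoir(inputs):
    x = np.zeros(n_res)
    states = []
    for u in inputs:
        pre = W @ x + W_in @ np.atleast_1d(u)
        x = (1 - leak) * x + leak * np.tanh(pre)            # leaky state update
        states.append(x.copy())
    return np.array(states)

states = run_reservoir(np.sin(np.linspace(0, 8 * np.pi, 200)))
print(states.shape)
```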
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Integer-only Quantized Transformers for Embedded FPGA-based Time-series + Forecasting in AIoT + + +
+ This paper presents the design of a hardware accelerator for Transformers, +optimized for on-device time-series forecasting in AIoT systems. It integrates +integer-only quantization and Quantization-Aware Training with optimized +hardware designs to realize 6-bit and 4-bit quantized Transformer models, which +achieved precision comparable to 8-bit quantized models from related research. +Utilizing a complete implementation on an embedded FPGA (Xilinx Spartan-7 +XC7S15), we examine the feasibility of deploying Transformer models on embedded +IoT devices. This includes a thorough analysis of achievable precision, +resource utilization, timing, power, and energy consumption for on-device +inference. Our results indicate that while sufficient performance can be +attained, the optimization process is not trivial. For instance, reducing the +quantization bitwidth does not consistently result in decreased latency or +energy consumption, underscoring the necessity of systematically exploring +various optimization combinations. Compared to an 8-bit quantized Transformer +model in related studies, our 4-bit quantized Transformer model increases test +loss by only 0.63%, operates up to 132.33x faster, and consumes 48.19x less +energy. + +
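To illustrate the basic operation being varied (the bit-width of uniform quantization), the sketch below symmetrically quantizes and dequantizes a weight vector at 8, 6, and 4 bits and reports the reconstruction error. It is a generic scheme for illustration, not the paper's integer-only Transformer pipeline or its FPGA mapping.

```python
# Sketch: symmetric uniform quantization to a configurable bit-width and the
# corresponding dequantization (generic illustration, not the paper's scheme).
import numpy as np

def quantize(x, bits):
    qmax = 2 ** (bits - 1) - 1                 # e.g. 7 for 4-bit, 127 for 8-bit signed
    scale = np.max(np.abs(x)) / qmax
    q = np.clip(np.round(x / scale), -qmax - 1, qmax).astype(np.int8)
    return q, scale

def dequantize(q, scale):
    return q.astype(np.float32) * scale

rng = np.random.default_rng(0)
w = rng.standard_normal(1024).astype(np.float32)
for bits in (8, 6, 4):
    q, scale = quantize(w, bits)
    err = np.mean((w - dequantize(q, scale)) ** 2)
    print(f"{bits}-bit quantization MSE: {err:.6f}")
```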
+
+ comment: 7 pages, 3 figures, 4 tables. The paper was accepted by 2024 IEEE + Annual Congress on Artificial Intelligence of Things (IEEE AIoT) and got best + paper award +
+
+
+
+
+ + ♻ ☆ A Black-Box Physics-Informed Estimator based on Gaussian Process + Regression for Robot Inverse Dynamics Identification + + +
+ Learning the inverse dynamics of robots directly from data, adopting a
+black-box approach, is interesting for several real-world scenarios where
+limited knowledge about the system is available. In this paper, we propose a
+black-box model based on Gaussian Process (GP) Regression for the
+identification of the inverse dynamics of robotic manipulators. The proposed
+model relies on a novel multidimensional kernel, called the \textit{Lagrangian
+Inspired Polynomial} (LIP) kernel. The LIP kernel is based on two main ideas.
+First, instead of directly modeling the inverse dynamics components, we model
+the kinetic and potential energy of the system as GPs. The GP prior on the
+inverse dynamics components is derived from those on the energies by applying
+the properties of GPs under linear operators. Second, as regards the energy
+prior definition, we prove a polynomial structure of the kinetic and potential
+energy, and we derive a polynomial kernel that encodes this property. As a
+consequence, the proposed model also allows estimating the kinetic and
+potential energy without requiring any label on these quantities. Results in
+simulation and on two real robotic manipulators, namely a 7 DOF Franka Emika
+Panda and a 6 DOF MELFA RV4FL, show that the proposed model outperforms
+state-of-the-art black-box estimators based both on Gaussian Processes and
+Neural Networks in terms of accuracy, generality and data efficiency. The
+experiments on the MELFA robot also demonstrate that our approach achieves
+performance comparable to fine-tuned model-based estimators, despite requiring
+less prior information.
+
+
+
+
+ + ♻ ☆ RSF-Conv: Rotation-and-Scale Equivariant Fourier Parameterized + Convolution for Retinal Vessel Segmentation + + +
+ Retinal vessel segmentation is of great clinical significance for the +diagnosis of many eye-related diseases, but it is still a formidable challenge +due to the intricate vascular morphology. With the skillful characterization of +the translation symmetry existing in retinal vessels, convolutional neural +networks (CNNs) have achieved great success in retinal vessel segmentation. +However, the rotation-and-scale symmetry, as a more widespread image prior in +retinal vessels, fails to be characterized by CNNs. Therefore, we propose a +rotation-and-scale equivariant Fourier parameterized convolution (RSF-Conv) +specifically for retinal vessel segmentation, and provide the corresponding +equivariance analysis. As a general module, RSF-Conv can be integrated into +existing networks in a plug-and-play manner while significantly reducing the +number of parameters. For instance, we replace the traditional convolution +filters in U-Net and Iter-Net with RSF-Convs, and faithfully conduct +comprehensive experiments. RSF-Conv+U-Net and RSF-Conv+Iter-Net not only have +slight advantages under in-domain evaluation, but more importantly, outperform +all comparison methods by a significant margin under out-of-domain evaluation. +It indicates the remarkable generalization of RSF-Conv, which holds greater +practical clinical significance for the prevalent cross-device and +cross-hospital challenges in clinical practice. To comprehensively demonstrate +the effectiveness of RSF-Conv, we also apply RSF-Conv+U-Net and +RSF-Conv+Iter-Net to retinal artery/vein classification and achieve promising +performance as well, indicating its clinical application potential. + +
+
+
+
+
+ + ♻ ☆ Probabilistic Matching of Real and Generated Data Statistics in + Generative Adversarial Networks + + +
+ Generative adversarial networks constitute a powerful approach to generative +modeling. While generated samples often are indistinguishable from real data, +there is no guarantee that they will follow the true data distribution. For +scientific applications in particular, it is essential that the true +distribution is well captured by the generated distribution. In this work, we +propose a method to ensure that the distributions of certain generated data +statistics coincide with the respective distributions of the real data. In +order to achieve this, we add a new loss term to the generator loss function, +which quantifies the difference between these distributions via suitable +f-divergences. Kernel density estimation is employed to obtain representations +of the true distributions, and to estimate the corresponding generated +distributions from minibatch values at each iteration. When compared to other +methods, our approach has the advantage that the complete shapes of the +distributions are taken into account. We evaluate the method on a synthetic +dataset and a real-world dataset and demonstrate improved performance of our +approach. + +
+
+
+
+
+ + ♻ ☆ On The Expressivity of Recurrent Neural Cascades AAAI 2024 + + +
+ Recurrent Neural Cascades (RNCs) are the recurrent neural networks with no +cyclic dependencies among recurrent neurons. This class of recurrent networks +has received a lot of attention in practice. Besides training methods for a +fixed architecture such as backpropagation, the cascade architecture naturally +allows for constructive learning methods, where recurrent nodes are added +incrementally one at a time, often yielding smaller networks. Furthermore, +acyclicity amounts to a structural prior that even for the same number of +neurons yields a more favourable sample complexity compared to a +fully-connected architecture. A central question is whether the advantages of +the cascade architecture come at the cost of a reduced expressivity. We provide +new insights into this question. We show that the regular languages captured by +RNCs with sign and tanh activation with positive recurrent weights are the +star-free regular languages. In order to establish our results we developed a +novel framework where capabilities of RNCs are accessed by analysing which +semigroups and groups a single neuron is able to implement. A notable +implication of our framework is that RNCs can achieve the expressivity of all +regular languages by introducing neurons that can implement groups. + +
+
+ comment: Full version with appendix of a paper with the same title that + appears in the proceedings of AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Video alignment using unsupervised learning of local and global features + + +
+ In this paper, we tackle the problem of video alignment, the process of
+matching the frames of a pair of videos containing similar actions. The main
+challenge in video alignment is that accurate correspondence should be
+established despite the differences in the execution processes and appearances
+between the two videos. We introduce an unsupervised method for alignment that
+uses global and local features of the frames. In particular, we introduce
+effective features for each video frame by means of three machine vision
+tools: person detection, pose estimation, and a VGG network. The features are
+then processed and combined to construct a multidimensional time series that
+represents the video. The resulting time series are used to align videos of
+the same actions using a novel version of dynamic time warping named
+Diagonalized Dynamic Time Warping (DDTW). The main advantage of our approach
+is that no training is required, which makes it applicable to any new type of
+action without any need to collect training samples for it. Additionally, our
+approach can be used for framewise labeling of action phases in a dataset with
+only a few labeled videos. For evaluation, we consider video synchronization
+and phase classification tasks on the Penn Action dataset and a subset of the
+UCF101 dataset. Also, for an effective evaluation of the video synchronization
+task, we present a new metric called Enclosed Area Error (EAE). The results
+show that our method outperforms previous state-of-the-art methods, such as
+TCC, and other self-supervised and weakly supervised methods.
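For readers unfamiliar with the alignment step, the sketch below implements the classic dynamic time warping recursion between two frame-feature sequences. The paper's Diagonalized DTW is a modified variant not reproduced here, and the feature sequences are synthetic placeholders.

```python
# Sketch: classic dynamic time warping between two frame-feature sequences
# (the paper's DDTW is a modified variant; this shows only the base recursion).
import numpy as np

def dtw_cost(a, b):
    n, m = len(a), len(b)
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = np.linalg.norm(a[i - 1] - b[j - 1])
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return D[n, m]

rng = np.random.default_rng(0)
video_a = rng.standard_normal((40, 16))         # 40 frames x 16-dim features (synthetic)
video_b = video_a[::2] + 0.05 * rng.standard_normal((20, 16))   # faster execution of the same action
print("DTW alignment cost:", round(float(dtw_cost(video_a, video_b)), 2))
```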
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ When big data actually are low-rank, or entrywise approximation of + certain function-generated matrices + + +
+ The article concerns low-rank approximation of matrices generated by sampling +a smooth function of two $m$-dimensional variables. We refute an argument made +in the literature to prove that, for a specific class of analytic functions, +such matrices admit accurate entrywise approximation of rank that is +independent of $m$ -- a claim known as "big-data matrices are approximately +low-rank". We provide a theoretical explanation of the numerical results +presented in support of this claim, describing three narrower classes of +functions for which $n \times n$ function-generated matrices can be +approximated within an entrywise error of order $\varepsilon$ with rank +$\mathcal{O}(\log(n) \varepsilon^{-2} \mathrm{polylog}(\varepsilon^{-1}))$ that +is independent of the dimension $m$: (i) functions of the inner product of the +two variables, (ii) functions of the Euclidean distance between the variables, +and (iii) shift-invariant positive-definite kernels. We extend our argument to +tensor-train approximation of tensors generated with functions of the +multi-linear product of their $m$-dimensional variables. We discuss our results +in the context of low-rank approximation of (a) growing datasets and (b) +attention in transformer neural networks. + +
+
+ comment: Extended Sections 1 and 2 +
+
+
+
+
+ + ♻ ☆ Uncertainty Modeling in Graph Neural Networks via Stochastic + Differential Equations + + +
+ We address the problem of learning uncertainty-aware representations for +graph-structured data. While Graph Neural Ordinary Differential Equations +(GNODE) are effective in learning node representations, they fail to quantify +uncertainty. To address this, we introduce Latent Graph Neural Stochastic +Differential Equations (LGNSDE), which enhance GNODE by embedding randomness +through Brownian motion to quantify uncertainty. We provide theoretical +guarantees for LGNSDE and empirically show better performance in uncertainty +quantification. + +
+
+ comment: 9 pages including appendix +
+
+
+
+
+ + ♻ ☆ Unveiling the Unborn: Advancing Fetal Health Classification through + Machine Learning + + +
+ Fetal health classification is a critical task in obstetrics, enabling early +identification and management of potential health problems. However, it remains +challenging due to data complexity and limited labeled samples. This research +paper presents a novel machine-learning approach for fetal health +classification, leveraging a LightGBM classifier trained on a comprehensive +dataset. The proposed model achieves an impressive accuracy of 98.31% on a test +set. Our findings demonstrate the potential of machine learning in enhancing +fetal health classification, offering a more objective and accurate assessment. +Notably, our approach combines various features, such as fetal heart rate, +uterine contractions, and maternal blood pressure, to provide a comprehensive +evaluation. This methodology holds promise for improving early detection and +treatment of fetal health issues, ensuring better outcomes for both mothers and +babies. Beyond the high accuracy achieved, the novelty of our approach lies in +its comprehensive feature selection and assessment methodology. By +incorporating multiple data points, our model offers a more holistic and +reliable evaluation compared to traditional methods. This research has +significant implications in the field of obstetrics, paving the way for +advancements in early detection and intervention of fetal health concerns. +Future work involves validating the model on a larger dataset and developing a +clinical application. Ultimately, we anticipate that our research will +revolutionize the assessment and management of fetal health, contributing to +improved healthcare outcomes for expectant mothers and their babies. + +
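A minimal sketch of the modelling setup described above: train a LightGBM classifier and report test accuracy. The synthetic data, feature count, and hyperparameters are placeholders standing in for the fetal-monitoring features and the 98.31% accuracy reported in the paper.

```python
# Sketch: LightGBM multi-class classifier on synthetic stand-in data
# (dataset, split, and score are placeholders, not the paper's results).
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=2000, n_features=21, n_informative=10,
                           n_classes=3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = LGBMClassifier(n_estimators=300, learning_rate=0.05, random_state=0)
model.fit(X_train, y_train)
print("test accuracy:", round(accuracy_score(y_test, model.predict(X_test)), 4))
```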
+
+
+
+
+ + ♻ ☆ Scalable mixed-domain Gaussian process modeling and model reduction for + longitudinal data + + +
+ Gaussian process (GP) models that combine both categorical and continuous
+input variables have found use in longitudinal data analysis and computer
+experiments. However, standard inference for these models has the typical
+cubic scaling, and common scalable approximation schemes for GPs cannot be
+applied since the covariance function is non-continuous. In this work, we
+derive a basis function approximation scheme for mixed-domain covariance
+functions, which scales linearly with respect to the number of observations
+and total number of basis functions. The proposed approach is also naturally
+applicable to Bayesian GP regression with discrete observation models. We
+demonstrate the scalability of the approach and compare model reduction
+techniques for additive GP models in a longitudinal data context. We confirm
+that we can approximate the exact GP model accurately in a fraction of the
+runtime compared to fitting the corresponding exact model. In addition, we
+demonstrate a scalable model reduction workflow for obtaining smaller and more
+interpretable models when dealing with a large number of candidate predictors.
+
+
+
+
+ + ♻ ☆ Partial Rankings of Optimizers + + +
+ We introduce a framework for benchmarking optimizers according to multiple
+criteria over various test functions. Based on a recently introduced
+union-free generic depth function for partial orders/rankings, it fully
+exploits the ordinal information and allows for incomparability. Our method
+describes the distribution of all partial orders/rankings, avoiding the
+notorious shortcomings of aggregation. This makes it possible to identify test
+functions that produce central or outlying rankings of optimizers and to
+assess the quality of benchmarking suites.
+
+
+
+
+ + ♻ ☆ Oops, I Sampled it Again: Reinterpreting Confidence Intervals in + Few-Shot Learning + + +
+ The predominant method for computing confidence intervals (CI) in few-shot +learning (FSL) is based on sampling the tasks with replacement, i.e.\ allowing +the same samples to appear in multiple tasks. This makes the CI misleading in +that it takes into account the randomness of the sampler but not the data +itself. To quantify the extent of this problem, we conduct a comparative +analysis between CIs computed with and without replacement. These reveal a +notable underestimation by the predominant method. This observation calls for a +reevaluation of how we interpret confidence intervals and the resulting +conclusions in FSL comparative studies. Our research demonstrates that the use +of paired tests can partially address this issue. Additionally, we explore +methods to further reduce the (size of the) CI by strategically sampling tasks +of a specific size. We also introduce a new optimized benchmark, which can be +accessed at https://github.com/RafLaf/FSL-benchmark-again + +
+
+
+
+
+ + ♻ ☆ QET: Enhancing Quantized LLM Parameters and KV cache Compression through + Element Substitution and Residual Clustering + + +
+ Matrix quantization entails representing matrix elements in a more
+space-efficient form to reduce storage usage, with dequantization restoring
+the original matrix for use. We formulate the Quantization Error Minimization
+(QEM) problem as minimizing the distance between a matrix before and after
+quantization, under the condition that the quantized matrix occupies the same
+memory space. Matrix quantization is crucial in various applications,
+including Large Language Models (LLMs) weight quantization, vector databases,
+KV cache quantization, graph compression, and image compression. Recent
+advancements in LLMs, such as GPT-4 and BERT, have highlighted the importance
+of matrix compression due to the large size of parameters and KV cache, which
+are stored as matrices.
+ We propose Quantum Entanglement Trees (QET) to address the QEM problem by
+leveraging the local orderliness of matrix elements, involving iterative
+element swapping to form a locally ordered matrix. This matrix is then grouped
+and quantized by columns. To enhance QET, we introduce two optimizations:
+further quantizing residuals to reduce MSE, and using masking and batch
+processing to accelerate the algorithm.
+ Experimental results demonstrate that QET can effectively reduce MSE to
+5.05%, 13.33%, and 11.89% of the current best method on the LLM dataset, K
+cache, and V cache, respectively. Our contributions include the abstraction of
+the QEM problem, the design of the QET algorithm, and the proposal of two
+optimizations to improve accuracy and speed.
+
+
+
+
+ + ♻ ☆ Large-scale Urban Facility Location Selection with Knowledge-informed + Reinforcement Learning + + +
+ The facility location problem (FLP) is a classical combinatorial optimization +challenge aimed at strategically laying out facilities to maximize their +accessibility. In this paper, we propose a reinforcement learning method +tailored to solve large-scale urban FLP, capable of producing near-optimal +solutions at superfast inference speed. We distill the essential swap operation +from local search, and simulate it by intelligently selecting edges on a graph +of urban regions, guided by a knowledge-informed graph neural network, thus +sidestepping the need for heavy computation of local search. Extensive +experiments on four US cities with different geospatial conditions demonstrate +that our approach can achieve comparable performance to commercial solvers with +less than 5\% accessibility loss, while displaying up to 1000 times speedup. We +deploy our model as an online geospatial application at +https://huggingface.co/spaces/randommmm/MFLP. + +
+
+ comment: Sigspatial2024 +
+
+
+
+
+ + ♻ ☆ PAGE: Parametric Generative Explainer for Graph Neural Network + + +
+ This article introduces PAGE, a parameterized generative interpretive
+framework. PAGE is capable of providing faithful explanations for any graph
+neural network without necessitating prior knowledge or internal details.
+Specifically, we train the auto-encoder to generate explanatory substructures
+by designing an appropriate training strategy. Due to the dimensionality
+reduction of features in the latent space of the auto-encoder, it becomes
+easier to extract causal features leading to the model's output, which can be
+easily employed to generate explanations. To accomplish this, we introduce an
+additional discriminator to capture the causality between latent causal
+features and the model's output. By designing appropriate optimization
+objectives, the well-trained discriminator can be employed to constrain the
+encoder in generating enhanced causal features. Finally, these features are
+mapped to substructures of the input graph through the decoder to serve as
+explanations. Compared to existing methods, PAGE operates at the sample scale
+rather than on nodes or edges, eliminating the need for the perturbation or
+encoding processes seen in previous methods. Experimental results on both
+artificially synthesized and real-world datasets demonstrate that our approach
+not only exhibits the highest faithfulness and accuracy but also significantly
+outperforms baseline models in terms of efficiency.
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: Accepted at Journal of Machine Learning Research. This paper + integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete + story. In this paper, we improve the writing and organization, and also add + conceptual, empirical, and theoretical improvements over the previous work. + V2: small typo fixes/formatting improvements. V3: improvements from journal + revisions. V4: fix figures +
+
+
+
+
+ + ♻ ☆ A unified law of robustness for Bregman divergence losses + + +
+ In contemporary deep learning practice, models are often trained to near-zero +loss, i.e., to nearly interpolate the training data. However, the number of +parameters in the model is usually far more than the number of data points $n$, +the theoretical minimum needed for interpolation: a phenomenon referred to as +overparameterization. In an interesting piece of work that contributes to the +considerable research that has been devoted to understanding overparameterization, +Bubeck and Sellke showed that for a broad class of covariate distributions +(specifically those satisfying a natural notion of concentration of measure), +overparameterization is necessary for robust interpolation, i.e., if the +interpolating function is required to be Lipschitz. However, their robustness +results were proved only in the setting of regression with square loss. In +practice, however, many other kinds of losses are used, e.g., cross-entropy loss +for classification. In this work, we generalize Bubeck and Sellke's result to +Bregman divergence losses, which form a common generalization of square loss +and cross-entropy loss. Our generalization relies on identifying a +bias-variance type decomposition that lies at the heart of the proof of Bubeck +and Sellke. +
+
+ comment: 18 pages; fixed a typo in a citation +
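+
+ For reference, the Bregman divergence generated by a strictly convex, differentiable
+ function $\phi$ is the standard definition (background, not a contribution of the paper
+ above):
+
+     \[ D_\phi(x, y) \;=\; \phi(x) - \phi(y) - \langle \nabla \phi(y),\, x - y \rangle . \]
+
+ Choosing $\phi(x) = \|x\|^2$ recovers the squared loss $\|x - y\|^2$, while choosing the
+ negative entropy $\phi(x) = \sum_i x_i \log x_i$ on the probability simplex yields the KL
+ divergence, whose minimization against a one-hot target is the cross-entropy loss; this
+ is the sense in which Bregman losses unify the two settings discussed in the abstract.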
+
+
+
+
+ + ♻ ☆ Certifiable Black-Box Attacks with Randomized Adversarial Examples: + Breaking Defenses with Provable Confidence CCS 2024 + + +
+ Black-box adversarial attacks have demonstrated strong potential to +compromise machine learning models by iteratively querying the target model or +leveraging transferability from a local surrogate model. Recently, such attacks +have been effectively mitigated by state-of-the-art (SOTA) defenses, e.g., +detection via the pattern of sequential queries, or injecting noise into the +model. To the best of our knowledge, we take the first step to study a new paradigm of +black-box attacks with provable guarantees -- certifiable black-box attacks +that can guarantee the attack success probability (ASP) of adversarial examples +before querying over the target model. This new black-box attack unveils +significant vulnerabilities of machine learning models, compared to traditional +empirical black-box attacks, e.g., breaking strong SOTA defenses with provable +confidence, constructing a space of (infinitely many) adversarial examples with high +ASP, and theoretically guaranteeing the ASP of the generated adversarial examples +without verification/queries over the target model. Specifically, we +establish a novel theoretical foundation for ensuring the ASP of the black-box +attack with randomized adversarial examples (AEs). Then, we propose several +novel techniques to craft the randomized AEs while reducing the perturbation +size for better imperceptibility. Finally, we have comprehensively evaluated +the certifiable black-box attacks on the CIFAR10/100, ImageNet, and LibriSpeech +datasets, while benchmarking with 16 SOTA black-box attacks, against various +SOTA defenses in the domains of computer vision and speech recognition. Both +theoretical and experimental results have validated the significance of the +proposed attack. The code and all the benchmarks are available at +\url{https://github.com/datasec-lab/CertifiedAttack}. +
+
+ comment: accepted by ACM CCS 2024 +
+
+
+
+
+ + ♻ ☆ Rethinking Molecular Design: Integrating Latent Variable and + Auto-Regressive Models for Goal Directed Generation + + +
+ De novo molecule design has become a highly active research area, advanced +significantly through the use of state-of-the-art generative models. Despite +these advances, several fundamental questions remain unanswered as the field +increasingly focuses on more complex generative models and sophisticated +molecular representations as an answer to the challenges of drug design. In +this paper, we return to the simplest representation of molecules and +investigate overlooked limitations of classical generative approaches, +particularly Variational Autoencoders (VAEs) and auto-regressive models. We +propose a hybrid model in the form of a novel regularizer that leverages the +strengths of both to improve validity, conditional generation, and style +transfer of molecular sequences. Additionally, we provide an in-depth +discussion of overlooked assumptions about these models' behaviour. +
+
+
+
+
+ + ♻ ☆ Contrastive Graph Pooling for Explainable Classification of Brain + Networks + + +
+ Functional magnetic resonance imaging (fMRI) is a commonly used technique to +measure neural activation. Its application has been particularly important in +identifying underlying neurodegenerative conditions such as Parkinson's, +Alzheimer's, and Autism. Recent analysis of fMRI data models the brain as a +graph and extracts features by graph neural networks (GNNs). However, the +unique characteristics of fMRI data require a special design of GNN. Tailoring +GNN to generate effective and domain-explainable features remains challenging. +In this paper, we propose a contrastive dual-attention block and a +differentiable graph pooling method called ContrastPool to better utilize GNN +for brain networks, meeting fMRI-specific requirements. We apply our method to +5 resting-state fMRI brain network datasets of 3 diseases and demonstrate its +superiority over state-of-the-art baselines. Our case study confirms that the +patterns extracted by our method match the domain knowledge in neuroscience +literature, and disclose direct and interesting insights. Our contributions +underscore the potential of ContrastPool for advancing the understanding of +brain networks and neurodegenerative conditions. The source code is available +at https://github.com/AngusMonroe/ContrastPool. + +
+
+
+
+
+ + ♻ ☆ PowerFlowMultiNet: Multigraph Neural Networks for Unbalanced Three-Phase + Distribution Systems + + +
+ Efficiently solving unbalanced three-phase power flow in distribution grids +is pivotal for grid analysis and simulation. There is a pressing need for +scalable algorithms capable of handling large-scale unbalanced power grids that +can provide accurate and fast solutions. To address this, deep learning +techniques, especially Graph Neural Networks (GNNs), have emerged. However, +existing literature primarily focuses on balanced networks, leaving a critical +gap in supporting unbalanced three-phase power grids. This letter introduces +PowerFlowMultiNet, a novel multigraph GNN framework explicitly designed for +unbalanced three-phase power grids. The proposed approach models each phase +separately in a multigraph representation, effectively capturing the inherent +asymmetry in unbalanced grids. A graph embedding mechanism utilizing message +passing is introduced to capture spatial dependencies within the power system +network. PowerFlowMultiNet outperforms traditional methods and other deep +learning approaches in terms of accuracy and computational speed. Rigorous +testing reveals significantly lower error rates and a notable hundredfold +increase in computational speed for large power networks compared to +model-based methods. + +
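+
+ As a toy illustration of the per-phase multigraph representation described above (the
+ graph construction only; the message-passing embedding of PowerFlowMultiNet is not
+ shown, and bus/phase names are illustrative), each phase of an unbalanced feeder can be
+ kept as its own graph over a shared set of buses:
+
+     import networkx as nx
+
+     # toy 4-bus feeder; each line section carries only a subset of the phases a/b/c
+     lines = [("bus1", "bus2", {"a", "b", "c"}),
+              ("bus2", "bus3", {"a", "b"}),
+              ("bus2", "bus4", {"c"})]
+
+     # one graph per phase over the shared bus set: the multigraph representation
+     phase_graphs = {phase: nx.Graph() for phase in "abc"}
+     for u, v, phases in lines:
+         for phase in phases:
+             phase_graphs[phase].add_edge(u, v)
+
+     for phase, g in phase_graphs.items():
+         print(phase, sorted(g.edges()))  # per-phase topology fed to a per-phase GNN branch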
+
+
+
+
+ + ♻ ☆ Invariant kernels on Riemannian symmetric spaces: a harmonic-analytic + approach + + +
+ This work aims to prove that the classical Gaussian kernel, when defined on a +non-Euclidean symmetric space, is never positive-definite for any choice of +parameter. To achieve this goal, the paper develops new geometric and +analytical arguments. These provide a rigorous characterization of the +positive-definiteness of the Gaussian kernel, which is complete but for a +limited number of scenarios in low dimensions that are treated by numerical +computations. Chief among these results are the $L^p$-Godement theorems (where $p = 1,2$), which provide +verifiable necessary and sufficient conditions for a kernel defined on a +symmetric space of non-compact type to be positive-definite. A celebrated +theorem, sometimes called the Bochner-Godement theorem, already gives such +conditions and is far more general in its scope, but is especially hard to +apply. Beyond the connection with the Gaussian kernel, the new results in this +work lay out a blueprint for the study of invariant kernels on symmetric +spaces, bringing forth specific harmonic analysis tools that suggest many +future applications. +
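+
+ For context, the Gaussian kernel referred to above is usually written on a metric space
+ $(M, d)$ as
+
+     \[ k_\sigma(x, y) \;=\; \exp\!\left( -\frac{d(x, y)^2}{2\sigma^2} \right), \qquad \sigma > 0, \]
+
+ with $d$ the Riemannian (geodesic) distance; positive-definiteness means that every Gram
+ matrix $[\,k_\sigma(x_i, x_j)\,]_{i,j}$ is positive semi-definite. The claim of the paper
+ is that no choice of $\sigma$ achieves this when $M$ is a non-Euclidean symmetric space.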
+
+
+
+
+ + ♻ ☆ Locally Convex Global Loss Network for Decision-Focused Learning + + +
+ In decision-making problems under uncertainty, predicting unknown parameters +is often considered independent of the optimization part. Decision-Focused +Learning (DFL) is a task-oriented framework that integrates prediction and +optimization by adapting the predictive model to give better decisions for the +corresponding task. Here, an inevitable challenge arises when computing +gradients of the optimal decision with respect to the parameters. Existing +research copes with this issue by smoothly reformulating the surrogate optimization or +constructing surrogate loss functions that mimic the task loss. However, these approaches +apply only to restricted optimization domains. In this paper, we propose the Locally +Convex Global Loss Network (LCGLN), a global surrogate loss model that can be +implemented in a general DFL paradigm. LCGLN learns the task loss via a partially input-convex +neural network, which is guaranteed to be convex in chosen inputs while +keeping a non-convex global structure for the other inputs. This enables +LCGLN to admit general DFL through a single surrogate loss without any +need to choose appropriate parametric forms. We confirm the effectiveness and +flexibility of LCGLN by evaluating our proposed model on three stochastic +decision-making problems. +
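+
+ A minimal sketch of the kind of partially input-convex network the abstract refers to,
+ following the generic ICNN recipe of non-negative weights and convex, non-decreasing
+ activations on the convex path (this is not the authors' LCGLN; all names and sizes are
+ illustrative):
+
+     import torch
+     import torch.nn as nn
+     import torch.nn.functional as F
+
+     class PartialICNN(nn.Module):
+         """Output is convex in y (the 'chosen' input) and unconstrained in x."""
+         def __init__(self, x_dim, y_dim, hidden=64):
+             super().__init__()
+             self.x_path = nn.Linear(x_dim, hidden)      # context path, no convexity constraint
+             self.Wy0 = nn.Linear(y_dim, hidden)         # first layer acting on the convex input
+             self.Wz = nn.Linear(hidden, 1, bias=False)  # weights on z must stay non-negative
+             self.Wy1 = nn.Linear(y_dim, 1)              # affine skip connection from y
+
+         def forward(self, x, y):
+             z = F.softplus(self.Wy0(y) + self.x_path(x))  # convex, non-decreasing activation
+             Wz_pos = self.Wz.weight.clamp(min=0.0)        # enforce non-negativity
+             return F.linear(z, Wz_pos) + self.Wy1(y)      # convex in y for any fixed x
+
+     x, y = torch.randn(8, 10), torch.randn(8, 4)
+     print(PartialICNN(10, 4)(x, y).shape)  # torch.Size([8, 1])
+
+ Convexity in y holds because softplus is convex and non-decreasing, its argument is
+ affine in y, and the hidden units are combined with non-negative weights plus an affine
+ skip term, while the x path can remain arbitrarily non-convex.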
+
+
+
+
+ + ♻ ☆ Minimax Optimal Algorithms with Fixed-$k$-Nearest Neighbors + + +
+ This paper shows how to perform minimax optimal classification, +regression, and density estimation based on fixed-$k$ nearest neighbor (NN) +searches. We consider a distributed learning scenario, in which a massive +dataset is split into smaller groups, where the $k$-NNs are found for a query +point with respect to each subset of data. We propose \emph{optimal} rules to +aggregate the fixed-$k$-NN information for classification, regression, and +density estimation that achieve minimax optimal rates for the respective +problems. We show that the distributed algorithm with a fixed $k$ over a +sufficiently large number of groups attains a minimax optimal error rate up to +a multiplicative logarithmic factor under some regularity conditions. Roughly +speaking, distributed $k$-NN rules with $M$ groups have performance comparable +to standard $\Theta(kM)$-NN rules even for fixed $k$. +
+
+ comment: 65 pages, 5 figures. The manuscript has been revised from scratch + compared to the previous version. Notable differences include (1) updated + statements and corrected proofs for classification and regression, (2) + explicit statements and proofs for distance-selective rules, and (3) new + analogous estimators for density estimation +
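+
+ As a toy sketch of the aggregation idea in the abstract above (illustrative only; the
+ paper's optimal rules and rate analysis are not reproduced here), one can split the data
+ into $M$ groups, run a fixed-$k$ NN regressor in each group, and average the group-wise
+ estimates:
+
+     import numpy as np
+
+     def distributed_knn_regress(X, y, query, k=5, M=10, seed=0):
+         """Average fixed-k nearest-neighbour estimates over M random splits of the data."""
+         rng = np.random.default_rng(seed)
+         idx = rng.permutation(len(X))
+         estimates = []
+         for group in np.array_split(idx, M):
+             d = np.linalg.norm(X[group] - query, axis=1)  # distances within this group
+             nn = group[np.argsort(d)[:k]]                 # indices of the k nearest points
+             estimates.append(y[nn].mean())                # group-wise k-NN estimate
+         return float(np.mean(estimates))                  # simple average across groups
+
+     X = np.random.randn(2000, 3)
+     y = X.sum(axis=1) + 0.1 * np.random.randn(2000)
+     print(distributed_knn_regress(X, y, query=np.zeros(3)))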
+
+
+
+
+ + ♻ ☆ Innovative Speech-Based Deep Learning Approaches for Parkinson's Disease + Classification: A Systematic Review + + +
+ Parkinson's disease (PD), the second most prevalent neurodegenerative +disorder worldwide, frequently presents with early-stage speech impairments. +Recent advancements in Artificial Intelligence (AI), particularly deep learning +(DL), have significantly enhanced PD diagnosis through the analysis of speech +data. Nevertheless, the progress of research is restricted by the limited +availability of publicly accessible speech-based PD datasets, primarily due to +privacy concerns. The goal of this systematic review is to explore the current +landscape of speech-based DL approaches for PD classification, based on 33 +scientific works published between January 2020 and March 2024. We discuss +their available resources, capabilities, and potential limitations, and issues +related to bias, explainability, and privacy. Furthermore, this review provides +an overview of publicly accessible speech-based datasets and open-source +material for PD. The DL approaches identified are categorized into end-to-end +(E2E) learning, transfer learning (TL), and deep acoustic feature extraction +(DAFE). Among E2E approaches, Convolutional Neural Networks (CNNs) are +prevalent, though Transformers are increasingly popular. E2E approaches face +challenges such as limited data and computational resources, especially with +Transformers. TL addresses these issues by providing more robust PD diagnosis +and better generalizability across languages. DAFE aims to improve the +explainability and interpretability of results by examining the specific +effects of deep features on both other DL approaches and more traditional +machine learning (ML) methods. However, it often underperforms compared to E2E +and TL approaches. + +
+
+ comment: van Gelderen, L., & Tejedor-Garc\'ia, C. (2024). Innovative + Speech-Based Deep Learning Approaches for Parkinson's Disease Classification: + A Systematic Review. Applied Sciences, 14(17). doi:10.3390/app14177873 This + research was funded by the NWO research programme NGF AiNed Fellowship Grants + under the project Responsible AI for Voice Diagnostics (RAIVD) - grant number + NGF.1607.22.013 +
+
+
+
+
+ + ♻ ☆ AI-guided inverse design and discovery of recyclable vitrimeric polymers + + +
+ Vitrimers are a new, exciting class of sustainable polymers with the ability to +heal, owing to their dynamic covalent adaptive networks that can undergo +associative rearrangement reactions. However, a limited choice of constituent +molecules restricts their property space, prohibiting full realization of their +potential applications. To overcome this challenge, we couple molecular +dynamics (MD) simulations and a novel graph variational autoencoder (VAE) +machine learning model for inverse design of vitrimer chemistries with desired +glass transition temperature (Tg) and synthesize a novel vitrimer polymer. We +build the first vitrimer dataset of one million chemistries and calculate Tg on +8,424 of them by high-throughput MD simulations calibrated by a Gaussian +process model. The proposed novel VAE employs dual graph encoders and a latent +dimension overlapping scheme which allows for individual representation of +multi-component vitrimers. By constructing a continuous latent space containing +necessary information of vitrimers, we demonstrate high accuracy and efficiency +of our framework in discovering novel vitrimers with desirable Tg beyond the +training regime. To validate the effectiveness of our framework in experiments, +we generate novel vitrimer chemistries with a target Tg = 323 K. By +incorporating chemical intuition, we synthesize a vitrimer with Tg of 311-317 +K, and experimentally demonstrate healability and flowability. The proposed +framework offers an exciting tool for polymer chemists to design and synthesize +novel, sustainable vitrimer polymers for a variety of applications. +
+
+
+
+
+ + ♻ ☆ A Hybrid Framework for Spatial Interpolation: Merging Data-driven with + Domain Knowledge + + +
+ Estimating spatially distributed information through the interpolation of +scattered observation datasets often overlooks the critical role of domain +knowledge in understanding spatial dependencies. Additionally, the features of +these data sets are typically limited to the spatial coordinates of the +scattered observation locations. In this paper, we propose a hybrid framework +that integrates data-driven spatial dependency feature extraction with +rule-assisted spatial dependency function mapping to augment domain knowledge. +We demonstrate the superior performance of our framework in two comparative +application scenarios, highlighting its ability to capture more localized +spatial features in the reconstructed distribution fields. Furthermore, we +underscore its potential to enhance nonlinear estimation capabilities through +the application of transformed fuzzy rules and to quantify the inherent +uncertainties associated with the observation data sets. Our framework +introduces an innovative approach to spatial information estimation by +synergistically combining observational data with rule-assisted domain +knowledge. + +
+
+ comment: 21 pages, 13 figures; typos corrected, references updated; few typos + in few equations corrected, changed to Tex source +
+
+
+
+
+ + ♻ ☆ DCEM: A deep complementary energy method for solid mechanics + + +
+ In recent years, the rapid advancement of deep learning has significantly +impacted various fields, particularly in solving partial differential equations +(PDEs) in the realm of solid mechanics, benefiting greatly from the remarkable +approximation capabilities of neural networks. In solving PDEs, +Physics-Informed Neural Networks (PINNs) and the Deep Energy Method (DEM) have +garnered substantial attention. The principles of minimum potential energy and +minimum complementary energy are two important variational principles in solid +mechanics. However, the well-known DEM is based on the +principle of minimum potential energy, and a counterpart based on the principle of +minimum complementary energy has been lacking. To bridge this gap, we propose the deep +complementary energy method (DCEM) based on the principle of minimum +complementary energy. The output function of DCEM is the stress function, which +inherently satisfies the equilibrium equation. We present numerical results +using the Prandtl and Airy stress functions, and compare DCEM with existing +PINNs and DEM algorithms when modeling representative mechanical problems. The +results demonstrate that DCEM outperforms DEM in terms of stress accuracy and +efficiency and has an advantage in dealing with complex displacement boundary +conditions, which is supported by theoretical analyses and numerical +simulations. We extend DCEM to DCEM-Plus (DCEM-P), adding terms that satisfy +partial differential equations. Furthermore, we propose a deep complementary +energy operator method (DCEM-O) by combining operator learning with physical +equations. Initially, we train DCEM-O using high-fidelity numerical results and +then incorporate complementary energy. DCEM-P and DCEM-O further enhance the +accuracy and efficiency of DCEM. +
+
+ comment: 50 pages, 32 figures +
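+
+ To see why a stress-function output inherently satisfies equilibrium, recall the
+ classical Airy stress function in two dimensions with zero body forces (a textbook
+ identity, not a detail specific to DCEM): the stresses
+
+     \[ \sigma_{xx} = \frac{\partial^2 \phi}{\partial y^2}, \qquad
+        \sigma_{yy} = \frac{\partial^2 \phi}{\partial x^2}, \qquad
+        \sigma_{xy} = -\frac{\partial^2 \phi}{\partial x\,\partial y} \]
+
+ satisfy the equilibrium equations $\partial_x \sigma_{xx} + \partial_y \sigma_{xy} = 0$
+ and $\partial_x \sigma_{xy} + \partial_y \sigma_{yy} = 0$ identically for any
+ sufficiently smooth $\phi$, so a network that outputs $\phi$ never violates equilibrium.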
+
+
+
+
+ + ♻ ☆ Beyond Unconstrained Features: Neural Collapse for Shallow Neural + Networks with General Data + + +
+ Neural collapse (NC) is a phenomenon that emerges at the terminal phase of +the training (TPT) of deep neural networks (DNNs). The features of the data in +the same class collapse to their respective sample means and the sample means +exhibit a simplex equiangular tight frame (ETF). In the past few years, there +has been a surge of works that focus on explaining why the NC occurs and how it +affects generalization. Since the DNNs are notoriously difficult to analyze, +most works mainly focus on the unconstrained feature model (UFM). While the UFM +explains the NC to some extent, it fails to provide a complete picture of how +the network architecture and the dataset affect NC. In this work, we focus on +shallow ReLU neural networks and try to understand how the width, depth, data +dimension, and statistical property of the training dataset influence the +neural collapse. We provide a complete characterization of when the NC occurs +for two or three-layer neural networks. For two-layer ReLU neural networks, a +sufficient condition on when the global minimizer of the regularized empirical +risk function exhibits the NC configuration depends on the data dimension, +sample size, and the signal-to-noise ratio in the data instead of the network +width. For three-layer neural networks, we show that the NC occurs as long as +the first layer is sufficiently wide. Regarding the connection between NC and +generalization, we show the generalization heavily depends on the SNR +(signal-to-noise ratio) in the data: even if the NC occurs, the generalization +can still be bad provided that the SNR in the data is too low. Our results +significantly extend the state-of-the-art theoretical analysis of the NC under +the UFM by characterizing the emergence of the NC under shallow nonlinear +networks and showing how it depends on data properties and network +architecture. +
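+
+ For reference, the simplex equiangular tight frame mentioned above can be described (up
+ to rotation and scaling) by the standard condition on the $K$ class means,
+
+     \[ m_k^\top m_j \;=\; \frac{K}{K-1}\left( \delta_{kj} - \frac{1}{K} \right), \]
+
+ i.e., all means share the same norm and every pair of distinct means has inner product
+ $-1/(K-1)$, often described as the maximally separated configuration of $K$ equal-norm
+ vectors.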
+
+
+
+
+ + ♻ ☆ The Stochastic Proximal Distance Algorithm + + +
+ Stochastic versions of proximal methods have gained much attention in +statistics and machine learning. These algorithms tend to admit simple, +scalable forms, and enjoy numerical stability via implicit updates. In this +work, we propose and analyze a stochastic version of the recently proposed +proximal distance algorithm, a class of iterative optimization methods that +recover a desired constrained estimation problem as the penalty parameter $\rho +\rightarrow \infty$. By uncovering connections to related stochastic proximal +methods and interpreting the penalty parameter as the learning rate, we justify +heuristics used in practical manifestations of the proximal distance method, +establishing their convergence guarantees for the first time. Moreover, we +extend recent theoretical devices to establish finite error bounds and a +complete characterization of convergence rate regimes. We validate our +analysis via a thorough empirical study, also showing that, unsurprisingly, the +proposed method outpaces batch versions on popular learning tasks. +
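+
+ For background, the proximal distance construction referenced above replaces a
+ constrained problem $\min_{\theta \in C} f(\theta)$ by the penalized objective
+
+     \[ \min_\theta \; f(\theta) + \frac{\rho}{2}\, \mathrm{dist}(\theta, C)^2, \qquad
+        \mathrm{dist}(\theta, C) = \min_{c \in C} \|\theta - c\|, \]
+
+ whose solutions approach the constrained solution as $\rho \rightarrow \infty$; roughly,
+ the stochastic version analyzed in the paper works with per-sample losses in place of
+ $f$ and, as noted above, interprets the penalty parameter $\rho$ as a learning rate.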
+
+
+
+
+ + ♻ ☆ Gated Ensemble of Spatio-temporal Mixture of Experts for Multi-task + Learning in Ride-hailing System + + +
+ Ride-hailing systems require efficient management of dynamic demand and +supply to ensure optimal service delivery, pricing strategies, and operational +efficiency. Designing spatio-temporal forecasting models separately in a +task-wise and city-wise manner to forecast demand and supply-demand gap in a +ride-hailing system poses a burden for the expanding transportation network +companies. Therefore, a multi-task learning architecture is proposed in this +study by developing a gated ensemble of spatio-temporal mixture-of-experts +network (GESME-Net) with convolutional recurrent neural network (CRNN), +convolutional neural network (CNN), and recurrent neural network (RNN) for +simultaneously forecasting these spatio-temporal tasks in a city as well as +across different cities. Furthermore, a task adaptation layer is integrated +with the architecture for learning joint representation in multi-task learning +and revealing the contribution of the input features utilized in prediction. +The proposed architecture is tested with data from Didi Chuxing for: (i) +simultaneously forecasting demand and supply-demand gap in Beijing, and (ii) +simultaneously forecasting demand across Chengdu and Xian. In both scenarios, +models from our proposed architecture outperformed the single-task and +multi-task deep learning benchmarks and ensemble-based machine learning +algorithms. +
+
+ comment: arXiv admin note: text overlap with arXiv:2012.08868 +
+
+
+
+
+ + ♻ ☆ Can Differentiable Decision Trees Enable Interpretable Reward Learning + from Human Feedback? + + +
+ Reinforcement Learning from Human Feedback (RLHF) has emerged as a popular +paradigm for capturing human intent to alleviate the challenges of +hand-crafting the reward values. Despite the increasing interest in RLHF, most +works learn black-box reward functions that, while expressive, are difficult to +interpret and often require running the whole costly process of RL before we +can even decipher if these frameworks are actually aligned with human +preferences. We propose and evaluate a novel approach for learning expressive +and interpretable reward functions from preferences using Differentiable +Decision Trees (DDTs). Our experiments across several domains, including +CartPole, Visual Gridworld environments and Atari games, provide evidence that +the tree structure of our learned reward function is useful in determining the +extent to which the reward function is aligned with human preferences. We also +provide experimental evidence that not only shows that reward DDTs can often +achieve competitive RL performance when compared with larger capacity deep +neural network reward functions but also demonstrates the diagnostic utility of +our framework in checking alignment of learned reward functions. We also +observe that the choice between soft and hard (argmax) output of reward DDTs +reveals a tension between wanting highly shaped rewards to ensure good RL +performance, while also wanting simpler, more interpretable rewards. Videos and +code are available at: https://sites.google.com/view/ddt-rlhf +
+
+ comment: Accepted at RLC 2024 +
+
+
+
+
+ + ♻ ☆ SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated + Responses + + +
+ Can LLMs consistently improve their previous outputs for better results? For +this to be true, LLMs would need to be better at discriminating among +previously-generated alternatives, than generating initial responses. We +explore the validity of this hypothesis in practice. We first formulate a +unified framework that allows us to compare the generative and discriminative +capability of any model on any task. In our resulting experimental analysis of +several open-source and industrial LLMs, we observe that models are not +reliably better at discriminating among previously-generated alternatives than +generating initial responses. This finding challenges the notion that LLMs may +be able to enhance their performance only through their own judgment. + +
+
+
+
+
+ + ♻ Foundational Challenges in Assuring Alignment and Safety of Large + Language Models + + +
+ This work identifies 18 foundational challenges in assuring the alignment and +safety of large language models (LLMs). These challenges are organized into +three different categories: scientific understanding of LLMs, development and +deployment methods, and sociotechnical challenges. Based on the identified +challenges, we pose $200+$ concrete research questions. + +
+
+
+
+
+ + ♻ ☆ StraightLine: An End-to-End Resource-Aware Scheduler for Machine + Learning Application Requests + + +
+ The life cycle of machine learning (ML) applications consists of two stages: +model development and model deployment. However, traditional ML systems (e.g., +training-specific or inference-specific systems) focus on one particular stage +or phase of the life cycle of ML applications. These systems often aim at +optimizing model training or accelerating model inference, and they frequently +assume homogeneous infrastructure, which may not always reflect real-world +scenarios that include cloud data centers, local servers, containers, and +serverless platforms. We present StraightLine, an end-to-end resource-aware +scheduler that schedules the optimal resources (e.g., container, virtual +machine, or serverless) for different ML application requests in a hybrid +infrastructure. The key innovation is an empirical dynamic placement algorithm +that intelligently places requests based on their unique characteristics (e.g., +request frequency, input data size, and data distribution). In contrast to +existing ML systems, StraightLine offers end-to-end resource-aware placement, +thereby significantly reducing response time and failure rate for model +deployment when facing different computing resources in the hybrid +infrastructure. +
+
+ comment: 6 pages, 8 figures, to appear in AIoTC'24 +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ HiSC4D: Human-centered interaction and 4D Scene Capture in Large-scale + Space Using Wearable IMUs and LiDAR + + +
+ We introduce HiSC4D, a novel Human-centered interaction and 4D Scene Capture +method, aimed at accurately and efficiently creating a dynamic digital world, +containing large-scale indoor-outdoor scenes, diverse human motions, rich +human-human interactions, and human-environment interactions. By utilizing +body-mounted IMUs and a head-mounted LiDAR, HiSC4D can capture egocentric human +motions in unconstrained space without the need for external devices and +pre-built maps. This affords great flexibility and accessibility for +human-centered interaction and 4D scene capturing in various environments. +Taking into account that IMUs can capture spatially unrestricted human poses +but are prone to drift over long periods of use, while LiDAR is stable for +global localization but coarse for local positions and orientations, HiSC4D +employs a joint optimization method, harmonizing all sensors and utilizing +environment cues, yielding promising results for long-term capture in large +scenes. To promote research of egocentric human interaction in large scenes and +facilitate downstream tasks, we also present a dataset, containing 8 sequences +in 4 large scenes (200 to 5,000 $m^2$), providing 36k frames of accurate 4D +human motions with SMPL annotations and dynamic scenes, 31k frames of cropped +human point clouds, and scene mesh of the environment. A variety of scenarios, +such as a basketball gym and a commercial street, alongside challenging human +motions, such as daily greeting, one-on-one basketball playing, and tour +guiding, demonstrate the effectiveness and the generalization ability of +HiSC4D. The dataset and code will be published at +www.lidarhumanmotion.net/hisc4d for research purposes. +
+
+ comment: 17 pages, 10 figures, Journal +
+
+
+
+
+ + ☆ Question-Answering Dense Video Events + + +
+ Multimodal Large Language Models (MLLMs) have shown excellent performance in +question-answering of single-event videos. In this paper, we present +question-answering dense video events, a novel task that requires answering and +grounding the dense-event questions in long videos, thus challenging MLLMs to +faithfully comprehend and reason about multiple events occurring over extended +time periods. To facilitate the study, we construct DeVE-QA - a dataset +featuring 78K questions about 26K events on 10.6K long videos. We then +benchmark and show that existing MLLMs excelling at single-event QA struggle to +perform well in DeVE-QA. For improvement, we propose DeVi, a novel +training-free MLLM approach that highlights a hierarchical captioning module, a +temporal event memory module, and a self-consistency checking module to +respectively detect, contextualize and memorize, and ground dense-events in +long videos for question answering. Extensive experiments show that DeVi is +superior at answering dense-event questions and grounding relevant video +moments. Compared with existing MLLMs, it achieves a remarkable increase of 4.1 +percent and 3.7 percent for G(round)QA accuracy on DeVE-QA and NExT-GQA +respectively. + +
+
+
+
+
+ + ☆ 3D-GP-LMVIC: Learning-based Multi-View Image Coding with 3D Gaussian + Geometric Priors + + +
+ Multi-view image compression is vital for 3D-related applications. To +effectively model correlations between views, existing methods typically +predict disparity between two views on a 2D plane, which works well for small +disparities, such as in stereo images, but struggles with larger disparities +caused by significant view changes. To address this, we propose a novel +approach: learning-based multi-view image coding with 3D Gaussian geometric +priors (3D-GP-LMVIC). Our method leverages 3D Gaussian Splatting to derive +geometric priors of the 3D scene, enabling more accurate disparity estimation +across views within the compression model. Additionally, we introduce a depth +map compression model to reduce redundancy in geometric information between +views. A multi-view sequence ordering method is also proposed to enhance +correlations between adjacent views. Experimental results demonstrate that +3D-GP-LMVIC surpasses both traditional and learning-based methods in +performance, while maintaining fast encoding and decoding speed. + +
+
+ comment: 19 pages, 8 figures, conference +
+
+
+
+
+ + ♻ ☆ MSLIQA: Enhancing Learning Representations for Image Quality Assessment + through Multi-Scale Learning + + +
+ No-Reference Image Quality Assessment (NR-IQA) remains a challenging task due +to the diversity of distortions and the lack of large annotated datasets. Many +studies have attempted to tackle these challenges by developing more accurate +NR-IQA models, often employing complex and computationally expensive networks, +or by bridging the domain gap between various distortions to enhance +performance on test datasets. In our work, we improve the performance of a +generic lightweight NR-IQA model by introducing a novel augmentation strategy +that boosts its performance by almost 28\%. This augmentation strategy enables +the network to better discriminate between different distortions in various +parts of the image by zooming in and out. Additionally, the inclusion of +test-time augmentation further enhances performance, making our lightweight +network's results comparable to the current state-of-the-art models, simply +through the use of augmentations. + +
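+
+ The exact augmentation is not spelled out in the abstract above; as a generic sketch of
+ a multi-scale zoom-in/zoom-out augmentation of the kind described (illustrative only,
+ built from standard torchvision transforms):
+
+     from PIL import Image
+     import torchvision.transforms as T
+
+     zoom_in = T.RandomResizedCrop(size=224, scale=(0.3, 1.0))   # tight random crops
+     zoom_out = T.Compose([T.Pad(padding=64), T.Resize(224)])    # pad out, then shrink back
+     multi_scale_aug = T.Compose([T.RandomChoice([zoom_in, zoom_out]), T.ToTensor()])
+
+     img = Image.new("RGB", (384, 384))   # placeholder image; replace with a real photo
+     print(multi_scale_aug(img).shape)    # torch.Size([3, 224, 224])
+
+ At test time, the same family of transforms can be applied several times and the
+ predicted quality scores averaged, which is the usual form of test-time augmentation.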
+
+
+
+
+ + ♻ ☆ LAR-IQA: A Lightweight, Accurate, and Robust No-Reference Image Quality + Assessment Model + + +
+ Recent advancements in the field of No-Reference Image Quality Assessment +(NR-IQA) using deep learning techniques demonstrate high performance across +multiple open-source datasets. However, such models are typically very large +and complex making them not so suitable for real-world deployment, especially +on resource- and battery-constrained mobile devices. To address this +limitation, we propose a compact, lightweight NR-IQA model that achieves +state-of-the-art (SOTA) performance on ECCV AIM UHD-IQA challenge validation +and test datasets while being also nearly 5.7 times faster than the fastest +SOTA model. Our model features a dual-branch architecture, with each branch +separately trained on synthetically and authentically distorted images which +enhances the model's generalizability across different distortion types. To +improve robustness under diverse real-world visual conditions, we additionally +incorporate multiple color spaces during the training process. We also +demonstrate the higher accuracy of recently proposed Kolmogorov-Arnold Networks +(KANs) for final quality regression as compared to the conventional Multi-Layer +Perceptrons (MLPs). Our evaluation considering various open-source datasets +highlights the practical, high-accuracy, and robust performance of our proposed +lightweight model. Code: https://github.com/nasimjamshidi/LAR-IQA. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 75 + +
+
+
+ + ☆ Lexicon3D: Probing Visual Foundation Models for Complex 3D Scene + Understanding + + +
+ Complex 3D scene understanding has gained increasing attention, with scene +encoding strategies playing a crucial role in this success. However, the +optimal scene encoding strategies for various scenarios remain unclear, +particularly compared to their image-based counterparts. To address this issue, +we present a comprehensive study that probes various visual encoding models for +3D scene understanding, identifying the strengths and limitations of each model +across different scenarios. Our evaluation spans seven vision foundation +encoders, including image-based, video-based, and 3D foundation models. We +evaluate these models in four tasks: Vision-Language Scene Reasoning, Visual +Grounding, Segmentation, and Registration, each focusing on different aspects +of scene understanding. Our evaluations yield key findings: DINOv2 demonstrates +superior performance, video models excel in object-level tasks, diffusion +models benefit geometric tasks, and language-pretrained models show unexpected +limitations in language-related tasks. These insights challenge some +conventional understandings, provide novel perspectives on leveraging visual +foundation models, and highlight the need for more flexible encoder selection +in future vision-language and scene-understanding tasks. + +
+
+ comment: Project page: https://yunzeman.github.io/lexicon3d , Github: + https://github.com/YunzeMan/Lexicon3D +
+
+
+
+
+ + ☆ WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild + + +
+ The increasing availability of real-world conversation data offers exciting +opportunities for researchers to study user-chatbot interactions. However, the +sheer volume of this data makes manually examining individual conversations +impractical. To overcome this challenge, we introduce WildVis, an interactive +tool that enables fast, versatile, and large-scale conversation analysis. +WildVis provides search and visualization capabilities in the text and +embedding spaces based on a list of criteria. To manage million-scale datasets, +we implemented optimizations including search index construction, embedding +precomputation and compression, and caching to ensure responsive user +interactions within seconds. We demonstrate WildVis's utility through three +case studies: facilitating chatbot misuse research, visualizing and comparing +topic distributions across datasets, and characterizing user-specific +conversation patterns. WildVis is open-source and designed to be extendable, +supporting additional datasets and customized search and visualization +functionalities. + +
+
+
+
+
+ + ☆ Attention Heads of Large Language Models: A Survey + + +
+ Since the advent of ChatGPT, Large Language Models (LLMs) have excelled in +various tasks but remain largely as black-box systems. Consequently, their +development relies heavily on data-driven approaches, limiting performance +enhancement through changes in internal architecture and reasoning pathways. As +a result, many researchers have begun exploring the potential internal +mechanisms of LLMs, aiming to identify the essence of their reasoning +bottlenecks, with most studies focusing on attention heads. Our survey aims to +shed light on the internal reasoning processes of LLMs by concentrating on the +interpretability and underlying mechanisms of attention heads. We first distill +the human thought process into a four-stage framework: Knowledge Recalling, +In-Context Identification, Latent Reasoning, and Expression Preparation. Using +this framework, we systematically review existing research to identify and +categorize the functions of specific attention heads. Furthermore, we summarize +the experimental methodologies used to discover these special heads, dividing +them into two categories: Modeling-Free methods and Modeling-Required methods. +Also, we outline relevant evaluation methods and benchmarks. Finally, we +discuss the limitations of current research and propose several potential +future directions. Our reference list is open-sourced at +\url{https://github.com/IAAR-Shanghai/Awesome-Attention-Heads}. + +
+
+ comment: 20 pages, 11 figures, 4 tables +
+
+
+
+
+ + ☆ Planning In Natural Language Improves LLM Search For Code Generation + + +
+ While scaling training compute has led to remarkable improvements in large +language models (LLMs), scaling inference compute has not yet yielded analogous +gains. We hypothesize that a core missing component is a lack of diverse LLM +outputs, leading to inefficient search due to models repeatedly sampling highly +similar, yet incorrect generations. We empirically demonstrate that this lack +of diversity can be mitigated by searching over candidate plans for solving a +problem in natural language. Based on this insight, we propose PLANSEARCH, a +novel search algorithm which shows strong results across HumanEval+, MBPP+, and +LiveCodeBench (a contamination-free benchmark for competitive coding). +PLANSEARCH generates a diverse set of observations about the problem and then +uses these observations to construct plans for solving the problem. By +searching over plans in natural language rather than directly over code +solutions, PLANSEARCH explores a significantly more diverse range of potential +solutions compared to baseline search methods. Using PLANSEARCH on top of +Claude 3.5 Sonnet achieves a state-of-the-art pass@200 of 77.0% on +LiveCodeBench, outperforming both the best score achieved without search +(pass@1 = 41.4%) and using standard repeated sampling (pass@200 = 60.6%). +Finally, we show that, across all models, search algorithms, and benchmarks +analyzed, we can accurately predict performance gains due to search as a direct +function of the diversity over generated ideas. + +
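+
+ An illustrative skeleton of the search-over-plans idea described above (not the
+ authors' implementation; `llm` is a hypothetical stub for whichever model backend is
+ used, and the prompts are schematic):
+
+     from itertools import combinations
+
+     def llm(prompt: str) -> str:
+         """Hypothetical stand-in for a call to the underlying code LLM."""
+         return "<llm output>"
+
+     def plan_search(problem: str, n_obs: int = 4, width: int = 2):
+         """Search over natural-language plans before emitting candidate programs."""
+         # 1. elicit several distinct observations about the problem
+         observations = [llm(f"Observation #{i + 1} about solving:\n{problem}")
+                         for i in range(n_obs)]
+         # 2. combine subsets of observations into candidate plans
+         plans = [llm("Combine these observations into a step-by-step plan:\n" + "\n".join(s))
+                  for s in combinations(observations, width)]
+         # 3. translate each plan into a candidate program
+         return [llm(f"Problem:\n{problem}\n\nPlan:\n{plan}\n\nWrite the code.")
+                 for plan in plans]
+
+     candidates = plan_search("Return the longest strictly increasing subsequence.")
+     print(len(candidates))  # one candidate program per plan; filter with tests downstream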
+
+
+
+
+ + ☆ RAG based Question-Answering for Contextual Response Prediction System CIKM'24 + + +
+ Large Language Models (LLMs) have shown versatility in various Natural +Language Processing (NLP) tasks, including their potential as effective +question-answering systems. However, to provide precise and relevant +information in response to specific customer queries in industry settings, LLMs +require access to a comprehensive knowledge base to avoid hallucinations. +Retrieval Augmented Generation (RAG) emerges as a promising technique to +address this challenge. Yet, developing an accurate question-answering +framework for real-world applications using RAG entails several challenges: 1) +data availability issues, 2) evaluating the quality of generated content, and +3) the costly nature of human evaluation. In this paper, we introduce an +end-to-end framework that employs LLMs with RAG capabilities for industry use +cases. Given a customer query, the proposed system retrieves relevant knowledge +documents and leverages them, along with previous chat history, to generate +response suggestions for customer service agents in the contact centers of a +major retail company. Through comprehensive automated and human evaluations, we +show that this solution outperforms the current BERT-based algorithms in +accuracy and relevance. Our findings suggest that RAG-based LLMs can be an +excellent support to human customer service representatives by lightening their +workload. + +
+
+ comment: Accepted at the 1st Workshop on GenAI and RAG Systems for Enterprise, + CIKM'24. 6 pages +
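+
+ A deliberately simplified sketch of the retrieve-then-generate pattern described above
+ (not the production system from the paper: retrieval here uses TF-IDF for brevity, the
+ documents are made up, and `llm_generate` is a hypothetical stand-in for the deployed
+ LLM):
+
+     from sklearn.feature_extraction.text import TfidfVectorizer
+     from sklearn.metrics.pairwise import cosine_similarity
+
+     def retrieve(query, docs, top_k=2):
+         """Rank knowledge documents by TF-IDF cosine similarity to the query."""
+         vec = TfidfVectorizer().fit(docs + [query])
+         sims = cosine_similarity(vec.transform([query]), vec.transform(docs))[0]
+         return [docs[i] for i in sims.argsort()[::-1][:top_k]]
+
+     def build_prompt(query, history, docs):
+         """Assemble retrieved context, chat history, and the customer query."""
+         context = "\n".join(f"- {d}" for d in docs)
+         return f"Knowledge:\n{context}\n\nHistory:\n{history}\n\nCustomer: {query}\nAgent:"
+
+     def llm_generate(prompt):  # hypothetical call to the hosted or local LLM
+         return "<suggested agent response>"
+
+     docs = ["Returns are accepted within 30 days with a receipt.",
+             "Store hours are 9am-9pm on weekdays."]
+     query = "Can I return an item I bought last week?"
+     print(llm_generate(build_prompt(query, "", retrieve(query, docs))))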
+
+
+
+
+ + ☆ A Different Level Text Protection Mechanism With Differential Privacy + + +
+ The article introduces a method for extracting words of different degrees of +importance based on the BERT pre-training model and proves the effectiveness of +this method. The article also discusses the impact of maintaining the same +perturbation results for words of different importance on the overall text +utility. This method can be applied to long text protection. + +
+
+
+
+
+ + ☆ LAST: Language Model Aware Speech Tokenization + + +
+ Speech tokenization serves as the foundation of speech language model (LM), +enabling them to perform various tasks such as spoken language modeling, +text-to-speech, speech-to-text, etc. Most speech tokenizers are trained +independently of the LM training process, relying on separate acoustic models +and quantization methods. Following such an approach may create a mismatch +between the tokenization process and its usage afterward. In this study, we +propose a novel approach to training a speech tokenizer by leveraging +objectives from pre-trained textual LMs. We advocate for the integration of +this objective into the process of learning discrete speech representations. +Our aim is to transform features from a pre-trained speech model into a new +feature space that enables better clustering for speech LMs. We empirically +investigate the impact of various model design choices, including speech +vocabulary size and text LM size. Our results demonstrate the proposed +tokenization method outperforms the evaluated baselines considering both spoken +language modeling and speech-to-text. More importantly, unlike prior work, the +proposed method allows the utilization of a single pre-trained LM for +processing both speech and text inputs, setting it apart from conventional +tokenization approaches. + +
+
+
+
+
+ + ☆ A Fused Large Language Model for Predicting Startup Success + + +
+ Investors are continuously seeking profitable investment opportunities in +startups and, hence, for effective decision-making, need to predict a startup's +probability of success. Nowadays, investors can use not only various +fundamental information about a startup (e.g., the age of the startup, the +number of founders, and the business sector) but also textual description of a +startup's innovation and business model, which is widely available through +online venture capital (VC) platforms such as Crunchbase. To support the +decision-making of investors, we develop a machine learning approach with the +aim of locating successful startups on VC platforms. Specifically, we develop, +train, and evaluate a tailored, fused large language model to predict startup +success. Thereby, we assess to what extent self-descriptions on VC platforms +are predictive of startup success. Using 20,172 online profiles from +Crunchbase, we find that our fused large language model can predict startup +success, with textual self-descriptions being responsible for a significant +part of the predictive power. Our work provides a decision support tool for +investors to find profitable investment opportunities. + +
+
+
+
+
+ + ☆ The representation landscape of few-shot learning and fine-tuning in + large language models + + +
+ In-context learning (ICL) and supervised fine-tuning (SFT) are two common +strategies for improving the performance of modern large language models (LLMs) +on specific tasks. Despite their different natures, these strategies often lead +to comparable performance gains. However, little is known about whether they +induce similar representations inside LLMs. We approach this problem by +analyzing the probability landscape of their hidden representations in the two +cases. More specifically, we compare how LLMs solve the same question-answering +task, finding that ICL and SFT create very different internal structures, in +both cases undergoing a sharp transition in the middle of the network. In the +first half of the network, ICL shapes interpretable representations +hierarchically organized according to their semantic content. In contrast, the +probability landscape obtained with SFT is fuzzier and semantically mixed. In +the second half of the model, the fine-tuned representations develop +probability modes that better encode the identity of answers, while the +landscape of ICL representations is characterized by less defined peaks. Our +approach reveals the diverse computational strategies developed inside LLMs to +solve the same task across different conditions, allowing us to make a step +towards designing optimal methods to extract information from language models. + +
+
+
+
+
+ + ☆ LLM-based multi-agent poetry generation in non-cooperative environments + + +
+ Despite substantial progress of large language models (LLMs) for automatic +poetry generation, the generated poetry lacks diversity while the training +process differs greatly from human learning. Under the rationale that the +learning process of the poetry generation systems should be more human-like and +their output more diverse and novel, we introduce a framework based on social +learning where we emphasize non-cooperative interactions besides cooperative +interactions to encourage diversity. Our experiments are the first attempt at +LLM-based multi-agent systems in non-cooperative environments for poetry +generation employing both TRAINING-BASED agents (GPT-2) and PROMPTING-BASED +agents (GPT-3 and GPT-4). Our evaluation based on 96k generated poems shows +that our framework benefits the poetry generation process for TRAINING-BASED +agents resulting in 1) a 3.0-3.7 percentage point (pp) increase in diversity +and a 5.6-11.3 pp increase in novelty according to distinct and novel n-grams. +The generated poetry from TRAINING-BASED agents also exhibits group divergence +in terms of lexicons, styles and semantics. PROMPTING-BASED agents in our +framework also benefit from non-cooperative environments and a more diverse +ensemble of models with non-homogeneous agents has the potential to further +enhance diversity, with an increase of 7.0-17.5 pp according to our +experiments. However, PROMPTING-BASED agents show a decrease in lexical +diversity over time and do not exhibit the group-based divergence intended in +the social network. Our paper argues for a paradigm shift in creative tasks +such as automatic poetry generation to include social learning processes (via +LLM-based agent modeling) similar to human interaction. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ On the Limited Generalization Capability of the Implicit Reward Model + Induced by Direct Preference Optimization + + +
+ Reinforcement Learning from Human Feedback (RLHF) is an effective approach +for aligning language models to human preferences. Central to RLHF is learning +a reward function for scoring human preferences. Two main approaches for +learning a reward model are 1) training an EXplicit Reward Model (EXRM) as in +RLHF, and 2) using an implicit reward learned from preference data through +methods such as Direct Preference Optimization (DPO). Prior work has shown that +the implicit reward model of DPO (denoted as DPORM) can approximate an EXRM in +the limit. DPORM's effectiveness directly implies the optimality of the learned +policy, and also has practical implication for LLM alignment methods including +iterative DPO. However, it is unclear how well DPORM empirically matches the +performance of EXRM. This work studies the accuracy at distinguishing preferred +and rejected answers for both DPORM and EXRM. Our findings indicate that even +though DPORM fits the training dataset comparably, it generalizes less +effectively than EXRM, especially when the validation datasets contain +distribution shifts. Across five out-of-distribution settings, DPORM has a mean +drop in accuracy of 3% and a maximum drop of 7%. These findings highlight that +DPORM has limited generalization ability and substantiates the integration of +an explicit reward model in iterative DPO approaches. + +
+
+ comment: 12 pages, 8 tables, 2 figures +
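+
+ For reference, the implicit reward induced by a DPO-trained policy $\pi_\theta$ relative
+ to its reference policy $\pi_{\mathrm{ref}}$ is commonly written as
+
+     \[ r_\theta(x, y) \;=\; \beta \, \log \frac{\pi_\theta(y \mid x)}{\pi_{\mathrm{ref}}(y \mid x)}, \]
+
+ up to an additive term that depends only on the prompt $x$; DPORM in the abstract above
+ refers to scoring candidate answers with this quantity, whereas EXRM is a separately
+ trained explicit reward model.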
+
+
+
+
+ + ☆ CDM: A Reliable Metric for Fair and Accurate Formula Recognition + Evaluation + + +
+ Formula recognition presents significant challenges due to the complicated +structure and varied notation of mathematical expressions. Despite continuous +advancements in formula recognition models, the evaluation metrics employed by +these models, such as BLEU and Edit Distance, still exhibit notable +limitations. They overlook the fact that the same formula has diverse +representations and are highly sensitive to the distribution of training data, +thereby causing unfairness in formula recognition evaluation. To this end, +we propose a Character Detection Matching (CDM) metric, ensuring evaluation +objectivity by designing an image-level rather than LaTeX-level metric. +Specifically, CDM renders both the model-predicted LaTeX and the ground-truth +LaTeX formulas into image-formatted formulas, then employs visual feature +extraction and localization techniques for precise character-level matching, +incorporating spatial position information. Such a spatially-aware and +character-matching method offers a more accurate and equitable evaluation +compared with previous BLEU and Edit Distance metrics that rely solely on +text-based character matching. Experimentally, we evaluated various formula +recognition models using CDM, BLEU, and ExpRate metrics. The results +demonstrate that CDM aligns more closely with human evaluation standards +and provides a fairer comparison across different models by eliminating +discrepancies caused by diverse formula representations. +
+
+ comment: Project Website: + https://github.com/opendatalab/UniMERNet/tree/main/cdm +
+
+
+
+
+ + ☆ Attend First, Consolidate Later: On the Importance of Attention in + Different LLM Layers + + +
+ In decoder-based LLMs, the representation of a given layer serves two +purposes: as input to the next layer during the computation of the current +token; and as input to the attention mechanism of future tokens. In this work, +we show that the importance of the latter role might be overestimated. To show +that, we start by manipulating the representations of previous tokens; e.g., by +replacing the hidden states at some layer k with random vectors. Our +experiments with four LLMs and four tasks show that this operation often +leads to a small or negligible drop in performance. Importantly, this happens if +the manipulation occurs in the top part of the model, i.e., when k is in the final 30-50% +of the layers. In contrast, doing the same manipulation in earlier layers might +lead to chance-level performance. We continue by switching the hidden state of +certain tokens with hidden states of other tokens from another prompt; e.g., +replacing the word "Italy" with "France" in "What is the capital of Italy?". We +find that when applying this switch in the top 1/3 of the model, the model +ignores it (answering "Rome"). However, if we apply it earlier, the model +conforms to the switch ("Paris"). Our results hint at a two-stage process in +transformer-based LLMs: the first part gathers input from previous tokens, +while the second mainly processes that information internally. +
+
+
+
+
+ + ☆ 100 instances is all you need: predicting the success of a new LLM on + unseen data by testing on a few instances KDD + + +
+ Predicting the performance of LLMs on individual task instances is essential +to ensure their reliability in high-stakes applications. To do so, a +possibility is to evaluate the considered LLM on a set of task instances and +train an assessor to predict its performance based on features of the +instances. However, this approach requires evaluating each new LLM on a +sufficiently large set of task instances to train an assessor specific to it. +In this work, we leverage the evaluation results of previously tested LLMs to +reduce the number of evaluations required to predict the performance of a new +LLM. In practice, we propose to test the new LLM on a small set of reference +instances and train a generic assessor which predicts the performance of the +LLM on an instance based on the LLM's performance on the reference set +and features of the instance of interest. We conduct empirical studies on +HELM-Lite and KindsOfReasoning, a collection of existing reasoning datasets +that we introduce, where we evaluate all instruction-fine-tuned OpenAI models +until the January 2024 version of GPT4. When predicting performance on +instances with the same distribution as those used to train the generic +assessor, we find this achieves performance comparable to the LLM-specific +assessors trained on the full set of instances. Additionally, we find that +randomly selecting the reference instances performs as well as some advanced +selection methods we tested. For out-of-distribution instances, however, no clear winner +emerges and the overall performance is worse, suggesting that the inherent +predictability of LLMs is low. +
+
+ comment: Presented at the 2024 KDD workshop on Evaluation and Trustworthiness + of Generative AI Models +
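+
+ A toy sketch of the generic-assessor idea on synthetic data (purely illustrative: the
+ features, reference-instance selection, and assessor class used in the paper are not
+ reproduced here):
+
+     import numpy as np
+     from sklearn.linear_model import LogisticRegression
+
+     rng = np.random.default_rng(0)
+     n_llms, n_instances, n_ref, n_feat = 5, 300, 100, 8
+     inst_features = rng.normal(size=(n_instances, n_feat))        # features of each instance
+     success = rng.integers(0, 2, size=(n_llms, n_instances))      # past LLMs' success/failure
+     ref_idx = rng.choice(n_instances, size=n_ref, replace=False)  # shared reference instances
+
+     # one training row = (instance features, that LLM's success profile on the reference set)
+     X = np.vstack([np.hstack([inst_features, np.tile(success[m, ref_idx], (n_instances, 1))])
+                    for m in range(n_llms)])
+     y = success.reshape(-1)
+     assessor = LogisticRegression(max_iter=1000).fit(X, y)
+
+     # for a new LLM: evaluate only the reference instances, then predict everything else
+     new_profile = rng.integers(0, 2, size=n_ref)
+     X_new = np.hstack([inst_features, np.tile(new_profile, (n_instances, 1))])
+     print(assessor.predict_proba(X_new)[:, 1][:5])  # predicted success probabilities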
+
+
+
+
+ + ☆ From MOOC to MAIC: Reshaping Online Teaching and Learning through + LLM-driven Agents + + +
+ Since the first instances of online education, where courses were uploaded to +accessible and shared online platforms, this form of scaling the dissemination +of human knowledge to reach a broader audience has sparked extensive discussion +and widespread adoption. Recognizing that personalized learning still holds +significant potential for improvement, new AI technologies have been +continuously integrated into this learning format, resulting in a variety of +educational AI applications such as educational recommendation and intelligent +tutoring. The emergence of intelligence in large language models (LLMs) has +allowed for these educational enhancements to be built upon a unified +foundational model, enabling deeper integration. In this context, we propose +MAIC (Massive AI-empowered Course), a new form of online education that +leverages LLM-driven multi-agent systems to construct an AI-augmented +classroom, balancing scalability with adaptivity. Beyond exploring the +conceptual framework and technical innovations, we conduct preliminary +experiments at Tsinghua University, one of China's leading universities. +Drawing from over 100,000 learning records of more than 500 students, we obtain +a series of valuable observations and initial analyses. This project will +continue to evolve, ultimately aiming to establish a comprehensive open +platform that supports and unifies research, technology, and applications in +exploring the possibilities of online education in the era of large model AI. +We envision this platform as a collaborative hub, bringing together educators, +researchers, and innovators to collectively explore the future of AI-driven +online education. + +
+
+
+
+
+ + ☆ How Much Data is Enough Data? Fine-Tuning Large Language Models for + In-House Translation: Performance Evaluation Across Multiple Dataset Sizes + + +
+ Decoder-only LLMs have shown impressive performance in machine translation (MT) due to their +ability to learn from extensive datasets and generate high-quality +translations. However, LLMs often struggle with the nuances and style required +for organisation-specific translation. In this study, we explore the +effectiveness of fine-tuning large language models (LLMs), particularly Llama 3 +8B Instruct, leveraging translation memories (TMs) as a valuable resource to +enhance accuracy and efficiency. We investigate the impact of fine-tuning the +Llama 3 model using TMs from a specific organisation in the software sector. +Our experiments cover five translation directions across languages of varying +resource levels (English to Brazilian Portuguese, Czech, German, Finnish, and +Korean). We analyse diverse sizes of training datasets (1k to 207k segments) to +evaluate their influence on translation quality. We fine-tune separate models +for each training set and evaluate their performance using automatic +metrics: BLEU, chrF++, TER, and COMET. Our findings reveal improvement in +translation performance with larger datasets across all metrics. On average, +BLEU and COMET scores increase by 13 and 25 points, respectively, on the +largest training set compared with the baseline model. Notably, there is a +performance deterioration in comparison with the baseline model when +fine-tuning on only 1k and 2k examples; however, we observe a substantial +improvement as the training dataset size increases. The study highlights the +potential of integrating TMs with LLMs to create bespoke translation models +tailored to the specific needs of businesses, thus enhancing translation +quality and reducing turn-around times. This approach offers valuable insight +for organisations seeking to leverage TMs and LLMs for optimal translation +outcomes, especially in narrower domains. + +
+
+
+
+
+ + ☆ Fine-tuning large language models for domain adaptation: Exploration of + training strategies, scaling, model merging and synergistic capabilities + + +
+ The advancement of Large Language Models (LLMs) for domain applications in +fields such as materials science and engineering depends on the development of +fine-tuning strategies that adapt models for specialized, technical +capabilities. In this work, we explore the effects of Continued Pretraining +(CPT), Supervised Fine-Tuning (SFT), and various preference-based optimization +approaches, including Direct Preference Optimization (DPO) and Odds Ratio +Preference Optimization (ORPO), on fine-tuned LLM performance. Our analysis +shows how these strategies influence model outcomes and reveals that the +merging of multiple fine-tuned models can lead to the emergence of capabilities +that surpass the individual contributions of the parent models. We find that +model merging leads to new functionalities that neither parent model could +achieve alone, leading to improved performance in domain-specific assessments. +Experiments with different model architectures are presented, including Llama +3.1 8B and Mistral 7B models, where similar behaviors are observed. To explore +whether the results also hold for much smaller models, we use a small LLM with +1.7 billion parameters and show that very small LLMs do not necessarily feature +emergent capabilities under model merging, suggesting that model scaling may be +a key component. In open-ended yet consistent chat conversations between a +human and AI models, our assessment reveals detailed insights into how +different model variants perform and shows that the smallest model achieves a +high intelligence score across key criteria including reasoning depth, +creativity, clarity, and quantitative precision. Other experiments include the +development of image generation prompts based on disparate biological material +design concepts, to create new microstructures, architectural concepts, and +urban design based on biological materials-inspired construction principles. + +
+
+
+
+
+ + ☆ Rx Strategist: Prescription Verification using LLM Agents System + + +
+ The complexity of modern pharmaceuticals demands strict prescription +verification to protect patient safety. We offer a new approach - Rx Strategist - that makes +use of knowledge graphs and different search strategies to enhance the power of +Large Language Models (LLMs) inside an agentic framework. This multifaceted +technique allows for a multi-stage LLM pipeline and reliable information +retrieval from a custom-built active ingredient database. Different facets of +prescription verification, such as indication, dose, and possible drug +interactions, are covered in each stage of the pipeline. We alleviate the +drawbacks of monolithic LLM techniques by spreading reasoning over these +stages, improving correctness and reliability while reducing memory demands. +Our findings demonstrate that Rx Strategist surpasses many current LLMs, +achieving performance comparable to that of a highly experienced clinical +pharmacist. In the complicated world of modern medications, this combination of +LLMs with organized knowledge and sophisticated search methods presents a +viable avenue for reducing prescription errors and enhancing patient outcomes. + +
+
+ comment: 17 Pages, 6 Figures, Under Review +
+
+
+
+
+ + ☆ CogniDual Framework: Self-Training Large Language Models within a + Dual-System Theoretical Framework for Improving Cognitive Tasks + + +
+ Cognitive psychology investigates perception, attention, memory, language, +problem-solving, decision-making, and reasoning. Kahneman's dual-system theory +elucidates the human decision-making process, distinguishing between the rapid, +intuitive System 1 and the deliberative, rational System 2. Recent advancements +have positioned large language models (LLMs) as formidable tools nearing +human-level proficiency in various cognitive tasks. Nonetheless, the presence +of a dual-system framework analogous to human cognition in LLMs remains +unexplored. This study introduces the \textbf{CogniDual Framework for LLMs} +(CFLLMs), designed to assess whether LLMs can, through self-training, evolve +from deliberate deduction to intuitive responses, thereby emulating the human +process of acquiring and mastering new information. Our findings reveal the +cognitive mechanisms behind LLMs' response generation, enhancing our +understanding of their capabilities in cognitive psychology. Practically, +self-trained models can provide faster responses to certain queries, reducing +computational demands during inference. + +
+
+
+
+
+ + ☆ Leveraging Large Language Models through Natural Language Processing to + provide interpretable Machine Learning predictions of mental deterioration in + real time + + +
+ Based on official estimates, 50 million people worldwide are affected by +dementia, and this number increases by 10 million new patients every year. +Without a cure, clinical prognostication and early intervention represent the +most effective ways to delay its progression. To this end, Artificial +Intelligence and computational linguistics can be exploited for natural +language analysis, personalized assessment, monitoring, and treatment. However, +traditional approaches lack sufficient semantic knowledge management and +explainability capabilities. Moreover, work on using Large Language Models (LLMs) for +cognitive decline diagnosis is still scarce, even though these models represent +the most advanced form of clinical-patient communication using intelligent +systems. Consequently, we leverage an LLM using the latest Natural Language +Processing (NLP) techniques in a chatbot solution to provide interpretable +Machine Learning prediction of cognitive decline in real time. +Linguistic-conceptual features are exploited for appropriate natural language +analysis. Through explainability, we aim to fight potential biases of the +models and improve their potential to help clinical workers in their diagnosis +decisions. More specifically, the proposed pipeline is composed of (i) data +extraction employing NLP-based prompt engineering; (ii) stream-based data +processing including feature engineering, analysis, and selection; (iii) +real-time classification; and (iv) an explainability dashboard that provides +visual and natural language descriptions of the prediction outcome. +Classification results exceed 80% on all evaluation metrics, with a recall +of about 85% for the mental deterioration class. To sum up, this work contributes +an affordable, flexible, non-invasive, personalized diagnostic system. + +
+
+
+
+
+ + ☆ Con-ReCall: Detecting Pre-training Data in LLMs via Contrastive Decoding + + +
+ The training data in large language models is key to their success, but it +also presents privacy and security risks, as it may contain sensitive +information. Detecting pre-training data is crucial for mitigating these +concerns. Existing methods typically analyze target text in isolation or solely +with non-member contexts, overlooking potential insights from simultaneously +considering both member and non-member contexts. While previous work suggested +that member contexts provide little information due to the minor distributional +shift they induce, our analysis reveals that these subtle shifts can be +effectively leveraged when contrasted with non-member contexts. In this paper, +we propose Con-ReCall, a novel approach that leverages the asymmetric +distributional shifts induced by member and non-member contexts through +contrastive decoding, amplifying subtle differences to enhance membership +inference. Extensive empirical evaluations demonstrate that Con-ReCall achieves +state-of-the-art performance on the WikiMIA benchmark and is robust against +various text manipulation techniques. + +
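+ A hedged sketch of a contrastive membership-inference score in the spirit of the abstract above: the exact scoring rule below (difference of conditional log-likelihoods under a member-like prefix versus a non-member prefix) is an assumption rather than the paper's formula, and the prefix strings are placeholders.

# Hedged sketch, not the paper's method: contrast the target's conditional
# log-likelihood under a member-style prefix against a non-member prefix.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def cond_loglik(prefix: str, target: str) -> float:
    """Average log-likelihood of `target` tokens given `prefix`."""
    prefix_ids = tok(prefix, return_tensors="pt").input_ids
    target_ids = tok(target, return_tensors="pt").input_ids
    ids = torch.cat([prefix_ids, target_ids], dim=1)
    with torch.no_grad():
        logits = model(ids).logits
    logp = torch.log_softmax(logits[0, :-1], dim=-1)   # predicts tokens 1..L-1
    tgt = ids[0, 1:]
    start = prefix_ids.shape[1] - 1                    # first target-token position
    return logp[start:, :].gather(1, tgt[start:, None]).mean().item()

member_ctx = "Known training excerpt A. Known training excerpt B. "
nonmember_ctx = "Freshly written sentence A. Freshly written sentence B. "
candidate = "Text whose membership we want to test."

score = cond_loglik(member_ctx, candidate) - cond_loglik(nonmember_ctx, candidate)
print("contrastive score (higher -> more likely a member):", round(score, 4))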
+
+
+
+
+ + ☆ Sketch: A Toolkit for Streamlining LLM Operations + + +
+ Large language models (LLMs) represented by GPT family have achieved +remarkable success. The characteristics of LLMs lie in their ability to +accommodate a wide range of tasks through a generative approach. However, the +flexibility of their output format poses challenges in controlling and +harnessing the model's outputs, thereby constraining the application of LLMs in +various domains. In this work, we present Sketch, an innovative toolkit +designed to streamline LLM operations across diverse fields. Sketch comprises +the following components: (1) a suite of task description schemas and prompt +templates encompassing various NLP tasks; (2) a user-friendly, interactive +process for building structured output LLM services tailored to various NLP +tasks; (3) an open-source dataset for output format control, along with tools +for dataset construction; and (4) an open-source model based on +LLaMA3-8B-Instruct that adeptly comprehends and adheres to output formatting +instructions. We anticipate this initiative to bring considerable convenience +to LLM users, achieving the goal of ''plug-and-play'' for various applications. +The components of Sketch will be progressively open-sourced at +https://github.com/cofe-ai/Sketch. + +
+
+
+
+
+ + ☆ Normal forms in Virus Machines + + +
+ In the present work, we further study the computational power of virus +machines (VMs in short). VMs provide a computing paradigm inspired by the +transmission and replication networks of viruses. VMs consist of process units +(called hosts) structured by a directed graph whose arcs are called channels +and an instruction graph that controls the transmissions of virus objects among +hosts. The present work complements our understanding of the computing power of +VMs by introducing normal forms; these expressions restrict the features in a +given computing model. Some of the features that we restrict in our normal +forms include (a) the number of hosts, (b) the number of instructions, and (c) +the number of virus objects in each host. After recalling some known results on +the computing power of VMs, we give our normal forms, which also restrict features such as the size of the +loops in the network, and prove new characterisations of families of sets, such as +the finite sets, semilinear sets, or NRE. + +
+
+
+
+
+ + ☆ N-gram Prediction and Word Difference Representations for Language + Modeling + + +
+ Causal language modeling (CLM) serves as the foundational framework +underpinning remarkable successes of recent large language models (LLMs). +Despite its success, the training approach for next word prediction poses a +potential risk of causing the model to overly focus on local dependencies +within a sentence. While prior studies have proposed predicting the next N +words simultaneously, they were primarily applied to tasks such as masked +language modeling (MLM) and neural machine translation (NMT). In this study, we +introduce a simple N-gram prediction framework for the CLM task. Moreover, we +introduce word difference representation (WDR) as a surrogate and +contextualized target representation during model training on the basis of the +N-gram prediction framework. To further enhance the quality of next word +prediction, we propose an ensemble method that incorporates the future N words' +prediction results. Empirical evaluations across multiple benchmark datasets +encompassing CLM and NMT tasks demonstrate the significant advantages of our +proposed methods over conventional CLM. + +
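+ A minimal, hedged sketch of the general idea of multi-token (N-gram) prediction heads on top of a causal LM trunk; the trunk, head layout, and uniform loss weighting are illustrative assumptions, not the paper's architecture (the WDR target is omitted).

# Illustrative only: N linear heads, where head k predicts the token k steps ahead.
import torch
import torch.nn as nn

class NGramLMHead(nn.Module):
    def __init__(self, d_model: int, vocab: int, n: int = 3):
        super().__init__()
        self.heads = nn.ModuleList([nn.Linear(d_model, vocab) for _ in range(n)])
        self.n = n

    def forward(self, hidden: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        # hidden: (batch, seq, d_model); targets: (batch, seq) token ids
        loss = 0.0
        for k, head in enumerate(self.heads, start=1):
            logits = head(hidden[:, :-k])          # positions that have a k-step-ahead target
            tgt = targets[:, k:]
            loss = loss + nn.functional.cross_entropy(
                logits.reshape(-1, logits.size(-1)), tgt.reshape(-1)
            )
        return loss / self.n

# Toy usage with random hidden states standing in for a transformer trunk.
B, S, D, V = 2, 16, 64, 1000
head = NGramLMHead(D, V, n=3)
loss = head(torch.randn(B, S, D), torch.randint(0, V, (B, S)))
print(float(loss))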
+
+
+
+
+ + ☆ LLM Detectors Still Fall Short of Real World: Case of LLM-Generated + Short News-Like Posts EMNLP + + +
+ With the emergence of widely available powerful LLMs, disinformation +generated by large language models (LLMs) has become a major concern. +Historically, LLM detectors have been touted as a solution, but their +effectiveness in the real world is still to be proven. In this paper, we focus +on an important setting in information operations -- short news-like posts +generated by moderately sophisticated attackers. + We demonstrate that existing LLM detectors, whether zero-shot or +purpose-trained, are not ready for real-world use in that setting. All tested +zero-shot detectors perform inconsistently with prior benchmarks and are highly +vulnerable to sampling temperature increase, a trivial attack absent from +recent benchmarks. A purpose-trained detector generalizing across LLMs and +unseen attacks can be developed, but it fails to generalize to new +human-written texts. + We argue that the former indicates domain-specific benchmarking is needed, +while the latter suggests a trade-off between adversarial evasion +resilience and overfitting to the reference human text, both of which need to be +evaluated in benchmarks and currently are not. We believe this calls for a +reconsideration of current LLM detector benchmarking approaches, and we provide a +dynamically extensible benchmark to enable it +(https://github.com/Reliable-Information-Lab-HEVS/dynamic_llm_detector_benchmark). + +
+
+ comment: 20 pages, 7 tables, 13 figures, under consideration for EMNLP +
+
+
+
+
+ + ☆ iText2KG: Incremental Knowledge Graphs Construction Using Large Language + Models + + +
+ Most available data is unstructured, making it challenging to access valuable +information. Automatically building Knowledge Graphs (KGs) is crucial for +structuring data and making it accessible, allowing users to search for +information effectively. KGs also facilitate insights, inference, and +reasoning. Traditional NLP methods, such as named entity recognition and +relation extraction, are key in information retrieval but face limitations, +including the use of predefined entity types and the need for supervised +learning. Current research leverages large language models' capabilities, such +as zero- or few-shot learning. However, unresolved and semantically duplicated +entities and relations still pose challenges, leading to inconsistent graphs +and requiring extensive post-processing. Additionally, most approaches are +topic-dependent. In this paper, we propose iText2KG, a method for incremental, +topic-independent KG construction without post-processing. This plug-and-play, +zero-shot method is applicable across a wide range of KG construction scenarios +and comprises four modules: Document Distiller, Incremental Entity Extractor, +Incremental Relation Extractor, and Graph Integrator and Visualization. Our +method demonstrates superior performance compared to baseline methods across +three scenarios: converting scientific papers to graphs, websites to graphs, +and CVs to graphs. + +
+
+ comment: Accepted at The International Web Information Systems Engineering + conference (the WISE conference) 2024 +
+
+
+
+
+ + ☆ ChartMoE: Mixture of Expert Connector for Advanced Chart Understanding + + +
+ Automatic chart understanding is crucial for content comprehension and +document parsing. Multimodal large language models (MLLMs) have demonstrated +remarkable capabilities in chart understanding through domain-specific +alignment and fine-tuning. However, the application of alignment training +within the chart domain is still underexplored. To address this, we propose +ChartMoE, which employs the mixture of expert (MoE) architecture to replace the +traditional linear projector to bridge the modality gap. Specifically, we train +multiple linear connectors through distinct alignment tasks, which are utilized +as the foundational initialization parameters for different experts. +Additionally, we introduce ChartMoE-Align, a dataset with over 900K +chart-table-JSON-code quadruples to conduct three alignment tasks +(chart-table/JSON/code). Combined with the vanilla connector, we initialize +different experts in four distinct ways and adopt high-quality knowledge +learning to further refine the MoE connector and LLM parameters. Extensive +experiments demonstrate the effectiveness of the MoE connector and our +initialization strategy, e.g., ChartMoE improves the accuracy of the previous +state-of-the-art from 80.48% to 84.64% on the ChartQA benchmark. + +
+
+
+
+
+ + ☆ Strategic Chain-of-Thought: Guiding Accurate Reasoning in LLMs through + Strategy Elicitation + + +
+ The Chain-of-Thought (CoT) paradigm has emerged as a critical approach for +enhancing the reasoning capabilities of large language models (LLMs). However, +despite their widespread adoption and success, CoT methods often exhibit +instability due to their inability to consistently ensure the quality of +generated reasoning paths, leading to sub-optimal reasoning performance. To +address this challenge, we propose the \textbf{Strategic Chain-of-Thought} +(SCoT), a novel methodology designed to refine LLM performance by integrating +strategic knowledge prior to generating intermediate reasoning steps. SCoT +employs a two-stage approach within a single prompt: first eliciting an +effective problem-solving strategy, which is then used to guide the generation +of high-quality CoT paths and final answers. Our experiments across eight +challenging reasoning datasets demonstrate significant improvements, including +a 21.05\% increase on the GSM8K dataset and a 24.13\% increase on the Tracking\_Objects +dataset with the Llama3-8b model. Additionally, we extend the +SCoT framework to develop a few-shot method with automatically matched +demonstrations, yielding even stronger results. These findings underscore the +efficacy of SCoT, highlighting its potential to substantially enhance LLM +performance in complex reasoning tasks. + +
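+ A hedged sketch of what a strategy-then-solution prompt of this kind could look like; the template wording is a placeholder and not the authors' exact prompt.

# Illustrative two-stage single prompt: elicit a strategy, then a guided solution.
SCOT_TEMPLATE = """You will solve the problem in two stages inside one response.

Stage 1 - Strategy: briefly state the most effective strategy for this kind
of problem (do not solve it yet).

Stage 2 - Solution: follow the strategy step by step and end with
"Answer: <final answer>".

Problem: {question}
"""

def build_scot_prompt(question: str) -> str:
    return SCOT_TEMPLATE.format(question=question)

if __name__ == "__main__":
    print(build_scot_prompt("A train travels 120 km in 1.5 hours. What is its average speed?"))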
+
+
+
+
+ + ☆ GraphInsight: Unlocking Insights in Large Language Models for Graph + Structure Understanding + + +
+ Although Large Language Models (LLMs) have demonstrated potential in +processing graphs, they struggle with comprehending graphical structure +information through prompts of graph description sequences, especially as the +graph size increases. We attribute this challenge to the uneven memory +performance of LLMs across different positions in graph description sequences, +known as ''positional biases''. To address this, we propose GraphInsight, a +novel framework aimed at improving LLMs' comprehension of both macro- and +micro-level graphical information. GraphInsight is grounded in two key +strategies: 1) placing critical graphical information in positions where LLMs +exhibit stronger memory performance, and 2) investigating a lightweight +external knowledge base for regions with weaker memory performance, inspired by +retrieval-augmented generation (RAG). Moreover, GraphInsight explores +integrating these two strategies into LLM agent processes for composite graph +tasks that require multi-step reasoning. Extensive empirical studies on +benchmarks with a wide range of evaluation tasks show that GraphInsight +significantly outperforms all other graph description methods (e.g., prompting +techniques and reordering strategies) in understanding graph structures of +varying sizes. + +
+
+
+
+
+ + ☆ Understanding LLM Development Through Longitudinal Study: Insights from + the Open Ko-LLM Leaderboard + + +
+ This paper conducts a longitudinal study over eleven months to address the +limitations of prior research on the Open Ko-LLM Leaderboard, which has relied +on empirical studies with restricted observation periods of only five months. +By extending the analysis duration, we aim to provide a more comprehensive +understanding of the progression in developing Korean large language models +(LLMs). Our study is guided by three primary research questions: (1) What are +the specific challenges in improving LLM performance across diverse tasks on +the Open Ko-LLM Leaderboard over time? (2) How does model size impact task +performance correlations across various benchmarks? (3) How have the patterns +in leaderboard rankings shifted over time on the Open Ko-LLM Leaderboard? By +analyzing 1,769 models over this period, our research offers a comprehensive +examination of the ongoing advancements in LLMs and the evolving nature of +evaluation frameworks. + +
+
+
+
+
+ + ☆ E2CL: Exploration-based Error Correction Learning for Embodied Agents + + +
+ Language models are exhibiting increasing capability in knowledge utilization +and reasoning. However, when applied as agents in embodied environments, they +often suffer from misalignment between their intrinsic knowledge and +environmental knowledge, leading to infeasible actions. Traditional environment +alignment methods, such as supervised learning on expert trajectories and +reinforcement learning, face limitations in covering environmental knowledge +and achieving efficient convergence, respectively. Inspired by human learning, +we propose Exploration-based Error Correction Learning (E2CL), a novel +framework that leverages exploration-induced errors and environmental feedback +to enhance environment alignment for LM-based agents. E2CL incorporates +teacher-guided and teacher-free exploration to gather environmental feedback +and correct erroneous actions. The agent learns to provide feedback and +self-correct, thereby enhancing its adaptability to target environments. +Evaluations in the Virtualhome environment demonstrate that E2CL-trained agents +outperform those trained by baseline methods and exhibit superior +self-correction capabilities. + +
+
+
+
+
+ + ☆ Preserving Empirical Probabilities in BERT for Small-sample Clinical + Entity Recognition + + +
+ Named Entity Recognition (NER) encounters the challenge of unbalanced labels, +where certain entity types are overrepresented while others are +underrepresented in real-world datasets. This imbalance can lead to biased +models that perform poorly on minority entity classes, impeding accurate and +equitable entity recognition. This paper explores the effects of unbalanced +entity labels on BERT-based pre-trained models. We analyze the different +mechanisms of loss calculation and loss propagation for the task of token +classification on randomized datasets. Then we propose ways to improve +token classification for the highly imbalanced task of clinical entity +recognition. + +
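+ One common remedy for imbalanced entity labels, shown here as a hedged sketch rather than the paper's method, is to weight the token-classification loss by inverse label frequency; the label set and counts below are illustrative.

# Inverse-frequency class weights for a token-classification loss.
import torch
import torch.nn as nn

num_labels = 5                                   # e.g. O, B-PROBLEM, I-PROBLEM, B-DRUG, I-DRUG
label_counts = torch.tensor([9000., 300., 250., 120., 90.])
weights = label_counts.sum() / (num_labels * label_counts)   # rarer labels get larger weights

loss_fn = nn.CrossEntropyLoss(weight=weights, ignore_index=-100)

# Toy logits/labels standing in for BERT token-classification outputs.
logits = torch.randn(8, 32, num_labels)          # (batch, seq_len, labels)
labels = torch.randint(0, num_labels, (8, 32))
loss = loss_fn(logits.view(-1, num_labels), labels.view(-1))
print(float(loss))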
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Enhancing Healthcare LLM Trust with Atypical Presentations Recalibration + + +
+ Black-box large language models (LLMs) are increasingly deployed in various +environments, making it essential for these models to effectively convey their +confidence and uncertainty, especially in high-stakes settings. However, these +models often exhibit overconfidence, leading to potential risks and +misjudgments. Existing techniques for eliciting and calibrating LLM confidence +have primarily focused on general reasoning datasets, yielding only modest +improvements. Accurate calibration is crucial for informed decision-making and +preventing adverse outcomes but remains challenging due to the complexity and +variability of tasks these models perform. In this work, we investigate the +miscalibration behavior of black-box LLMs within the healthcare setting. We +propose a novel method, \textit{Atypical Presentations Recalibration}, which +leverages atypical presentations to adjust the model's confidence estimates. +Our approach significantly improves calibration, reducing calibration errors by +approximately 60\% on three medical question answering datasets and +outperforming existing methods such as vanilla verbalized confidence, CoT +verbalized confidence and others. Additionally, we provide an in-depth analysis +of the role of atypicality within the recalibration framework. + +
+
+
+
+
+ + ☆ xLAM: A Family of Large Action Models to Empower AI Agent Systems + + +
+ Autonomous agents powered by large language models (LLMs) have attracted +significant research interest. However, the open-source community faces many +challenges in developing specialized models for agent tasks, driven by the +scarcity of high-quality agent datasets and the absence of standard protocols +in this area. We introduce and publicly release xLAM, a series of large action +models designed for AI agent tasks. The xLAM series includes five models with +both dense and mixture-of-expert architectures, ranging from 1B to 8x22B +parameters, trained using a scalable, flexible pipeline that unifies, augments, +and synthesizes diverse datasets to enhance AI agents' generalizability and +performance across varied environments. Our experimental results demonstrate +that xLAM consistently delivers exceptional performance across multiple agent +ability benchmarks, notably securing the 1st position on the Berkeley +Function-Calling Leaderboard, outperforming GPT-4, Claude-3, and many other +models in terms of tool use. By releasing the xLAM series, we aim to advance +the performance of open-source LLMs for autonomous AI agents, potentially +accelerating progress and democratizing access to high-performance models for +agent tasks. Models are available at +https://huggingface.co/collections/Salesforce/xlam-models-65f00e2a0a63bbcd1c2dade4 + +
+
+ comment: Technical report for the Salesforce xLAM model series +
+
+
+
+
+ + ☆ An Effective Deployment of Diffusion LM for Data Augmentation in + Low-Resource Sentiment Classification + + +
+ Sentiment classification (SC) often suffers from low-resource challenges such +as domain-specific contexts, imbalanced label distributions, and few-shot +scenarios. The potential of the diffusion language model (LM) for textual data +augmentation (DA) remains unexplored; moreover, textual DA methods struggle to +balance the diversity and consistency of new samples. Most DA methods either +perform logical modifications or rephrase less important tokens in the original +sequence with the language model. In the context of SC, strongly emotional tokens +can be critical to the sentiment of the whole sequence. Therefore, +rather than rephrasing less important context, we propose DiffusionCLS to +leverage a diffusion LM to capture in-domain knowledge and generate pseudo +samples by reconstructing strong label-related tokens. This approach ensures a +balance between consistency and diversity, avoiding the introduction of noise +and augmenting crucial features of datasets. DiffusionCLS also comprises a +Noise-Resistant Training objective to help the model generalize. Experiments +demonstrate the effectiveness of our method in various low-resource scenarios +including domain-specific and domain-general problems. Ablation studies confirm +the effectiveness of our framework's modules, and visualization studies +highlight optimal deployment conditions, reinforcing our conclusions. + +
+
+
+
+
+ + ☆ Bypassing DARCY Defense: Indistinguishable Universal Adversarial + Triggers + + +
+ Neural networks (NN) classification models for Natural Language Processing +(NLP) are vulnerable to the Universal Adversarial Triggers (UAT) attack that +triggers a model to produce a specific prediction for any input. DARCY borrows +the "honeypot" concept to bait multiple trapdoors, effectively detecting the +adversarial examples generated by UAT. Unfortunately, we find a new UAT +generation method, called IndisUAT, which produces triggers (i.e., tokens) and +uses them to craft adversarial examples whose feature distribution is +indistinguishable from that of the benign examples in a randomly-chosen +category at the detection layer of DARCY. The produced adversarial examples +incur the maximal loss of predicting results in the DARCY-protected models. +Meanwhile, the produced triggers are effective in black-box models for text +generation, text inference, and reading comprehension. Finally, the evaluation +results under NN models for NLP tasks indicate that the IndisUAT method can +effectively circumvent DARCY and penetrate other defenses. For example, +IndisUAT can reduce the true positive rate of DARCY's detection by at least +40.8% and 90.6%, and drop the accuracy by at least 33.3% and 51.6% in the RNN +and CNN models, respectively. IndisUAT reduces the accuracy of the BERT's +adversarial defense model by at least 34.0%, and makes the GPT-2 language model +spew racist outputs even when conditioned on non-racial context. + +
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ MARAGS: A Multi-Adapter System for Multi-Task Retrieval Augmented + Generation Question Answering KDD + + +
+ In this paper we present a multi-adapter retrieval augmented generation +system (MARAGS) for Meta's Comprehensive RAG (CRAG) competition for KDD CUP +2024. CRAG is a question answering dataset that contains three subtasks aimed +at realistic RAG-style question answering, with a diverse set of +question topics, question types, time-dynamic answers, and questions featuring +entities of varying popularity. + Our system follows a standard setup for web-based RAG, which uses processed +web pages to provide context for an LLM to produce generations, while also +querying API endpoints for additional information. MARAGS also utilizes +multiple adapters to meet the varied requirements of these tasks, together +with a standard cross-encoder model for ranking candidate passages relevant to +answering the question. Our system achieved 2nd place on Task 1 as well as 3rd +place on Task 2. + +
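+ A hedged sketch of the cross-encoder ranking step described above: score each candidate passage against the question and keep the top ones as LLM context. The checkpoint name is an illustrative public model, not necessarily the one used by MARAGS.

# Rank candidate passages with a cross-encoder and keep the best ones.
from sentence_transformers import CrossEncoder

ranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

question = "Who founded the company that makes the Model S?"
passages = [
    "Tesla, Inc. was founded in 2003 by Martin Eberhard and Marc Tarpenning.",
    "The Model S is a battery electric sedan produced by Tesla.",
    "SpaceX designs, manufactures and launches rockets and spacecraft.",
]

scores = ranker.predict([(question, p) for p in passages])
ranked = sorted(zip(scores, passages), reverse=True)
for score, passage in ranked[:2]:          # top-2 passages become the LLM's context
    print(f"{score:.3f}  {passage}")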
+
+ comment: Accepted to CRAG KDD Cup 24 Workshop +
+
+
+
+
+ + ☆ Continual Skill and Task Learning via Dialogue + + +
+ Continual and interactive robot learning is a challenging problem as the +robot works alongside human users who expect it to perpetually learn novel skills to +solve novel tasks with sample efficiency. In this work we present a +framework for robots to query and learn visuo-motor robot skills and task +relevant information via natural language dialog interactions with human users. +Previous approaches either focus on improving the performance of instruction +following agents, or passively learn novel skills or concepts. Instead, we use +dialog combined with a language-skill grounding embedding to query or confirm +skills and/or tasks requested by a user. To achieve this goal, we developed and +integrated three different components for our agent. Firstly, we propose a +novel visual-motor control policy ACT with Low Rank Adaptation (ACT-LoRA), +which enables the existing SoTA ACT model to perform few-shot continual +learning. Secondly, we develop an alignment model that projects demonstrations +across skill embodiments into a shared embedding, allowing us to know when to +ask users for clarification and/or demonstrations. Finally, we integrated an +existing LLM to interact with a human user to perform grounded interactive +continual skill learning to solve a task. Our ACT-LoRA model learns novel +fine-tuned skills with 100% accuracy when trained with only five +demonstrations for a novel skill, while still maintaining 74.75% accuracy on +pre-trained skills in the RLBench dataset, where other models fall significantly +short. We also performed a human-subjects study with 8 subjects to demonstrate +the continual learning capabilities of our combined framework. We achieve a +success rate of 75% in the task of sandwich making with the real robot learning +from participant data, demonstrating that robots can learn novel skills or task +knowledge from dialogue with non-expert users using our approach. + +
+
+
+
+
+ + ☆ MaterialBENCH: Evaluating College-Level Materials Science + Problem-Solving Abilities of Large Language Models + + +
+ A college-level benchmark dataset for large language models (LLMs) in the +materials science field, MaterialBENCH, is constructed. This dataset consists +of problem-answer pairs, based on university textbooks. There are two types of +problems: one is the free-response answer type, and the other is the +multiple-choice type. Multiple-choice problems are constructed by adding three +incorrect answers as choices to a correct answer, so that LLMs can choose one +of the four as a response. Most of the problems for free-response answer and +multiple-choice types overlap except for the format of the answers. We also +conduct experiments using the MaterialBENCH on LLMs, including ChatGPT-3.5, +ChatGPT-4, Bard (at the time of the experiments), and GPT-3.5 and GPT-4 with +the OpenAI API. The differences and similarities in the performance of LLMs +measured by the MaterialBENCH are analyzed and discussed. Performance +differences between the free-response type and multiple-choice type in the same +models and the influence of using system messages on multiple-choice problems +are also studied. We anticipate that MaterialBENCH will encourage further +developments of LLMs in reasoning abilities to solve more complicated problems +and eventually contribute to materials research and discovery. + +
+
+
+
+
+ + ☆ Debate on Graph: a Flexible and Reliable Reasoning Framework for Large + Language Models + + +
+ Large Language Models (LLMs) may suffer from hallucinations in real-world +applications due to the lack of relevant knowledge. In contrast, knowledge +graphs encompass extensive, multi-relational structures that store a vast array +of symbolic facts. Consequently, integrating LLMs with knowledge graphs has +been extensively explored, with Knowledge Graph Question Answering (KGQA) +serving as a critical touchstone for the integration. This task requires LLMs +to answer natural language questions by retrieving relevant triples from +knowledge graphs. However, existing methods face two significant challenges: +\textit{excessively long reasoning paths distracting from the answer +generation}, and \textit{false-positive relations hindering the path +refinement}. In this paper, we propose an iterative interactive KGQA framework +that leverages the interactive learning capabilities of LLMs to perform +reasoning and Debating over Graphs (DoG). Specifically, DoG employs a +subgraph-focusing mechanism, allowing LLMs to perform answer trying after each +reasoning step, thereby mitigating the impact of lengthy reasoning paths. On +the other hand, DoG utilizes a multi-role debate team to gradually simplify +complex questions, reducing the influence of false-positive relations. This +debate mechanism ensures the reliability of the reasoning process. Experimental +results on five public datasets demonstrate the effectiveness and superiority +of our architecture. Notably, DoG outperforms the state-of-the-art method ToG +by 23.7\% and 9.1\% in accuracy on WebQuestions and GrailQA, respectively. +Furthermore, the integration experiments with various LLMs on the mentioned +datasets highlight the flexibility of DoG. Code is available at +\url{https://github.com/reml-group/DoG}. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ GraphEx: A Graph-based Extraction Method for Advertiser Keyphrase + Recommendation + + +
+ Online sellers and advertisers are recommended keyphrases for their listed +products, which they bid on to enhance their sales. One popular paradigm that +generates such recommendations is Extreme Multi-Label Classification (XMC), +which involves tagging/mapping keyphrases to items. We outline the limitations +of using traditional item-query based tagging or mapping techniques for +keyphrase recommendations on E-Commerce platforms. We introduce GraphEx, an +innovative graph-based approach that recommends keyphrases to sellers using +extraction of token permutations from item titles. Additionally, we demonstrate +that relying on traditional metrics such as precision/recall can be misleading +in practical applications, thereby necessitating a combination of metrics to +evaluate performance in real-world scenarios. These metrics are designed to +assess the relevance of keyphrases to items and the potential for buyer +outreach. GraphEx outperforms production models at eBay, achieving the +objectives mentioned above. It supports near real-time inferencing in +resource-constrained production environments and scales effectively for +billions of items. + +
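+ A hedged sketch of the core extraction idea mentioned above: generate candidate keyphrases as permutations of tokens taken from an item title. The graph construction and buyer-outreach metrics that GraphEx uses on top of this are omitted; the function and its parameters are illustrative.

# Candidate keyphrases as token permutations of an item title.
from itertools import permutations

def candidate_keyphrases(title: str, max_len: int = 3) -> set:
    tokens = [t.lower() for t in title.split() if t.isalpha()]
    phrases = set()
    for n in range(1, max_len + 1):
        for combo in permutations(tokens, n):
            phrases.add(" ".join(combo))
    return phrases

print(sorted(candidate_keyphrases("Wireless Noise Cancelling Headphones"))[:10])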
+
+
+
+
+ + ☆ Experimentation in Content Moderation using RWKV + + +
+ This paper investigates the RWKV model's efficacy in content moderation +through targeted experimentation. We introduce a novel dataset specifically +designed for distillation into smaller models, enhancing content moderation +practices. This comprehensive dataset encompasses images, videos, sounds, and +text data that present societal challenges. Leveraging advanced Large Language +Models (LLMs), we generated an extensive set of responses -- 558,958 for text +and 83,625 for images -- to train and refine content moderation systems. Our +core experimentation involved fine-tuning the RWKV model, capitalizing on its +CPU-efficient architecture to address large-scale content moderation tasks. By +highlighting the dataset's potential for knowledge distillation, this study not +only demonstrates RWKV's capability in improving the accuracy and efficiency of +content moderation systems but also paves the way for developing more compact, +resource-efficient models in this domain. Datasets and models can be found in +HuggingFace: https://huggingface.co/modrwkv + +
+
+
+
+
+ + ☆ CACER: Clinical Concept Annotations for Cancer Events and Relations + + +
+ Clinical notes contain unstructured representations of patient histories, +including the relationships between medical problems and prescription drugs. To +investigate the relationship between cancer drugs and their associated symptom +burden, we extract structured, semantic representations of medical problem and +drug information from the clinical narratives of oncology notes. We present +Clinical Concept Annotations for Cancer Events and Relations (CACER), a novel +corpus with fine-grained annotations for over 48,000 medical problems and drug +events and 10,000 drug-problem and problem-problem relations. Leveraging CACER, +we develop and evaluate transformer-based information extraction (IE) models +such as BERT, Flan-T5, Llama3, and GPT-4 using fine-tuning and in-context +learning (ICL). In event extraction, the fine-tuned BERT and Llama3 models +achieved the highest performance at 88.2-88.0 F1, which is comparable to the +inter-annotator agreement (IAA) of 88.4 F1. In relation extraction, the +fine-tuned BERT, Flan-T5, and Llama3 achieved the highest performance at +61.8-65.3 F1. GPT-4 with ICL achieved the worst performance across both tasks. +The fine-tuned models significantly outperformed GPT-4 in ICL, highlighting the +importance of annotated training data and model optimization. Furthermore, the +BERT models performed similarly to Llama3. For our task, LLMs offer no +performance advantage over the smaller BERT models. The results emphasize the +need for annotated training data to optimize models. Multiple fine-tuned +transformer models achieved performance comparable to IAA for several +extraction tasks. + +
+
+ comment: This is a pre-copy-editing, author-produced PDF of an article + accepted for publication in JAMIA following peer review. The definitive + publisher-authenticated version is available online at + https://academic.oup.com/jamia/advance-article/doi/10.1093/jamia/ocae231/7748302 +
+
+
+
+
+ + ☆ Sirius: Contextual Sparsity with Correction for Efficient LLMs + + +
+ With the blossom of large language models (LLMs), inference efficiency +becomes increasingly important. Various approximation methods are proposed to +reduce the cost at inference time. Contextual Sparsity (CS) is appealing for +its training-free nature and its ability to reach a higher compression ratio +seemingly without quality degradation. However, after a comprehensive +evaluation of contextual sparsity methods on various complex generation tasks, +we find that although CS succeeds in prompt-understanding tasks, CS +significantly degrades the model performance for reasoning, deduction, and +knowledge-based tasks. Despite the gap in end-to-end accuracy, we observed that +sparse models often share general problem-solving logic and require only a few +token corrections to recover the original model performance. This paper +introduces Sirius, an efficient correction mechanism, which significantly +recovers CS models quality on reasoning tasks while maintaining its efficiency +gain. Sirius is evaluated on 6 models with 8 difficult generation tasks in +reasoning, math, and coding and shows consistent effectiveness and efficiency. +Also, we carefully develop a system implementation for Sirius and show that +Sirius achieves roughly 20% reduction in latency for 8B model on-chip and 35% +reduction for 70B model offloading. We open-source our implementation of Sirius +at https://github.com/Infini-AI-Lab/Sirius.git. + +
+
+
+
+
+ + ☆ Persona Setting Pitfall: Persistent Outgroup Biases in Large Language + Models Arising from Social Identity Adoption + + +
+ Drawing parallels between human cognition and artificial intelligence, we +explored how large language models (LLMs) internalize identities imposed by +targeted prompts. Informed by Social Identity Theory, these identity +assignments lead LLMs to distinguish between "we" (the ingroup) and "they" (the +outgroup). This self-categorization generates both ingroup favoritism and +outgroup bias. Nonetheless, existing literature has predominantly focused on +ingroup favoritism, often overlooking outgroup bias, which is a fundamental +source of intergroup prejudice and discrimination. Our experiment addresses +this gap by demonstrating that outgroup bias manifests as strongly as ingroup +favoritism. Furthermore, we successfully mitigated the inherent pro-liberal, +anti-conservative bias in LLMs by guiding them to adopt the perspectives of the +initially disfavored group. These results were replicated in the context of +gender bias. Our findings highlight the potential to develop more equitable and +balanced language models. + +
+
+ comment: 23 pages, 5 figures +
+
+
+
+
+ + ☆ How Do Your Code LLMs Perform? Empowering Code Instruction Tuning with + High-Quality Data + + +
+ Recently, there has been a growing interest in studying how to construct +better code instruction tuning data. However, we observe that code models trained +with these datasets exhibit high performance on HumanEval but perform worse on +other benchmarks such as LiveCodeBench. Upon further investigation, we find +that many datasets suffer from severe data leakage. After cleaning up most of +the leaked data, some well-known high-quality datasets perform poorly. This +discovery reveals a new challenge: identifying which datasets genuinely qualify +as high-quality code instruction data. To address this, we propose an efficient +code data pruning strategy for selecting good samples. Our approach is based on +three dimensions: instruction complexity, response quality, and instruction +diversity. Based on our selected data, we present XCoder, a family of models +finetuned from LLaMA3. Our experiments show that XCoder achieves new +state-of-the-art performance using less training data, verifying the +effectiveness of our data strategy. Moreover, we perform a comprehensive +analysis of the data composition and find that existing code datasets have different +characteristics according to their construction methods, which provides new +insights for future code LLMs. Our models and dataset are released at +https://github.com/banksy23/XCoder + +
+
+ comment: Working in progress +
+
+
+
+
+ + ♻ ☆ PESTS: Persian_English Cross Lingual Corpus for Semantic Textual + Similarity + + +
+ One of the components of natural language processing that has received a lot +of investigation recently is semantic textual similarity. In computational +linguistics and natural language processing, assessing the semantic similarity +of words, phrases, paragraphs, and texts is crucial. Calculating the degree of +semantic resemblance between two textual pieces, paragraphs, or phrases +provided in both monolingual and cross-lingual versions is known as semantic +similarity. Cross-lingual semantic similarity requires corpora in which there +are sentence pairs in both the source and target languages with a degree of +semantic similarity between them. Many existing cross-lingual semantic +similarity models rely on machine translation due to the unavailability of +cross-lingual semantic similarity datasets, and the propagation of machine +translation errors reduces the accuracy of these models. On the other hand, when +semantic similarity features are to be used for machine translation, the same +machine translation should not also be used to compute semantic similarity. For Persian, +which is a low-resource language, no effort has been made in this +regard, and the need for a model that can understand the context of the two +languages is felt more than ever. In this article, a corpus of semantic +textual similarity between Persian and English sentences is +produced for the first time with the help of linguistic experts. We named this dataset +PESTS (Persian English Semantic Textual Similarity). This corpus contains 5375 +sentence pairs. Also, different models based on transformers have been +fine-tuned using this dataset. The results show that using the PESTS dataset, +the Pearson correlation of the XLM ROBERTa model increases from 85.87% to +95.62%. + +
+
+
+
+
+ + ♻ ☆ Kun: Answer Polishment for Chinese Self-Alignment with Instruction + Back-Translation + + +
+ In this paper, we introduce Kun, a novel approach for creating high-quality +instruction-tuning datasets for large language models (LLMs) without relying on +manual annotations. Adapting a self-training algorithm based on instruction +back-translation and answer polishment, Kun leverages unlabelled data from +diverse sources such as Wudao, Wanjuan, and SkyPile to generate a substantial +dataset of over a million Chinese instructional data points. This approach +significantly deviates from traditional methods by using a self-curation +process to refine and select the most effective instruction-output pairs. Our +experiments with the 6B-parameter Yi model across various benchmarks +demonstrate Kun's robustness and scalability. Our method's core contributions +lie in its algorithmic advancement, which enhances data retention and clarity, +and its innovative data generation approach that substantially reduces the +reliance on costly and time-consuming manual annotations. This methodology +presents a scalable and efficient solution for improving the +instruction-following capabilities of LLMs, with significant implications for +their application across diverse fields. The code and dataset can be found at +https://github.com/Zheng0428/COIG-Kun + +
+
+ comment: 12 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Data Mixture Inference: What do BPE Tokenizers Reveal about their + Training Data? + + +
+ The pretraining data of today's strongest language models is opaque; in +particular, little is known about the proportions of various domains or +languages represented. In this work, we tackle a task which we call data +mixture inference, which aims to uncover the distributional make-up of training +data. We introduce a novel attack based on a previously overlooked source of +information: byte-pair encoding (BPE) tokenizers, used by the vast majority of +modern language models. Our key insight is that the ordered list of merge rules +learned by a BPE tokenizer naturally reveals information about the token +frequencies in its training data. Given a tokenizer's merge list along with +example data for each category of interest, we formulate a linear program that +solves for the proportion of each category in the tokenizer's training set. In +controlled experiments, we show that our attack recovers mixture ratios with +high precision for tokenizers trained on known mixtures of natural languages, +programming languages, and data sources. We then apply our approach to +off-the-shelf tokenizers released with recent LMs. We confirm much publicly +disclosed information about these models, and also make several new inferences: +GPT-4o and Mistral NeMo's tokenizers are much more multilingual than their +predecessors, training on 39% and 47% non-English language data, respectively; +Llama 3 extends GPT-3.5's tokenizer primarily for multilingual (48%) use; +GPT-3.5's and Claude's tokenizers are trained on predominantly code (~60%). We +hope our work sheds light on current design practices for pretraining data, and +inspires continued research into data mixture inference for LMs. + +
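+ A much-simplified, hedged stand-in for the idea above: instead of the paper's linear program over BPE merge rules, the sketch recovers mixture proportions from overall token frequencies, which illustrates the same principle (observed statistics are a mixture of per-category statistics). All quantities are synthetic.

# Recover mixture proportions from token-frequency profiles (toy stand-in).
import numpy as np
from scipy.optimize import nnls

rng = np.random.default_rng(1)
n_tokens, n_cats = 200, 3

# Per-category token frequency profiles (columns) and a hidden mixture.
F = rng.dirichlet(np.ones(n_tokens), size=n_cats).T      # (n_tokens, n_cats)
true_alpha = np.array([0.6, 0.3, 0.1])
observed = F @ true_alpha                                 # tokenizer-training-corpus statistics

alpha, _ = nnls(F, observed)       # non-negative least squares
alpha /= alpha.sum()               # renormalise to proportions
print("recovered mixture:", np.round(alpha, 3))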
+
+ comment: new robustness experiments; new baselines; include Mistral, + Mistral-Nemo and GPT-NeoX; link to code +
+
+
+
+
+ + ♻ ☆ Cost-Efficient Subjective Task Annotation and Modeling through Few-Shot + Annotator Adaptation + + +
+ In subjective NLP tasks, where a single ground truth does not exist, the +inclusion of diverse annotators becomes crucial as their unique perspectives +significantly influence the annotations. In realistic scenarios, the annotation +budget often becomes the main determinant of the number of perspectives (i.e., +annotators) included in the data and subsequent modeling. We introduce a novel +framework for annotation collection and modeling in subjective tasks that aims +to minimize the annotation budget while maximizing the predictive performance +for each annotator. Our framework has a two-stage design: first, we rely on a +small set of annotators to build a multitask model, and second, we augment the +model for a new perspective by strategically annotating a few samples per +annotator. To test our framework at scale, we introduce and release a unique +dataset, Moral Foundations Subjective Corpus, of 2000 Reddit posts annotated by +24 annotators for moral sentiment. We demonstrate that our framework surpasses +the previous SOTA in capturing the annotators' individual perspectives with as +little as 25% of the original annotation budget on two datasets. Furthermore, +our framework results in more equitable models, reducing the performance +disparity among annotators. + +
+
+
+
+
+ + ♻ ☆ Exploring Group and Symmetry Principles in Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated impressive performance across +a wide range of applications; however, assessing their reasoning capabilities +remains a significant challenge. In this paper, we introduce a framework +grounded in group and symmetry principles, which have played a crucial role in +fields such as physics and mathematics, and offer another way to evaluate their +capabilities. While the proposed framework is general, to showcase the benefits +of employing these properties, we focus on arithmetic reasoning and investigate +the performance of these models on four group properties: closure, identity, +inverse, and associativity. Our findings reveal that the LLMs studied in this work +struggle to preserve group properties across different test regimes. In the +closure test, we observe biases towards specific outputs and an abrupt +degradation in their performance from 100% to 0% after a specific sequence +length. They also perform poorly in the identity test, which involves adding +irrelevant information to the context, and show sensitivity when subjected to +the inverse test, which examines the robustness of the model with respect to +negation. In addition, we demonstrate that breaking down problems into smaller +steps helps LLMs in the associativity test that we have conducted. To support +these tests, we have developed a synthetic dataset, which will be released. + +
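+ A hedged sketch of how the four group-property probes described above could be instantiated for integer addition; `ask_model` is a placeholder for an LLM call and is replaced here by exact arithmetic so the script runs on its own.

# Closure, identity, inverse, and associativity probes for integer addition.
import random

def ask_model(expr: str) -> int:
    return eval(expr)  # stand-in for an LLM answer, assumed correct here

def closure(xs):        return all(isinstance(ask_model(f"{a}+{b}"), int) for a, b in zip(xs, xs[1:]))
def identity(xs):       return all(ask_model(f"{a}+0") == a for a in xs)
def inverse(xs):        return all(ask_model(f"{a}+({-a})") == 0 for a in xs)
def associativity(xs):  return all(
    ask_model(f"({a}+{b})+{c}") == ask_model(f"{a}+({b}+{c})")
    for a, b, c in zip(xs, xs[1:], xs[2:])
)

nums = [random.randint(-50, 50) for _ in range(10)]
for name, test in [("closure", closure), ("identity", identity),
                   ("inverse", inverse), ("associativity", associativity)]:
    print(name, "passes:", test(nums))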
+
+
+
+
+ + ♻ ☆ Positioning Political Texts with Large Language Models by Asking and + Averaging + + +
+ We use instruction-tuned Large Language Models (LLMs) like GPT-4, Llama 3, +MiXtral, or Aya to position political texts within policy and ideological +spaces. We ask an LLM where a tweet or a sentence of a political text stands on +the focal dimension and take the average of the LLM responses to position +political actors such as US Senators, or longer texts such as UK party +manifestos or EU policy speeches given in 10 different languages. The +correlations between the position estimates obtained with the best LLMs and +benchmarks based on text coding by experts, crowdworkers, or roll call votes +exceed .90. This approach is generally more accurate than the positions +obtained with supervised classifiers trained on large amounts of research data. +Using instruction-tuned LLMs to position texts in policy and ideological spaces +is fast, cost-efficient, reliable, and reproducible (in the case of open LLMs) +even if the texts are short and written in different languages. We conclude +with cautionary notes about the need for empirical validation. + +
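+ A hedged sketch of the "ask and average" procedure described above: ask a model for a numeric position on the focal dimension several times and average the replies. `query_llm` is a placeholder stub, not a real API call, and the prompt wording is illustrative.

# Ask repeatedly for a 0-10 position and average the parseable replies.
import random
import statistics

def query_llm(prompt: str) -> str:
    # Stand-in: a real implementation would call an instruction-tuned LLM.
    return str(random.uniform(0, 10))

def position_text(text: str, dimension: str, n_calls: int = 5) -> float:
    prompt = (
        f"On a scale from 0 ({dimension} left pole) to 10 ({dimension} right pole), "
        f"where does the following text stand? Reply with a number only.\n\n{text}"
    )
    scores = []
    for _ in range(n_calls):
        reply = query_llm(prompt)
        try:
            scores.append(float(reply.strip()))
        except ValueError:
            continue  # skip malformed replies
    return statistics.mean(scores)

print(position_text("We must cut taxes and shrink government.", "economic left-right"))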
+
+
+
+
+ + ♻ ☆ Towards Evaluating and Building Versatile Large Language Models for + Medicine + + +
+ In this study, we present MedS-Bench, a comprehensive benchmark designed to +evaluate the performance of large language models (LLMs) in clinical contexts. +Unlike existing benchmarks that focus on multiple-choice question answering, +MedS-Bench spans 11 high-level clinical tasks, including clinical report +summarization, treatment recommendations, diagnosis, named entity recognition, +and medical concept explanation, among others. We evaluated six leading LLMs, +namely MEDITRON, Mistral, InternLM 2, Llama 3, GPT-4, and Claude-3.5, using +few-shot prompting, and found that even the most sophisticated models struggle +with these complex tasks. To address these limitations, we developed MedS-Ins, +a large-scale instruction tuning dataset for medicine. MedS-Ins comprises 58 +medically oriented language corpora, totaling 13.5 million samples across 122 +tasks. To demonstrate the dataset's utility, we conducted a proof-of-concept +experiment by performing instruction tuning on a lightweight, open-source +medical language model. The resulting model, MMedIns-Llama 3, significantly +outperformed existing models across nearly all clinical tasks. To promote +further advancements in the application of LLMs to clinical challenges, we have +made the MedS-Ins dataset fully accessible and invite the research community to +contribute to its expansion. Additionally, we have launched a dynamic +leaderboard for MedS-Bench, whose test set we plan to update regularly to +track progress and enhance the adaptation of general LLMs to the medical +domain. Leaderboard: https://henrychur.github.io/MedS-Bench/. Github: +https://github.com/MAGIC-AI4Med/MedS-Ins. + +
+
+
+
+
+ + ♻ ☆ Legilimens: Practical and Unified Content Moderation for Large Language + Model Services CCS + + +
+ Given the societal impact of unsafe content generated by large language +models (LLMs), ensuring that LLM services comply with safety standards is a +crucial concern for LLM service providers. Common content moderation methods +are limited by an effectiveness-and-efficiency dilemma, where simple models are +fragile while sophisticated models consume excessive computational resources. +In this paper, we reveal for the first time that effective and efficient +content moderation can be achieved by extracting conceptual features from +chat-oriented LLMs, despite their initial fine-tuning for conversation rather +than content moderation. We propose a practical and unified content moderation +framework for LLM services, named Legilimens, which features both effectiveness +and efficiency. Our red-team model-based data augmentation enhances the +robustness of Legilimens against state-of-the-art jailbreaking. Additionally, +we develop a framework to theoretically analyze the cost-effectiveness of +Legilimens compared to other methods. We have conducted extensive experiments +on five host LLMs, seventeen datasets, and nine jailbreaking methods to verify +the effectiveness, efficiency, and robustness of Legilimens against normal and +adaptive adversaries. A comparison of Legilimens with both commercial and +academic baselines demonstrates the superior performance of Legilimens. +Furthermore, we confirm that Legilimens can be applied to few-shot scenarios +and extended to multi-label classification tasks. + +
+
+ comment: Accepted by ACM Conference on Computer and Communications Security + (CCS) 2024 +
+
+
+
+
+ + ♻ ☆ Tracing Privacy Leakage of Language Models to Training Data via Adjusted + Influence Functions + + +
+ The responses generated by Large Language Models (LLMs) can include sensitive +information from individuals and organizations, leading to potential privacy +leakage. This work implements Influence Functions (IFs) to trace privacy +leakage back to the training data, thereby mitigating privacy concerns of +Language Models (LMs). However, we notice that current IFs struggle to +accurately estimate the influence of tokens with large gradient norms, +potentially overestimating their influence. When tracing the most influential +samples, this leads to frequently tracing back to samples with large gradient +norm tokens, overshadowing the actual most influential samples even if their +influences are well estimated. To address this issue, we propose Heuristically +Adjusted IF (HAIF), which reduces the weight of tokens with large gradient +norms, thereby significantly improving the accuracy of tracing the most +influential samples. To establish easily obtained ground truth for tracing +privacy leakage, we construct two datasets, PII-E and PII-CR, representing two +distinct scenarios: one with identical text in the model outputs and +pre-training data, and the other where models leverage their reasoning +abilities to generate text divergent from pre-training data. HAIF significantly +improves tracing accuracy, enhancing it by 20.96% to 73.71% on the PII-E +dataset and 3.21% to 45.93% on the PII-CR dataset, compared to the best SOTA +IFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs +on real-world pretraining data CLUECorpus2020, demonstrating strong robustness +regardless of prompt and response lengths. +
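As a loose illustration of the adjustment described above (down-weighting tokens with large gradient norms before aggregating token-level influence), one might write something like the following. The specific `1 / (1 + alpha * norm)` weighting is a guess for illustration, not the paper's formula.

```python
# Hypothetical sketch of heuristically adjusted token weighting for influence
# estimation; the weighting function is an assumption, not HAIF's actual rule.
import numpy as np


def adjusted_token_weights(grad_norms: np.ndarray, alpha: float = 1.0) -> np.ndarray:
    # Down-weight tokens whose gradient norm is large so that they cannot
    # dominate the influence estimate for a training sample.
    return 1.0 / (1.0 + alpha * grad_norms)


def sample_influence(token_influences: np.ndarray, grad_norms: np.ndarray) -> float:
    # Aggregate per-token influence values with the adjusted weights.
    w = adjusted_token_weights(grad_norms)
    return float(np.sum(w * token_influences))
```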
+
+
+
+
+ + ♻ ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning +community that does not require the collection of raw training data and does +not require expensive computation. As model merging becomes increasingly +prevalent across various fields, it is crucial to understand the available +model merging techniques comprehensively. However, there is a significant gap +in the literature regarding a systematic and thorough review of these +techniques. This survey provides a comprehensive overview of model merging +methods and theories, their applications in various domains and settings, and +future research directions. Specifically, we first propose a new taxonomic +approach that exhaustively discusses existing model merging methods. Secondly, +we discuss the application of model merging techniques in large language +models, multimodal large language models, and 10+ machine learning subfields, +including continual learning, multi-task learning, few-shot learning, etc. +Finally, we highlight the remaining challenges of model merging and discuss +future research directions. A comprehensive list of papers about model merging +is available at +\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}. + +
+
+
+
+
+ + ♻ ☆ Unleashing the potential of prompt engineering in Large Language Models: + a comprehensive review + + +
+ This comprehensive review delves into the pivotal role of prompt engineering +in unleashing the capabilities of Large Language Models (LLMs). The development +of Artificial Intelligence (AI), from its inception in the 1950s to the +emergence of advanced neural networks and deep learning architectures, has made +a breakthrough in LLMs, with models such as GPT-4o and Claude-3, and in +Vision-Language Models (VLMs), with models such as CLIP and ALIGN. Prompt +engineering is the process of structuring inputs, which has emerged as a +crucial technique to maximize the utility and accuracy of these models. This +paper explores both foundational and advanced methodologies of prompt +engineering, including techniques such as self-consistency, chain-of-thought, +and generated knowledge, which significantly enhance model performance. +Additionally, it examines the prompt method of VLMs through innovative +approaches such as Context Optimization (CoOp), Conditional Context +Optimization (CoCoOp), and Multimodal Prompt Learning (MaPLe). Critical to this +discussion is the aspect of AI security, particularly adversarial attacks that +exploit vulnerabilities in prompt engineering. Strategies to mitigate these +risks and enhance model robustness are thoroughly reviewed. The evaluation of +prompt methods is also addressed, through both subjective and objective +metrics, ensuring a robust analysis of their efficacy. This review also +reflects the essential role of prompt engineering in advancing AI capabilities, +providing a structured framework for future research and application. + +
+
+
+
+
+ + ♻ ☆ Enhancing Code-Switching Speech Recognition with LID-Based Collaborative + Mixture of Experts Model + + +
+ Due to the inherent difficulty in modeling phonetic similarities across +different languages, code-switching speech recognition presents a formidable +challenge. This study proposes a Collaborative-MoE, a Mixture of Experts (MoE) +model that leverages a collaborative mechanism among expert groups. Initially, +a preceding routing network explicitly learns Language Identification (LID) +tasks and selects experts based on acquired LID weights. This process ensures +robust routing information to the MoE layer, mitigating interference from +diverse language domains on expert network parameter updates. The LID weights +are also employed to facilitate inter-group collaboration, enabling the +integration of language-specific representations. Furthermore, within each +language expert group, a gating network operates unsupervised to foster +collaboration on attributes beyond language. Extensive experiments demonstrate +the efficacy of our approach, achieving significant performance enhancements +compared to alternative methods. Importantly, our method preserves the +efficient inference capabilities characteristic of MoE models without +necessitating additional pre-training. + +
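A rough sketch of the routing scheme described here: an LID router gates language-specific expert groups, while an unsupervised gate mixes experts within each group. Layer sizes, the use of plain linear experts, and the mixing rule are illustrative assumptions, not the paper's architecture.

```python
# Sketch of LID-gated expert groups (assumptions throughout).
import torch
import torch.nn as nn


class LIDRoutedMoE(nn.Module):
    def __init__(self, dim: int, n_langs: int = 2, experts_per_lang: int = 4):
        super().__init__()
        self.lid_router = nn.Linear(dim, n_langs)  # trained with LID supervision (assumed)
        self.groups = nn.ModuleList([
            nn.ModuleList([nn.Linear(dim, dim) for _ in range(experts_per_lang)])
            for _ in range(n_langs)
        ])
        self.gates = nn.ModuleList([nn.Linear(dim, experts_per_lang) for _ in range(n_langs)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        lid_w = self.lid_router(x).softmax(-1)            # (..., n_langs)
        out = torch.zeros_like(x)
        for g, (experts, gate) in enumerate(zip(self.groups, self.gates)):
            gate_w = gate(x).softmax(-1)                  # unsupervised intra-group gate
            group_out = sum(gate_w[..., i:i + 1] * expert(x)
                            for i, expert in enumerate(experts))
            out = out + lid_w[..., g:g + 1] * group_out   # LID weights mix expert groups
        return out
```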
+
+ comment: Accepted by IEEE SLT 2024 +
+
+
+
+
+ + ♻ ☆ Temporal Order Preserved Optimal Transport-based Cross-modal Knowledge + Transfer Learning for ASR + + +
+ Transferring linguistic knowledge from a pretrained language model (PLM) to +an acoustic model has been shown to greatly improve the performance of +automatic speech recognition (ASR). However, due to the heterogeneous feature +distributions in cross-modalities, designing an effective model for feature +alignment and knowledge transfer between linguistic and acoustic sequences +remains a challenging task. Optimal transport (OT), which efficiently measures +probability distribution discrepancies, holds great potential for aligning and +transferring knowledge between acoustic and linguistic modalities. Nonetheless, +the original OT treats acoustic and linguistic feature sequences as two +unordered sets in alignment and neglects temporal order information during OT +coupling estimation. Consequently, a time-consuming pretraining stage is +required to learn a good alignment between the acoustic and linguistic +representations. In this paper, we propose a Temporal Order Preserved OT +(TOT)-based Cross-modal Alignment and Knowledge Transfer (CAKT) (TOT-CAKT) for +ASR. In the TOT-CAKT, local neighboring frames of acoustic sequences are +smoothly mapped to neighboring regions of linguistic sequences, preserving +their temporal order relationship in feature alignment and matching. With the +TOT-CAKT model framework, we conduct Mandarin ASR experiments with a pretrained +Chinese PLM for linguistic knowledge transfer. Our results demonstrate that the +proposed TOT-CAKT significantly improves ASR performance compared to several +state-of-the-art models employing linguistic knowledge transfer, and addresses +the weaknesses of the original OT-based method in sequential feature alignment +for ASR. + +
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ♻ ☆ LogicGame: Benchmarking Rule-Based Reasoning Abilities of Large Language + Models + + +
+ Large Language Models (LLMs) have demonstrated notable capabilities across +various tasks, showcasing complex problem-solving abilities. Understanding and +executing complex rules, along with multi-step planning, are fundamental to +logical reasoning and critical for practical LLM agents and decision-making +systems. However, evaluating LLMs as effective rule-based executors and +planners remains underexplored. In this paper, we introduce LogicGame, a novel +benchmark designed to evaluate the comprehensive rule understanding, execution, +and planning capabilities of LLMs. Unlike traditional benchmarks, LogicGame +provides diverse games that contain a series of rules with an initial state, +requiring models to comprehend and apply predefined regulations to solve +problems. We create simulated scenarios in which models execute or plan +operations to achieve specific outcomes. These game scenarios are specifically +designed to distinguish logical reasoning from mere knowledge by relying +exclusively on predefined rules. This separation allows for a pure assessment +of rule-based reasoning capabilities. The evaluation considers not only final +outcomes but also intermediate steps, providing a comprehensive assessment of +model performance. Moreover, these intermediate steps are deterministic and can +be automatically verified. LogicGame defines game scenarios with varying +difficulty levels, from simple rule applications to complex reasoning chains, +in order to offer a precise evaluation of model performance on rule +understanding and multi-step execution. Utilizing LogicGame, we test various +LLMs and identify notable shortcomings in their rule-based logical reasoning +abilities. + +
+
+
+
+
+ + ♻ ☆ Exposing and Explaining Fake News On-the-Fly + + +
+ Social media platforms enable the rapid dissemination and consumption of +information. However, users instantly consume such content regardless of the +reliability of the shared data. Consequently, this crowdsourcing model is +exposed to manipulation. This work contributes an explainable and online +classification method to recognize fake news in real-time. The proposed method +combines both unsupervised and supervised Machine Learning approaches with +online created lexica. The profiling is built using creator-, content- and +context-based features using Natural Language Processing techniques. The +explainable classification mechanism displays in a dashboard the features +selected for classification and the prediction confidence. The performance of +the proposed solution has been validated with real data sets from Twitter and +the results attain 80% accuracy and macro F-measure. This proposal is the +first to jointly provide data stream processing, profiling, classification and +explainability. Ultimately, the proposed early detection, isolation and +explanation of fake news contribute to increasing the quality and trustworthiness +of social media content. +
+
+
+
+
+ + ♻ ☆ A review on the use of large language models as virtual tutors + + +
+ Transformer architectures contribute to managing long-term dependencies for +Natural Language Processing, representing one of the most recent changes in the +field. These architectures are the basis of the innovative, cutting-edge Large +Language Models (LLMs) that have produced a huge buzz in several fields and +industrial sectors, among which education stands out. Accordingly, these +generative Artificial Intelligence-based solutions have directed the change in +techniques and the evolution in educational methods and contents, along with +network infrastructure, towards high-quality learning. Given the popularity of +LLMs, this review seeks to provide a comprehensive overview of those solutions +designed specifically to generate and evaluate educational materials and which +involve students and teachers in their design or experimental plan. To the best +of our knowledge, this is the first review of educational applications (e.g., +student assessment) of LLMs. As expected, the most common role of these systems +is as virtual tutors for automatic question generation. Moreover, the most +popular models are GPT-3 and BERT. However, due to the continuous launch of new +generative models, new works are expected to be published shortly. +
+
+
+
+
+ + ♻ ☆ Improving Speaker Assignment in Speaker-Attributed ASR for Real Meeting + Applications + + +
+ Past studies on end-to-end meeting transcription have focused on model +architecture and have mostly been evaluated on simulated meeting data. We +present a novel study aiming to optimize the use of a Speaker-Attributed ASR +(SA-ASR) system in real-life scenarios, such as the AMI meeting corpus, for +improved speaker assignment of speech segments. First, we propose a pipeline +tailored to real-life applications involving Voice Activity Detection (VAD), +Speaker Diarization (SD), and SA-ASR. Second, we advocate using VAD output +segments to fine-tune the SA-ASR model, considering that it is also applied to +VAD segments during test, and show that this results in a relative reduction of +Speaker Error Rate (SER) up to 28%. Finally, we explore strategies to enhance +the extraction of the speaker embedding templates used as inputs by the SA-ASR +system. We show that extracting them from SD output rather than annotated +speaker segments results in a relative SER reduction up to 20%. + +
+
+ comment: Submitted to Odyssey 2024 +
+
+
+
+
+ + ♻ ☆ Pooling And Attention: What Are Effective Designs For LLM-Based + Embedding Models? + + +
+ The significant advancements of Large Language Models (LLMs) in generative +tasks have led to a growing body of work exploring LLM-based embedding models. +While these models, employing different pooling and attention strategies, have +achieved state-of-the-art performance on public embedding benchmarks, questions +still arise about what constitutes an effective design for LLM-based embedding +models. However, these models are often trained on different datasets, using +different LLM base models or training settings. Moreover, evaluations on public +embedding benchmarks often fail to report statistical significance, making it +difficult to determine which designs truly contribute to final performance. +This complicates the process for practitioners seeking optimal training recipes +for LLM-based embedding models. In this study, we conduct a large-scale +experiment by training a series of LLM-based embedding models using the same +training data and base model but differing in their pooling and attention +strategies. The results show that there is no one-size-fits-all solution: while +bidirectional attention and an additional trainable pooling layer outperform in +text similarity and information retrieval tasks, they do not significantly +surpass simpler designs like EOS-last token pooling and default causal +attention in clustering and classification tasks. Furthermore, we propose a new +pooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs +of all hidden layers, rather than just the last layer, using a cross-attention +network. This method proves to be statistically superior in text similarity and +retrieval tasks compared to existing pooling methods. Overall, this paper sheds +light on effective training strategies for LLM-based embedding models. + +
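For reference, the two simpler baselines mentioned above (EOS/last-token pooling and mean pooling) look roughly like this; tensor shapes and masking conventions are assumed for illustration.

```python
# Sketch of two common pooling strategies for LLM-based embedding models.
import torch


def last_token_pooling(hidden: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
    # hidden: (batch, seq, dim); attn_mask: (batch, seq) with 1 for real tokens.
    last_idx = (attn_mask.sum(dim=1) - 1).long()          # position of final (EOS-like) token
    return hidden[torch.arange(hidden.size(0)), last_idx]


def mean_pooling(hidden: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
    mask = attn_mask.unsqueeze(-1).float()
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-6)
```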
+
+ comment: https://github.com/yixuantt/PoolingAndAttn +
+
+
+
+
+ + ♻ ☆ CAVE: Controllable Authorship Verification Explanations + + +
+ Authorship Verification (AV) (do two documents have the same author?) is +essential in many sensitive real-life applications. AV is often used in +proprietary domains that require a private, offline model, making SOTA online +models like ChatGPT undesirable. Current offline models however have lower +downstream utility due to low accuracy/scalability (eg: traditional stylometry +AV systems) and lack of accessible post-hoc explanations. In this work, we take +the first step to address the above challenges with our trained, offline +Llama-3-8B model CAVE (Controllable Authorship Verification Explanations): CAVE +generates free-text AV explanations that are controlled to be (1) structured +(can be decomposed into sub-explanations in terms of relevant linguistic +features), and (2) easily verified for explanation-label consistency (via +intermediate labels in sub-explanations). We first engineer a prompt that can +generate silver training data from a SOTA teacher model in the desired CAVE +output format. We then filter and distill this data into a pretrained +Llama-3-8B, our carefully selected student model. Results on three difficult AV +datasets IMDb62, Blog-Auth, and Fanfiction show that CAVE generates high +quality explanations (as measured by automatic and human evaluation) as well as +competitive task accuracies. + +
+
+
+
+
+ + ♻ ☆ Aligning Large Language Models to a Domain-specific Graph Database for + NL2GQL + + +
+ Graph Databases (Graph DB) find extensive application across diverse domains +such as finance, social networks, and medicine. Yet, the translation of Natural +Language (NL) into the Graph Query Language (GQL), referred to as NL2GQL, poses +significant challenges owing to its intricate and specialized nature. Some +approaches have sought to utilize Large Language Models (LLMs) to address +analogous tasks like text2SQL. Nonetheless, in the realm of NL2GQL tasks +tailored to a particular domain, the absence of domain-specific NL-GQL data +pairs adds complexity to aligning LLMs with the graph DB. To tackle this +challenge, we present a well-defined pipeline. Initially, we utilize ChatGPT to +generate NL-GQL data pairs, leveraging the provided graph DB with +self-instruction. Subsequently, we employ the generated data to fine-tune LLMs, +ensuring alignment between LLMs and the graph DB. Moreover, we find the +importance of relevant schema in efficiently generating accurate GQLs. Thus, we +introduce a method to extract relevant schema as the input context. We evaluate +our method using two carefully constructed datasets derived from graph DBs in +the finance and medicine domains, named FinGQL and MediGQL. Experimental +results reveal that our approach significantly outperforms a set of baseline +methods, with improvements of 5.90 and 6.36 absolute points on EM, and 6.00 and +7.09 absolute points on EX for FinGQL and MediGQL, respectively. + +
+
+ comment: 13 pages,2 figures +
+
+
+
+
+ + ♻ ☆ More Text, Less Point: Towards 3D Data-Efficient Point-Language + Understanding + + +
+ Enabling Large Language Models (LLMs) to comprehend the 3D physical world +remains a significant challenge. Due to the lack of large-scale 3D-text pair +datasets, the success of LLMs has yet to be replicated in 3D understanding. In +this paper, we rethink this issue and propose a new task: 3D Data-Efficient +Point-Language Understanding. The goal is to enable LLMs to achieve robust 3D +object understanding with minimal 3D point cloud and text data pairs. To +address this task, we introduce GreenPLM, which leverages more text data to +compensate for the lack of 3D data. First, inspired by using CLIP to align +images and text, we utilize a pre-trained point cloud-text encoder to map the +3D point cloud space to the text space. This mapping allows us to seamlessly +connect the text space with LLMs. Once the point-text-LLM connection is +established, we further enhance text-LLM alignment by expanding the +intermediate text space, thereby reducing the reliance on 3D point cloud data. +Specifically, we generate 6M free-text descriptions of 3D objects, and design a +three-stage training strategy to help LLMs better explore the intrinsic +connections between different modalities. To achieve efficient modality +alignment, we design a zero-parameter cross-attention module for token pooling. +Extensive experimental results show that GreenPLM requires only 12% of the 3D +training data used by existing state-of-the-art models to achieve superior 3D +understanding. Remarkably, GreenPLM also achieves competitive performance using +text-only data. The code and weights are available at: +https://github.com/TangYuan96/GreenPLM. +
+
+
+
+
+ + ♻ ☆ OpenFact at CheckThat! 2024: Combining Multiple Attack Methods for + Effective Adversarial Text Generation + + +
+ This paper presents the experiments and results for the CheckThat! Lab at +CLEF 2024 Task 6: Robustness of Credibility Assessment with Adversarial +Examples (InCrediblAE). The primary objective of this task was to generate +adversarial examples in five problem domains in order to evaluate the +robustness of widely used text classification methods (fine-tuned BERT, BiLSTM, +and RoBERTa) when applied to credibility assessment issues. + This study explores the application of ensemble learning to enhance +adversarial attacks on natural language processing (NLP) models. We +systematically tested and refined several adversarial attack methods, including +BERT-Attack, Genetic algorithms, TextFooler, and CLARE, on five datasets across +various misinformation tasks. By developing modified versions of BERT-Attack +and hybrid methods, we achieved significant improvements in attack +effectiveness. Our results demonstrate the potential of modifying and +combining multiple methods to create more sophisticated and effective +adversarial attack strategies, contributing to the development of more robust +and secure systems. +
+
+ comment: CLEF 2024 - Conference and Labs of the Evaluation Forum +
+
+
+
+
+ + ♻ ☆ Large Language Models and Cognitive Science: A Comprehensive Review of + Similarities, Differences, and Challenges + + +
+ This comprehensive review explores the intersection of Large Language Models +(LLMs) and cognitive science, examining similarities and differences between +LLMs and human cognitive processes. We analyze methods for evaluating LLMs +cognitive abilities and discuss their potential as cognitive models. The review +covers applications of LLMs in various cognitive fields, highlighting insights +gained for cognitive science research. We assess cognitive biases and +limitations of LLMs, along with proposed methods for improving their +performance. The integration of LLMs with cognitive architectures is examined, +revealing promising avenues for enhancing artificial intelligence (AI) +capabilities. Key challenges and future research directions are identified, +emphasizing the need for continued refinement of LLMs to better align with +human cognition. This review provides a balanced perspective on the current +state and future potential of LLMs in advancing our understanding of both +artificial and human intelligence. + +
+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ LongCite: Enabling LLMs to Generate Fine-grained Citations in + Long-context QA + + +
+ Though current long-context large language models (LLMs) have demonstrated +impressive capacities in answering user questions based on extensive text, the +lack of citations in their responses makes user verification difficult, leading +to concerns about their trustworthiness due to their potential hallucinations. +In this work, we aim to enable long-context LLMs to generate responses with +fine-grained sentence-level citations, improving their faithfulness and +verifiability. We first introduce LongBench-Cite, an automated benchmark for +assessing current LLMs' performance in Long-Context Question Answering with +Citations (LQAC), revealing considerable room for improvement. To this end, we +propose CoF (Coarse to Fine), a novel pipeline that utilizes off-the-shelf LLMs +to automatically generate long-context QA instances with precise sentence-level +citations, and leverage this pipeline to construct LongCite-45k, a large-scale +SFT dataset for LQAC. Finally, we train LongCite-8B and LongCite-9B using the +LongCite-45k dataset, successfully enabling their generation of accurate +responses and fine-grained sentence-level citations in a single output. The +evaluation results on LongBench-Cite show that our trained models achieve +state-of-the-art citation quality, surpassing advanced proprietary models +including GPT-4o. + +
+
+
+
+
+ + ♻ ☆ Resolving Knowledge Conflicts in Large Language Models + + +
+ Large language models (LLMs) often encounter knowledge conflicts, scenarios +where discrepancy arises between the internal parametric knowledge of LLMs and +non-parametric information provided in the prompt context. In this work we ask +what are the desiderata for LLMs when a knowledge conflict arises and whether +existing LLMs fulfill them. We posit that LLMs should 1) identify knowledge +conflicts, 2) pinpoint conflicting information segments, and 3) provide +distinct answers or viewpoints in conflicting scenarios. To this end, we +introduce KNOWLEDGE CONFLICT, an evaluation framework for simulating contextual +knowledge conflicts and quantitatively evaluating to what extent LLMs achieve +these goals. KNOWLEDGE CONFLICT includes diverse and complex situations of +knowledge conflict, knowledge from diverse entities and domains, two synthetic +conflict creation methods, and settings with progressively increasing +difficulty to reflect realistic knowledge conflicts. Extensive experiments with +the KNOWLEDGE CONFLICT framework reveal that while LLMs perform well in +identifying the existence of knowledge conflicts, they struggle to determine +the specific conflicting knowledge and produce a response with distinct answers +amidst conflicting information. To address these challenges, we propose new +instruction-based approaches that augment LLMs to better achieve the three +goals. Further analysis shows that abilities to tackle knowledge conflicts are +greatly impacted by factors such as knowledge domain and prompt text, while +generating robust responses to knowledge conflict scenarios remains an open +research question. + +
+
+ comment: Published at COLM 2024 +
+
+
+
+
+ + ♻ ☆ Prediction of COPD Using Machine Learning, Clinical Summary Notes, and + Vital Signs + + +
+ Chronic obstructive pulmonary disease (COPD) is a chronic inflammatory lung +disease that causes obstructed airflow from the lungs. In the United States, +more than 15.7 million Americans have been diagnosed with COPD, with 96% of +individuals living with at least one other chronic health condition. It is the +4th leading cause of death in the country. Over 2.2 million patients are +admitted to hospitals annually due to COPD exacerbations. Monitoring and +predicting patient exacerbations in a timely manner could save lives. This paper +presents two different models to predict COPD exacerbation using AI +and natural language processing (NLP) approaches. These models use respiration +summary notes, symptoms, and vital signs. To train and test these models, data +records containing physiologic signals and vital signs time series were used. +These records were captured from patient monitors and comprehensive clinical +data obtained from hospital medical information systems for tens of thousands +of Intensive Care Unit (ICU) patients. We achieved an area under the Receiver +Operating Characteristic (ROC) curve of 0.82 in the detection and prediction of +COPD exacerbation. +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Simultaneous Masking, Not Prompting Optimization: A Paradigm Shift in + Fine-tuning LLMs for Simultaneous Translation + + +
+ Large language models (LLMs) have achieved state-of-the-art performance in +various language processing tasks, motivating their adoption in simultaneous +translation. Current fine-tuning methods to adapt LLMs for simultaneous +translation focus on prompting optimization strategies using either data +augmentation or prompt structure modifications. However, these methods suffer +from several issues, such as unnecessarily expanded training sets, +computational inefficiency from dumping the key and value cache, increased +prompt sizes, or restriction to a single decision policy. To eliminate these +issues, in this work, we propose SimulMask, a new paradigm for fine-tuning LLMs +for simultaneous translation. It utilizes a novel attention mask approach that +models simultaneous translation during fine-tuning by masking attention for a +desired decision policy. Applying the proposed SimulMask on a Falcon LLM for +the IWSLT 2017 dataset, we have observed a significant translation quality +improvement compared to state-of-the-art prompting optimization strategies on +five language pairs while reducing the computational cost. + +
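The core idea of encoding a decision policy as an attention mask can be sketched as follows; the wait-k rule used here is only an illustrative policy and not necessarily the one used by SimulMask.

```python
# Illustrative construction of a target-to-source attention mask that models a
# streaming (wait-k) decision policy during fine-tuning; an assumption, not
# SimulMask's exact masking scheme.
import torch


def simultaneous_attention_mask(src_len: int, tgt_len: int, k: int = 3) -> torch.Tensor:
    # mask[i, j] is True where target position i may attend to source position j.
    mask = torch.zeros(tgt_len, src_len, dtype=torch.bool)
    for i in range(tgt_len):
        visible = min(src_len, i + k)   # wait-k: the i-th target token sees the first i+k source tokens
        mask[i, :visible] = True
    return mask
```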
+
+
+
+
+ + ♻ ☆ UserSumBench: A Benchmark Framework for Evaluating User Summarization + Approaches + + +
+ Large language models (LLMs) have shown remarkable capabilities in generating +user summaries from a long list of raw user activity data. These summaries +capture essential user information such as preferences and interests, and +therefore are invaluable for LLM-based personalization applications, such as +explainable recommender systems. However, the development of new summarization +techniques is hindered by the lack of ground-truth labels, the inherent +subjectivity of user summaries, and human evaluation, which is often costly and +time-consuming. To address these challenges, we introduce UserSumBench, a +benchmark framework designed to facilitate iterative development of LLM-based +summarization approaches. This framework offers two key components: (1) A +reference-free summary quality metric. We show that this metric is effective +and aligned with human preferences across three diverse datasets (MovieLens, +Yelp and Amazon Review). (2) A novel robust summarization method that leverages +a time-hierarchical summarizer and a self-critique verifier to produce high-quality +summaries while eliminating hallucination. This method serves as a strong +baseline for further innovation in summarization techniques. +
+
+
+
+
+ + ♻ ☆ SPA: Towards A Computational Friendly Cloud-Base and On-Devices + Collaboration Seq2seq Personalized Generation with Casual Inference SP + + +
+ Large language models (LLMs) have shown outstanding ability on various +tasks, including question answering. However, LLMs require substantial memory storage +on low-resource devices. More critically, the computational speed on these +devices is also severely limited. In this paper, we propose SPA (Side Plugin +Adaption), a lightweight architecture for fast on-device inference under +strict on-device computation and memory constraints. Compared +with other on-device seq2seq generation approaches, SPA can perform fast and stable +inference under low-resource constraints, allowing it to achieve cost efficiency. Our +method establishes an interaction between a pretrained LLM on the cloud and additive +parameters on the device, which can provide both the knowledge of the pretrained +LLM and personalized features. Furthermore, SPA provides a framework to +keep feature-based parameters on low-computation devices while leaving the +parameters containing general information on high-computation devices. +
+
+ comment: 12 pages, third version of SPA(Side Plugin Adaption) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 123 + +
+
+
+ + ☆ Lexicon3D: Probing Visual Foundation Models for Complex 3D Scene + Understanding + + +
+ Complex 3D scene understanding has gained increasing attention, with scene +encoding strategies playing a crucial role in this success. However, the +optimal scene encoding strategies for various scenarios remain unclear, +particularly compared to their image-based counterparts. To address this issue, +we present a comprehensive study that probes various visual encoding models for +3D scene understanding, identifying the strengths and limitations of each model +across different scenarios. Our evaluation spans seven vision foundation +encoders, including image-based, video-based, and 3D foundation models. We +evaluate these models in four tasks: Vision-Language Scene Reasoning, Visual +Grounding, Segmentation, and Registration, each focusing on different aspects +of scene understanding. Our evaluations yield key findings: DINOv2 demonstrates +superior performance, video models excel in object-level tasks, diffusion +models benefit geometric tasks, and language-pretrained models show unexpected +limitations in language-related tasks. These insights challenge some +conventional understandings, provide novel perspectives on leveraging visual +foundation models, and highlight the need for more flexible encoder selection +in future vision-language and scene-understanding tasks. + +
+
+ comment: Project page: https://yunzeman.github.io/lexicon3d , Github: + https://github.com/YunzeMan/Lexicon3D +
+
+
+
+
+ + ☆ DC-Solver: Improving Predictor-Corrector Diffusion Sampler via Dynamic + Compensation ECCV 2024 + + +
+ Diffusion probabilistic models (DPMs) have shown remarkable performance in +visual synthesis but are computationally expensive due to the need for multiple +evaluations during the sampling. Recent predictor-corrector diffusion samplers +have significantly reduced the required number of function evaluations (NFE), +but inherently suffer from a misalignment issue caused by the extra corrector +step, especially with a large classifier-free guidance scale (CFG). In this +paper, we introduce a new fast DPM sampler called DC-Solver, which leverages +dynamic compensation (DC) to mitigate the misalignment of the +predictor-corrector samplers. The dynamic compensation is controlled by +compensation ratios that are adaptive to the sampling steps and can be +optimized on only 10 datapoints by pushing the sampling trajectory toward a +ground truth trajectory. We further propose a cascade polynomial regression +(CPR) which can instantly predict the compensation ratios on unseen sampling +configurations. Additionally, we find that the proposed dynamic compensation +can also serve as a plug-and-play module to boost the performance of +predictor-only samplers. Extensive experiments on both unconditional sampling +and conditional sampling demonstrate that our DC-Solver can consistently +improve the sampling quality over previous methods on different DPMs with a +wide range of resolutions up to 1024$\times$1024. Notably, we achieve 10.38 FID +(NFE=5) on unconditional FFHQ and 0.394 MSE (NFE=5, CFG=7.5) on +Stable-Diffusion-2.1. Code is available at https://github.com/wl-zhao/DC-Solver + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Foundation Model or Finetune? Evaluation of few-shot semantic + segmentation for river pollution ECCV 2024 + + +
+ Foundation models (FMs) are a popular topic of research in AI. Their ability +to generalize to new tasks and datasets without retraining or needing an +abundance of data makes them an appealing candidate for applications on +specialist datasets. In this work, we compare the performance of FMs to +finetuned pre-trained supervised models in the task of semantic segmentation on +an entirely new dataset. We see that finetuned models consistently outperform +the FMs tested, even in cases where data is scarce. We release the code and +dataset for this work on GitHub. +
+
+ comment: Accepted at ECCV 2024 Green Foundation Models workshop +
+
+
+
+
+ + ☆ ArtiFade: Learning to Generate High-quality Subject from Blemished + Images + + +
+ Subject-driven text-to-image generation has witnessed remarkable advancements +in its ability to learn and capture characteristics of a subject using only a +limited number of images. However, existing methods commonly rely on +high-quality images for training and may struggle to generate reasonable images +when the input images are blemished by artifacts. This is primarily attributed +to the inadequate capability of current techniques in distinguishing +subject-related features from disruptive artifacts. In this paper, we introduce +ArtiFade to tackle this issue and successfully generate high-quality +artifact-free images from blemished datasets. Specifically, ArtiFade exploits +fine-tuning of a pre-trained text-to-image model, aiming to remove artifacts. +The elimination of artifacts is achieved by utilizing a specialized dataset +that encompasses both unblemished images and their corresponding blemished +counterparts during fine-tuning. ArtiFade also ensures the preservation of the +original generative capabilities inherent within the diffusion model, thereby +enhancing the overall performance of subject-driven methods in generating +high-quality and artifact-free images. We further devise evaluation benchmarks +tailored for this task. Through extensive qualitative and quantitative +experiments, we demonstrate the generalizability of ArtiFade in effective +artifact removal under both in-distribution and out-of-distribution scenarios. + +
+
+
+
+
+ + ☆ Geometry Image Diffusion: Fast and Data-Efficient Text-to-3D with + Image-Based Surface Representation + + +
+ Generating high-quality 3D objects from textual descriptions remains a +challenging problem due to computational cost, the scarcity of 3D data, and +complex 3D representations. We introduce Geometry Image Diffusion +(GIMDiffusion), a novel Text-to-3D model that utilizes geometry images to +efficiently represent 3D shapes using 2D images, thereby avoiding the need for +complex 3D-aware architectures. By integrating a Collaborative Control +mechanism, we exploit the rich 2D priors of existing Text-to-Image models such +as Stable Diffusion. This enables strong generalization even with limited 3D +training data (allowing us to use only high-quality training data) as well as +retaining compatibility with guidance techniques such as IPAdapter. In short, +GIMDiffusion enables the generation of 3D assets at speeds comparable to +current Text-to-Image models. The generated objects consist of semantically +meaningful, separate parts and include internal structures, enhancing both +usability and versatility. + +
+
+ comment: 11 pages, 9 figures, Project page: + https://unity-research.github.io/Geometry-Image-Diffusion.github.io/ +
+
+
+
+
+ + ☆ View-Invariant Policy Learning via Zero-Shot Novel View Synthesis + + +
+ Large-scale visuomotor policy learning is a promising approach toward +developing generalizable manipulation systems. Yet, policies that can be +deployed on diverse embodiments, environments, and observational modalities +remain elusive. In this work, we investigate how knowledge from large-scale +visual data of the world may be used to address one axis of variation for +generalizable manipulation: observational viewpoint. Specifically, we study +single-image novel view synthesis models, which learn 3D-aware scene-level +priors by rendering images of the same scene from alternate camera viewpoints +given a single input image. For practical application to diverse robotic data, +these models must operate zero-shot, performing view synthesis on unseen tasks +and environments. We empirically analyze view synthesis models within a simple +data-augmentation scheme that we call View Synthesis Augmentation (VISTA) to +understand their capabilities for learning viewpoint-invariant policies from +single-viewpoint demonstration data. Upon evaluating the robustness of policies +trained with our method to out-of-distribution camera viewpoints, we find that +they outperform baselines in both simulated and real-world manipulation tasks. +Videos and additional visualizations are available at +https://s-tian.github.io/projects/vista. + +
+
+ comment: Accepted to CoRL 2024 +
+
+
+
+
+ + ☆ RealisHuman: A Two-Stage Approach for Refining Malformed Human Parts in + Generated Images + + +
+ In recent years, diffusion models have revolutionized visual generation, +outperforming traditional frameworks like Generative Adversarial Networks +(GANs). However, generating images of humans with realistic semantic parts, +such as hands and faces, remains a significant challenge due to their intricate +structural complexity. To address this issue, we propose a novel +post-processing solution named RealisHuman. The RealisHuman framework operates +in two stages. First, it generates realistic human parts, such as hands or +faces, using the original malformed parts as references, ensuring consistent +details with the original image. Second, it seamlessly integrates the rectified +human parts back into their corresponding positions by repainting the +surrounding areas to ensure smooth and realistic blending. The RealisHuman +framework significantly enhances the realism of human generation, as +demonstrated by notable improvements in both qualitative and quantitative +metrics. Code is available at https://github.com/Wangbenzhi/RealisHuman. + +
+
+
+
+
+ + ☆ CDM: A Reliable Metric for Fair and Accurate Formula Recognition + Evaluation + + +
+ Formula recognition presents significant challenges due to the complicated +structure and varied notation of mathematical expressions. Despite continuous +advancements in formula recognition models, the evaluation metrics employed by +these models, such as BLEU and Edit Distance, still exhibit notable +limitations. They overlook the fact that the same formula has diverse +representations and is highly sensitive to the distribution of training data, +thereby causing unfairness in formula recognition evaluation. To this end, +we propose a Character Detection Matching (CDM) metric, ensuring evaluation +objectivity by designing an image-level rather than LaTeX-level metric score. +Specifically, CDM renders both the model-predicted LaTeX and the ground-truth +LaTeX formulas into image-formatted formulas, then employs visual feature +extraction and localization techniques for precise character-level matching, +incorporating spatial position information. Such a spatially-aware and +character-matching method offers a more accurate and equitable evaluation +compared with previous BLEU and Edit Distance metrics that rely solely on +text-based character matching. Experimentally, we evaluated various formula +recognition models using CDM, BLEU, and ExpRate metrics. Their results +demonstrate that CDM aligns more closely with human evaluation standards +and provides a fairer comparison across different models by eliminating +discrepancies caused by diverse formula representations. +
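A much-simplified sketch of image-level character matching: both LaTeX strings are assumed to have already been rendered and their characters detected with positions, after which spatially tolerant matching yields an F1-style score. The data structure, tolerance, and greedy matching are assumptions, not the CDM implementation.

```python
# Toy character-matching score over detected characters with normalized positions.
from dataclasses import dataclass


@dataclass
class Char:
    symbol: str
    x: float  # normalized horizontal position in the rendered formula image
    y: float  # normalized vertical position


def match_score(pred: list[Char], gt: list[Char], tol: float = 0.05) -> float:
    used: set[int] = set()
    hits = 0
    for p in pred:
        for i, g in enumerate(gt):
            if i not in used and p.symbol == g.symbol \
                    and abs(p.x - g.x) < tol and abs(p.y - g.y) < tol:
                used.add(i)
                hits += 1
                break
    precision = hits / max(len(pred), 1)
    recall = hits / max(len(gt), 1)
    return 2 * precision * recall / max(precision + recall, 1e-9)
```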
+
+ comment: Project Website: + https://github.com/opendatalab/UniMERNet/tree/main/cdm +
+
+
+
+
+ + ☆ Surface-Centric Modeling for High-Fidelity Generalizable Neural Surface + Reconstruction ECCV 2024 + + +
+ Reconstructing the high-fidelity surface from multi-view images, especially +sparse images, is a critical and practical task that has attracted widespread +attention in recent years. However, existing methods are impeded by the memory +constraint or the requirement of ground-truth depths and cannot recover +satisfactory geometric details. To this end, we propose SuRF, a new +Surface-centric framework that incorporates a new Region sparsification based +on a matching Field, achieving good trade-offs between performance, efficiency +and scalability. To our knowledge, this is the first unsupervised method +achieving end-to-end sparsification powered by the introduced matching field, +which leverages the weight distribution to efficiently locate the boundary +regions containing the surface. Instead of predicting an SDF value for each voxel, +we present a new region sparsification approach to sparsify the volume by judging +whether the voxel is inside the surface region. In this way, our model can +exploit higher frequency features around the surface with less memory and +computational consumption. Extensive experiments on multiple benchmarks +containing complex large-scale scenes show that our reconstructions exhibit +high-quality details and achieve new state-of-the-art performance, i.e., 46% +improvements with 80% less memory consumption. Code is available at +https://github.com/prstrive/SuRF. +
+
+ comment: ECCV 2024 Accepted +
+
+
+
+
+ + ☆ SegTalker: Segmentation-based Talking Face Generation with Mask-guided + Local Editing + + +
+ Audio-driven talking face generation aims to synthesize video with lip +movements synchronized to input audio. However, current generative techniques +face challenges in preserving intricate regional textures (skin, teeth). To +address the aforementioned challenges, we propose a novel framework called +SegTalker to decouple lip movements and image textures by introducing +segmentation as an intermediate representation. Specifically, given the mask of the +image obtained from a parsing network, we first leverage the speech to drive the +mask and generate talking segmentation. Then we disentangle semantic regions of the +image into style codes using a mask-guided encoder. Ultimately, we inject the +previously generated talking segmentation and style codes into a mask-guided +StyleGAN to synthesize video frames. In this way, most textures are fully +preserved. Moreover, our approach can inherently achieve background separation +and facilitate mask-guided facial local editing. In particular, by editing the +mask and swapping the region textures from a given reference image (e.g. hair, +lip, eyebrows), our approach enables facial editing seamlessly when generating +talking face video. Experiments demonstrate that our proposed approach can +effectively preserve texture details and generate temporally consistent video +while remaining competitive in lip synchronization. Quantitative and +qualitative results on the HDTF and MEAD datasets illustrate the superior +performance of our method over existing methods. +
+
+ comment: 10 pages, 7 figures, 3 tables +
+
+
+
+
+ + ☆ TCDiff: Triple Condition Diffusion Model with 3D Constraints for + Stylizing Synthetic Faces + + +
+ A robust face recognition model must be trained using datasets that include a +large number of subjects and numerous samples per subject under varying +conditions (such as pose, expression, age, noise, and occlusion). Due to +ethical and privacy concerns, large-scale real face datasets have been +discontinued, such as MS1MV3, and synthetic face generators have been proposed, +utilizing GANs and Diffusion Models, such as SYNFace, SFace, DigiFace-1M, +IDiff-Face, DCFace, and GANDiffFace, aiming to supply this demand. Some of +these methods can produce high-fidelity realistic faces, but with low +intra-class variance, while others generate high-variance faces with low +identity consistency. In this paper, we propose a Triple Condition Diffusion +Model (TCDiff) to improve face style transfer from real to synthetic faces +through 2D and 3D facial constraints, enhancing face identity consistency while +keeping the necessary high intra-class variance. Face recognition experiments +using 1k, 2k, and 5k classes of our new dataset for training outperform +state-of-the-art synthetic datasets in real face benchmarks such as LFW, +CFP-FP, AgeDB, and BUPT. Our source code is available at: +https://github.com/BOVIFOCR/tcdiff. + +
+
+ comment: SIBGRAPI 2024 +
+
+
+
+
+ + ☆ A practical approach to evaluating the adversarial distance for machine + learning classifiers + + +
+ Robustness is critical for machine learning (ML) classifiers to ensure +consistent performance in real-world applications where models may encounter +corrupted or adversarial inputs. In particular, assessing the robustness of +classifiers to adversarial inputs is essential to protect systems from +vulnerabilities and thus ensure safety in use. However, methods to accurately +compute adversarial robustness have been challenging for complex ML models and +high-dimensional data. Furthermore, evaluations typically measure adversarial +accuracy on specific attack budgets, limiting the informative value of the +resulting metrics. This paper investigates the estimation of the more +informative adversarial distance using iterative adversarial attacks and a +certification approach. Combined, the methods provide a comprehensive +evaluation of adversarial robustness by computing estimates for the upper and +lower bounds of the adversarial distance. We present visualisations and +ablation studies that provide insights into how this evaluation method should +be applied and parameterised. We find that our adversarial attack approach is +effective compared to related implementations, while the certification method +falls short of expectations. The approach in this paper should encourage a more +informative way of evaluating the adversarial robustness of ML classifiers. + +
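One common way to estimate an upper bound on the adversarial distance, consistent with the iterative-attack idea described above, is to bisect the attack budget until the attack stops succeeding. The `attack` and `is_misclassified` callables are assumed interfaces, not this paper's API.

```python
# Bisection over the perturbation budget to bound the adversarial distance.
from typing import Callable
import numpy as np


def adversarial_distance_upper_bound(
    x: np.ndarray,
    is_misclassified: Callable[[np.ndarray], bool],
    attack: Callable[[np.ndarray, float], np.ndarray],  # returns a candidate adversarial example at budget eps
    eps_hi: float = 1.0,
    steps: int = 10,
) -> float:
    lo, hi = 0.0, eps_hi
    for _ in range(steps):
        mid = (lo + hi) / 2
        x_adv = attack(x, mid)
        if is_misclassified(x_adv):
            hi = mid   # attack succeeded: a smaller budget may still suffice
        else:
            lo = mid   # attack failed: the minimal distance exceeds mid
    return hi          # certified lower bounds would complement this estimate
```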
+
+ comment: Accepted manuscript at International Mechanical Engineering Congress + and Exposition IMECE2024 +
+
+
+
+
+ + ☆ Text-Guided Mixup Towards Long-Tailed Image Categorization BMVC'24 + + +
+ In many real-world applications, the frequency distribution of class labels +for training data can exhibit a long-tailed distribution, which challenges +traditional approaches of training deep neural networks that require large +amounts of balanced data. Gathering and labeling data to balance out the class +label distribution can be both costly and time-consuming. Many existing +solutions that enable ensemble learning, re-balancing strategies, or +fine-tuning applied to deep neural networks are limited by the inherent problem of +few class samples across a subset of classes. Recently, vision-language models +like CLIP have been observed as effective solutions to zero-shot or few-shot +learning by grasping a similarity between vision and language features for +image and text pairs. Considering that large pre-trained vision-language models +may contain valuable side textual information for minor classes, we propose to +leverage text supervision to tackle the challenge of long-tailed learning. +Concretely, we propose a novel text-guided mixup technique that takes advantage +of the semantic relations between classes recognized by the pre-trained text +encoder to help alleviate the long-tailed problem. Our empirical study on +benchmark long-tailed tasks demonstrates the effectiveness of our proposal with +a theoretical guarantee. Our code is available at +https://github.com/rsamf/text-guided-mixup. +
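A hedged sketch of how text supervision could guide mixup partner selection, using class-name embeddings from a pre-trained text encoder; the partner-selection rule and fixed mixing coefficient below are illustrative, not the paper's exact method.

```python
# Toy text-guided partner selection plus a standard mixup of two images.
import torch


def text_guided_partner(class_idx: int, text_emb: torch.Tensor) -> int:
    # text_emb: (num_classes, dim) L2-normalized class-name embeddings.
    sims = text_emb @ text_emb[class_idx]
    sims[class_idx] = -1.0            # exclude the class itself
    return int(sims.argmax())         # most semantically related class (assumed rule)


def mixup(x_a: torch.Tensor, x_b: torch.Tensor, lam: float = 0.7) -> torch.Tensor:
    return lam * x_a + (1 - lam) * x_b
```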
+
+ comment: Accepted by BMVC'24, code is available at + https://github.com/rsamf/text-guided-mixup +
+
+
+
+
+ + ☆ MaskVal: Simple but Effective Uncertainty Quantification for 6D Pose + Estimation + + +
+ For the use of 6D pose estimation in robotic applications, reliable poses are +of utmost importance to ensure a safe, reliable and predictable operational +performance. Despite these requirements, state-of-the-art 6D pose estimators +often do not provide any uncertainty quantification for their pose estimates at +all, or if they do, it has been shown that the uncertainty provided is only +weakly correlated with the actual true error. To address this issue, we +investigate a simple but effective uncertainty quantification, that we call +MaskVal, which compares the pose estimates with their corresponding instance +segmentations by rendering and does not require any modification of the pose +estimator itself. Despite its simplicity, MaskVal significantly outperforms a +state-of-the-art ensemble method on both a dataset and a robotic setup. We show +that by using MaskVal, the performance of a state-of-the-art 6D pose estimator +is significantly improved towards a safe and reliable operation. In addition, +we propose a new and specific approach to compare and evaluate uncertainty +quantification methods for 6D pose estimation in the context of robotic +manipulation. + +
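The comparison described above can be reduced to scoring agreement between a mask rendered at the estimated pose and the predicted instance mask, for example with IoU; treating low IoU as high uncertainty is an assumed convention, not necessarily MaskVal's exact score.

```python
# IoU between the rendered-pose mask and the instance-segmentation mask.
import numpy as np


def maskval_score(rendered_mask: np.ndarray, instance_mask: np.ndarray) -> float:
    # Both inputs are boolean arrays of the same image size.
    inter = np.logical_and(rendered_mask, instance_mask).sum()
    union = np.logical_or(rendered_mask, instance_mask).sum()
    return float(inter) / max(float(union), 1.0)   # low agreement -> unreliable pose estimate
```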
+
+
+
+
+ + ☆ Unified Framework for Neural Network Compression via Decomposition and + Optimal Rank Selection + + +
+ Despite their high accuracy, complex neural networks demand significant +computational resources, posing challenges for deployment on +resource-constrained devices such as mobile phones and embedded systems. +Compression algorithms have been developed to address these challenges by +reducing model size and computational demands while maintaining accuracy. Among +these approaches, factorization methods based on tensor decomposition are +theoretically sound and effective. However, they face difficulties in selecting +the appropriate rank for decomposition. This paper tackles this issue by +presenting a unified framework that simultaneously applies decomposition and +optimal rank selection, employing a composite compression loss within defined +rank constraints. Our approach includes an automatic rank search in a +continuous space, efficiently identifying optimal rank configurations without +the use of training data, making it computationally efficient. Combined with a +subsequent fine-tuning step, our approach maintains the performance of highly +compressed models on par with their original counterparts. Using various +benchmark datasets, we demonstrate the efficacy of our method through a +comprehensive analysis. + +
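As background for the decomposition step, a single linear layer can be factorized at a given rank with a truncated SVD, as sketched below; the paper's contribution, automatic rank selection under a composite compression loss, is not reproduced here.

```python
# Replace one nn.Linear with two smaller layers via truncated SVD at a fixed rank.
import torch
import torch.nn as nn


def factorize_linear(layer: nn.Linear, rank: int) -> nn.Sequential:
    U, S, Vh = torch.linalg.svd(layer.weight.data, full_matrices=False)
    A = Vh[:rank, :]                   # (rank, in_features)
    B = U[:, :rank] * S[:rank]         # (out_features, rank)
    first = nn.Linear(layer.in_features, rank, bias=False)
    second = nn.Linear(rank, layer.out_features, bias=layer.bias is not None)
    first.weight.data = A.clone()
    second.weight.data = B.clone()
    if layer.bias is not None:
        second.bias.data = layer.bias.data.clone()
    return nn.Sequential(first, second)   # second(first(x)) approximates layer(x)
```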
+
+
+
+
+ + ☆ Organized Grouped Discrete Representation for Object-Centric Learning + + +
+ Object-Centric Learning (OCL) represents dense image or video pixels as +sparse object features. Representative methods utilize a discrete representation +composed of Variational Autoencoder (VAE) template features to suppress +pixel-level information redundancy and guide object-level feature aggregation. +The most recent advancement, Grouped Discrete Representation (GDR), further +decomposes these template features into attributes. However, its naive channel +grouping as decomposition may erroneously group channels belonging to different +attributes together and discretize them as sub-optimal template attributes, +which loses information and harms expressivity. We propose Organized GDR +(OGDR) to organize channels belonging to the same attributes together for +correct decomposition from features into attributes. In unsupervised +segmentation experiments, OGDR is fully superior to GDR in augmenting +classical transformer-based OCL methods; it even improves state-of-the-art +diffusion-based ones. Codebook PCA and representation similarity analyses show +that compared with GDR, our OGDR eliminates redundancy and preserves +information better for guiding object representation learning. The source code +is available in the supplementary material. +
+
+
+
+
+ + ☆ DKDM: Data-Free Knowledge Distillation for Diffusion Models with Any + Architecture + + +
+ Diffusion models (DMs) have demonstrated exceptional generative capabilities +across various areas, while they are hindered by slow inference speeds and high +computational demands during deployment. The most common way to accelerate DMs +involves reducing the number of denoising steps during generation, achieved +through faster sampling solvers or knowledge distillation (KD). In contrast to +prior approaches, we propose a novel method that transfers the capability of +large pretrained DMs to faster architectures. Specifically, we employ KD in a +distinct manner to compress DMs by distilling their generative ability into +more rapid variants. Furthermore, considering that the source data is either +unaccessible or too enormous to store for current generative models, we +introduce a new paradigm for their distillation without source data, termed +Data-Free Knowledge Distillation for Diffusion Models (DKDM). Generally, our +established DKDM framework comprises two main components: 1) a DKDM objective +that uses synthetic denoising data produced by pretrained DMs to optimize +faster DMs without source data, and 2) a dynamic iterative distillation method +that flexibly organizes the synthesis of denoising data, preventing it from +slowing down the optimization process as the generation is slow. To our +knowledge, this is the first attempt at using KD to distill DMs into any +architecture in a data-free manner. Importantly, our DKDM is orthogonal to most +existing acceleration methods, such as denoising step reduction, quantization +and pruning. Experiments show that our DKDM is capable of deriving 2x faster +DMs with performance remaining on par with the baseline. Notably, our DKDM +enables pretrained DMs to function as "datasets" for training new DMs. + +
+
+
+
+
+ + ☆ Prediction Accuracy & Reliability: Classification and Object + Localization under Distribution Shift + + +
+ Natural distribution shift causes a deterioration in the perception +performance of convolutional neural networks (CNNs). This comprehensive +analysis for real-world traffic data addresses: 1) investigating the effect of +natural distribution shift and weather augmentations on both detection quality +and confidence estimation, 2) evaluating model performance for both +classification and object localization, and 3) benchmarking two common +uncertainty quantification methods - Ensembles and different variants of +Monte-Carlo (MC) Dropout - under natural and close-to-natural distribution +shift. For this purpose, a novel dataset has been curated from publicly +available autonomous driving datasets. The in-distribution (ID) data is based +on cutouts of a single object, for which both class and bounding box +annotations are available. The six distribution-shift datasets cover adverse +weather scenarios, simulated rain and fog, corner cases, and +out-of-distribution data. A granular analysis of CNNs under distribution shift +allows us to quantify the impact of different types of shifts on both task +performance and confidence estimation: ConvNeXt-Tiny is more robust than +EfficientNet-B0; heavy rain degrades classification more strongly than localization, +contrary to heavy fog; integrating MC-Dropout into selected layers only has the +potential to enhance task performance and confidence estimation, whereby the +identification of these layers depends on the type of distribution shift and +the considered task. +
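The two uncertainty baselines named above can be sketched as follows for MC-Dropout (a deep ensemble simply averages the softmax outputs of independently trained models); the number of forward passes and the use of softmax outputs are assumptions for illustration.

```python
# Monte-Carlo Dropout: keep dropout stochastic at test time and average
# repeated forward passes; the spread of the passes indicates uncertainty.
import torch


def mc_dropout_predict(model: torch.nn.Module, x: torch.Tensor, n: int = 20) -> torch.Tensor:
    model.train()  # train mode keeps dropout layers active (assumed model uses dropout)
    with torch.no_grad():
        probs = torch.stack([model(x).softmax(-1) for _ in range(n)])
    return probs.mean(0)  # predictive mean; probs.var(0) gives a spread estimate
```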
+
+ comment: This preprint has not undergone any post-submission improvements or + corrections +
+
+
+
+
+ + ☆ Use of triplet loss for facial restoration in low-resolution images + + +
+ In recent years, facial recognition (FR) models have become the most widely +used biometric tool, achieving impressive results on numerous datasets. +However, inherent hardware challenges or shooting distances often result in +low-resolution images, which significantly impact the performance of FR models. +To address this issue, several solutions have been proposed, including +super-resolution (SR) models that generate highly realistic faces. Despite +these efforts, significant improvements in FR algorithms have not been +achieved. We propose a novel SR model FTLGAN, which focuses on generating +high-resolution images that preserve individual identities rather than merely +improving image quality, thereby maximizing the performance of FR models. The +results are compelling, demonstrating a mean value of d' 21% above the best +current state-of-the-art models, specifically having a value of d' = 1.099 and +AUC = 0.78 for 14x14 pixels, d' = 2.112 and AUC = 0.92 for 28x28 pixels, and d' += 3.049 and AUC = 0.98 for 56x56 pixels. The contributions of this study are +significant in several key areas. Firstly, a notable improvement in facial +recognition performance has been achieved in low-resolution images, +specifically at resolutions of 14x14, 28x28, and 56x56 pixels. Secondly, the +enhancements demonstrated by FTLGAN show a consistent response across all +resolutions, delivering outstanding performance uniformly, unlike other +comparative models. Thirdly, an innovative approach has been implemented using +triplet loss logic, enabling the training of the super-resolution model solely +with real images, contrasting with current models, and expanding potential +real-world applications. Lastly, this study introduces a novel model that +specifically addresses the challenge of improving classification performance in +facial recognition systems by integrating facial recognition quality as a loss +during model training. + +
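+ The idea of training the super-resolution generator directly in a face-recognition embedding space with a triplet objective can be sketched as follows; the frozen FR backbone, the margin, and the normalization choices are assumptions for illustration, not FTLGAN's exact formulation:
+ ```python
+ import torch
+ import torch.nn.functional as F
+ 
+ def identity_triplet_loss(fr_model, sr_face, same_id_face, other_id_face, margin=0.3):
+     # Anchor: the super-resolved face. Positive/negative: real faces of the same
+     # and of a different identity, embedded by a frozen face-recognition network.
+     with torch.no_grad():
+         pos = F.normalize(fr_model(same_id_face), dim=-1)
+         neg = F.normalize(fr_model(other_id_face), dim=-1)
+     anchor = F.normalize(fr_model(sr_face), dim=-1)
+     d_pos = (anchor - pos).pow(2).sum(dim=-1)
+     d_neg = (anchor - neg).pow(2).sum(dim=-1)
+     return F.relu(d_pos - d_neg + margin).mean()
+ ```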
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ FrozenSeg: Harmonizing Frozen Foundation Models for Open-Vocabulary + Segmentation + + +
+ Open-vocabulary segmentation poses significant challenges, as it requires +segmenting and recognizing objects across an open set of categories in +unconstrained environments. Building on the success of powerful vision-language +(ViL) foundation models, such as CLIP, recent efforts have sought to harness their +zero-shot capabilities to recognize unseen categories. Despite notable +performance improvements, these models still encounter the critical issue of +generating precise mask proposals for unseen categories and scenarios, +ultimately resulting in inferior segmentation performance. To address this +challenge, we introduce a novel approach, FrozenSeg, designed to integrate +spatial knowledge from a localization foundation model (e.g., SAM) and semantic +knowledge extracted from a ViL model (e.g., CLIP), in a synergistic framework. +Taking the ViL model's visual encoder as the feature backbone, we inject the +space-aware feature into the learnable queries and CLIP features within the +transformer decoder. In addition, we devise a mask proposal ensemble strategy +for further improving the recall rate and mask quality. To fully exploit +pre-trained knowledge while minimizing training overhead, we freeze both +foundation models, focusing optimization efforts solely on a lightweight +transformer decoder for mask proposal generation - the performance bottleneck. +Extensive experiments demonstrate that FrozenSeg advances state-of-the-art +results across various segmentation benchmarks, trained exclusively on COCO +panoptic data, and tested in a zero-shot manner. Code is available at +https://github.com/chenxi52/FrozenSeg. +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ Have Large Vision-Language Models Mastered Art History? + + +
+ The emergence of large Vision-Language Models (VLMs) has recently established +new baselines in image classification across multiple domains. However, the +performance of VLMs in the specific task of artwork classification, +particularly art style classification of paintings - a domain traditionally +mastered by art historians - has not been explored yet. Artworks pose a unique +challenge compared to natural images due to their inherently complex and +diverse structures, characterized by variable compositions and styles. Art +historians have long studied the unique aspects of artworks, with style +prediction being a crucial component of their discipline. This paper +investigates whether large VLMs, which integrate visual and textual data, can +effectively predict the art historical attributes of paintings. We conduct an +in-depth analysis of four VLMs, namely CLIP, LLaVA, OpenFlamingo, and GPT-4o, +focusing on zero-shot classification of art style, author and time period using +two public benchmarks of artworks. Additionally, we present ArTest, a +well-curated test set of artworks, including pivotal paintings studied by art +historians. + +
+
+
+
+
+ + ☆ Tissue Concepts: supervised foundation models in computational pathology + + +
+ Due to the increasing workload of pathologists, the need for automation to +support diagnostic tasks and quantitative biomarker evaluation is becoming more +and more apparent. Foundation models have the potential to improve +generalizability within and across centers and serve as starting points for +data-efficient development of specialized yet robust AI models. However, +training the foundation models themselves is usually very expensive in terms of +data, computation, and time. This paper proposes a supervised training method +that drastically reduces these expenses. The proposed method is based on +multi-task learning to train a joint encoder, by combining 16 different +classification, segmentation, and detection tasks on a total of 912,000 +patches. Since the encoder is capable of capturing the properties of the +samples, we term it the Tissue Concepts encoder. To evaluate the performance +and generalizability of the Tissue Concepts encoder across centers, +classification of whole slide images from four of the most prevalent solid +cancers - breast, colon, lung, and prostate - was used. The experiments show +that the Tissue Concepts model achieves comparable performance to models trained +with self-supervision, while requiring only 6% of the amount of training +patches. Furthermore, the Tissue Concepts encoder outperforms an ImageNet +pre-trained encoder on both in-domain and out-of-domain data. +
+
+ comment: 22 Pages, 3 Figures, submitted to and under revision at Computers in + Biology and Medicine +
+
+
+
+
+ + ☆ LMLT: Low-to-high Multi-Level Vision Transformer for Image + Super-Resolution + + +
+ Recent Vision Transformer (ViT)-based methods for Image Super-Resolution have +demonstrated impressive performance. However, they suffer from significant +complexity, resulting in high inference times and memory usage. Additionally, +ViT models using Window Self-Attention (WSA) face challenges in processing +regions outside their windows. To address these issues, we propose the +Low-to-high Multi-Level Transformer (LMLT), which employs attention with +varying feature sizes for each head. LMLT divides image features along the +channel dimension, gradually reduces spatial size for lower heads, and applies +self-attention to each head. This approach effectively captures both local and +global information. By integrating the results from lower heads into higher +heads, LMLT overcomes the window boundary issues in self-attention. Extensive +experiments show that our model significantly reduces inference time and GPU +memory usage while maintaining or even surpassing the performance of +state-of-the-art ViT-based Image Super-Resolution methods. Our code is +available at https://github.com/jwgdmkj/LMLT. +
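+ A simplified sketch of the head-wise channel split with progressively downscaled self-attention. It omits LMLT's low-to-high integration between heads and other details; the layer shapes and pooling schedule are assumptions made for illustration only.
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ class MultiLevelSelfAttention(nn.Module):
+     # Each head attends over a progressively downscaled copy of its channel slice.
+     def __init__(self, channels, num_heads=4):
+         super().__init__()
+         assert channels % num_heads == 0
+         self.num_heads = num_heads
+         self.head_dim = channels // num_heads
+         self.qkv = nn.ModuleList(
+             nn.Linear(self.head_dim, 3 * self.head_dim) for _ in range(num_heads)
+         )
+         self.proj = nn.Conv2d(channels, channels, 1)
+ 
+     def forward(self, x):                              # x: (B, C, H, W)
+         b, c, h, w = x.shape
+         chunks = x.chunk(self.num_heads, dim=1)         # split along channels
+         outs = []
+         for i, (feat, qkv) in enumerate(zip(chunks, self.qkv)):
+             scale = 2 ** (self.num_heads - 1 - i)       # lower heads see smaller maps
+             small = F.adaptive_avg_pool2d(feat, (max(h // scale, 1), max(w // scale, 1)))
+             tokens = small.flatten(2).transpose(1, 2)   # (B, N, head_dim)
+             q, k, v = qkv(tokens).chunk(3, dim=-1)
+             attn = (q @ k.transpose(-2, -1)) / self.head_dim ** 0.5
+             out = attn.softmax(dim=-1) @ v
+             out = out.transpose(1, 2).reshape(b, self.head_dim, *small.shape[-2:])
+             outs.append(F.interpolate(out, size=(h, w), mode="bilinear", align_corners=False))
+         return self.proj(torch.cat(outs, dim=1))
+ ```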
+
+
+
+
+ + ☆ Blended Latent Diffusion under Attention Control for Real-World Video + Editing + + +
+ Due to the lack of fully publicly available text-to-video models, current video +editing methods tend to build on pre-trained text-to-image generation models; +however, they still face major challenges in dealing with the local editing of +video with temporal information. First, although existing methods attempt to +focus on local area editing by a pre-defined mask, the preservation of the +outside-area background is non-ideal due to the spatially entire generation of +each frame. In addition, requiring the user to provide a mask is an additional +costly undertaking, so an autonomous masking strategy integrated into the +editing process is desirable. Last but not least, image-level pretrained models +have not learned temporal information across video frames, which is vital for +expressing the motion and dynamics. In this paper, we propose to adapt an +image-level blended latent diffusion model to perform local video editing +tasks. Specifically, we leverage DDIM inversion to acquire the latents as +background latents instead of the randomly noised ones to better preserve the +background information of the input video. We further introduce an autonomous +mask manufacture mechanism derived from cross-attention maps in diffusion +steps. Finally, we enhance the temporal consistency across video frames by +transforming the self-attention blocks of U-Net into temporal-spatial blocks. +Through extensive experiments, our proposed approach demonstrates effectiveness +in different real-world video editing tasks. +
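+ The core blending step can be paraphrased against a diffusers-style UNet/scheduler interface as below. The per-timestep background latents and the mask source are assumptions (in the paper the mask is derived from cross-attention maps); this is a sketch of the mechanism, not the authors' pipeline.
+ ```python
+ import torch
+ 
+ @torch.no_grad()
+ def blended_denoise(unet, scheduler, edited_latents, inverted_background, mask, text_emb):
+     # Each step: denoise the edited latents, then paste back the DDIM-inverted
+     # background latents outside the editing mask.
+     # inverted_background is assumed to be a dict: timestep -> background latent.
+     latents = edited_latents
+     for t in scheduler.timesteps:
+         noise_pred = unet(latents, t, encoder_hidden_states=text_emb).sample
+         latents = scheduler.step(noise_pred, t, latents).prev_sample
+         latents = mask * latents + (1.0 - mask) * inverted_background[int(t)]
+     return latents
+ ```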
+
+
+
+
+ + ☆ ScreenMark: Watermarking Arbitrary Visual Content on Screen + + +
+ Digital watermarking has demonstrated its effectiveness in protecting +multimedia content. However, existing watermarking methods are predominantly tailored +for specific media types, rendering them less effective for the protection of +content displayed on computer screens, which is often multimodal and dynamic. +Visual Screen Content (VSC) is particularly susceptible to theft and leakage +via screenshots, a vulnerability that current watermarking methods fail to +adequately address. To tackle these challenges, we propose ScreenMark, a robust +and practical watermarking method designed specifically for arbitrary VSC +protection. ScreenMark utilizes a three-stage progressive watermarking +framework. Initially, inspired by diffusion principles, we initialize the +mutual transformation between regular watermark information and irregular +watermark patterns. Subsequently, these patterns are integrated with screen +content using a pre-multiplication alpha blending technique, supported by a +pre-trained screen decoder for accurate watermark retrieval. The progressively +complex distorter enhances the robustness of the watermark in real-world +screenshot scenarios. Finally, the model undergoes fine-tuning guided by a +joint-level distorter to ensure optimal performance. To validate the +effectiveness of ScreenMark, we compiled a dataset comprising 100,000 +screenshots from various devices and resolutions. Extensive experiments across +different datasets confirm the method's superior robustness, imperceptibility, +and practical applicability. +
+
+
+
+
+ + ☆ Improving Uncertainty-Error Correspondence in Deep Bayesian Medical + Image Segmentation + + +
+ Increased usage of automated tools like deep learning in medical image +segmentation has alleviated the bottleneck of manual contouring. This has +shifted manual labour to quality assessment (QA) of automated contours which +involves detecting errors and correcting them. A potential solution to +semi-automated QA is to use deep Bayesian uncertainty to recommend potentially +erroneous regions, thus reducing time spent on error detection. Previous work +has investigated the correspondence between uncertainty and error, however, no +work has been done on improving the "utility" of Bayesian uncertainty maps such +that it is only present in inaccurate regions and not in the accurate ones. Our +work trains the FlipOut model with the Accuracy-vs-Uncertainty (AvU) loss which +promotes uncertainty to be present only in inaccurate regions. We apply this +method on datasets of two radiotherapy body sites, c.f. head-and-neck CT and +prostate MR scans. Uncertainty heatmaps (i.e. predictive entropy) are evaluated +against voxel inaccuracies using Receiver Operating Characteristic (ROC) and +Precision-Recall (PR) curves. Numerical results show that when compared to the +Bayesian baseline the proposed method successfully suppresses uncertainty for +accurate voxels, with similar presence of uncertainty for inaccurate voxels. +Code to reproduce experiments is available at +https://github.com/prerakmody/bayesuncertainty-error-correspondence + +
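+ For orientation, a simplified soft Accuracy-versus-Uncertainty (AvU) objective in the spirit of Krishnan & Tickoo (2020) can be written as below; the exact soft indicators, thresholding, and weighting used in the paper may differ.
+ ```python
+ import torch
+ 
+ def avu_loss(probs, labels, uncertainty, eps=1e-8):
+     # probs: (N, C) class probabilities, labels: (N,), uncertainty: (N,) >= 0.
+     # Rewards confidence on accurate predictions and uncertainty on inaccurate ones.
+     conf, preds = probs.max(dim=-1)
+     acc = (preds == labels).float()
+     tu = torch.tanh(uncertainty)                 # soft "uncertain" indicator in [0, 1)
+     n_ac = (acc * conf * (1 - tu)).sum()         # accurate and certain
+     n_au = (acc * conf * tu).sum()               # accurate but uncertain
+     n_ic = ((1 - acc) * (1 - conf) * (1 - tu)).sum()
+     n_iu = ((1 - acc) * (1 - conf) * tu).sum()
+     avu = (n_ac + n_iu) / (n_ac + n_au + n_ic + n_iu + eps)
+     return -torch.log(avu + eps)
+ ```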
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://melba-journal.org/2024:018 +
+
+
+
+
+ + ☆ LowFormer: Hardware Efficient Design for Convolutional Transformer + Backbones WACV 2025 + + +
+ Research in efficient vision backbones is evolving into models that are a +mixture of convolutions and transformer blocks. A smart combination of both, +architecture-wise and component-wise, is mandatory to excel in the speed-accuracy +trade-off. Most publications focus on maximizing accuracy and utilize MACs +(multiply accumulate operations) as an efficiency metric. The latter, however, +often do not accurately measure how fast a model actually is, due to factors +like memory access cost and degree of parallelism. We analyzed common modules +and architectural design choices for backbones not in terms of MACs, but rather +in actual throughput and latency, as the combination of the latter two is a +better representation of the efficiency of models in real applications. We +applied the conclusions taken from that analysis to create a recipe for +increasing hardware-efficiency in macro design. Additionally, we introduce a +simple slimmed-down version of MultiHead Self-Attention that aligns with our +analysis. We combine both macro and micro design to create a new family of +hardware-efficient backbone networks called LowFormer. LowFormer achieves a +remarkable speedup in terms of throughput and latency, while achieving similar +or better accuracy than current state-of-the-art efficient backbones. In order +to prove the generalizability of our hardware-efficient design, we evaluate our +method on GPU, mobile GPU and ARM CPU. We further show that the downstream +tasks object detection and semantic segmentation profit from our +hardware-efficient architecture. Code and models are available at +https://github.com/altair199797/LowFormer. +
+
+ comment: Accepted at WACV 2025. Features 11 pages in total +
+
+
+
+
+ + ☆ Non-Uniform Illumination Attack for Fooling Convolutional Neural + Networks + + +
+ Convolutional Neural Networks (CNNs) have made remarkable strides; however, +they remain susceptible to vulnerabilities, particularly in the face of minor +image perturbations that humans can easily recognize. This weakness, often +termed 'attacks', underscores the limited robustness of CNNs and the need +for research into fortifying their resistance against such manipulations. This +study introduces a novel Non-Uniform Illumination (NUI) attack technique, where +images are subtly altered using varying NUI masks. Extensive experiments are +conducted on widely-accepted datasets including CIFAR10, TinyImageNet, and +CalTech256, focusing on image classification with 12 different NUI attack +models. The resilience of VGG, ResNet, MobilenetV3-small and InceptionV3 models +against NUI attacks is evaluated. Our results show a substantial decline in +the CNN models' classification accuracy when subjected to NUI attacks, +indicating their vulnerability under non-uniform illumination. To mitigate +this, a defense strategy is proposed that includes NUI-attacked images, generated +through the new NUI transformation, in the training set. The results +demonstrate a significant enhancement in CNN model performance when confronted +with perturbed images affected by NUI attacks. This strategy seeks to bolster +CNN models' resilience against NUI attacks. +
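+ The basic flavor of a non-uniform illumination perturbation is easy to illustrate. The sketch below applies a simple linear brightness gradient, whereas the paper evaluates 12 different NUI mask designs:
+ ```python
+ import torch
+ 
+ def nui_attack(images, strength=0.4, direction="horizontal"):
+     # images: (B, C, H, W) in [0, 1]. A spatially varying brightness ramp is one
+     # illustrative choice of non-uniform illumination mask.
+     b, c, h, w = images.shape
+     if direction == "horizontal":
+         ramp = torch.linspace(-1.0, 1.0, w, device=images.device).view(1, 1, 1, w)
+     else:
+         ramp = torch.linspace(-1.0, 1.0, h, device=images.device).view(1, 1, h, 1)
+     perturbed = images * (1.0 + strength * ramp)
+     return perturbed.clamp(0.0, 1.0)
+ ```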
+
+
+
+
+ + ☆ LM-Gaussian: Boost Sparse-view 3D Gaussian Splatting with Large Model + Priors + + +
+ We aim to address sparse-view reconstruction of a 3D scene by leveraging +priors from large-scale vision models. While recent advancements such as 3D +Gaussian Splatting (3DGS) have demonstrated remarkable successes in 3D +reconstruction, these methods typically necessitate hundreds of input images +that densely capture the underlying scene, making them time-consuming and +impractical for real-world applications. However, sparse-view reconstruction is +inherently ill-posed and under-constrained, often resulting in inferior and +incomplete outcomes. This is due to issues such as failed initialization, +overfitting on input images, and a lack of details. To mitigate these +challenges, we introduce LM-Gaussian, a method capable of generating +high-quality reconstructions from a limited number of images. Specifically, we +propose a robust initialization module that leverages stereo priors to aid in +the recovery of camera poses and the reliable point clouds. Additionally, a +diffusion-based refinement is iteratively applied to incorporate image +diffusion priors into the Gaussian optimization process to preserve intricate +scene details. Finally, we utilize video diffusion priors to further enhance +the rendered images for realistic visual effects. Overall, our approach +significantly reduces the data acquisition requirements compared to previous +3DGS methods. We validate the effectiveness of our framework through +experiments on various public datasets, demonstrating its potential for +high-quality 360-degree scene reconstruction. Visual results are on our +website. + +
+
+ comment: Project page: https://hanyangyu1021.github.io/lm-gaussian.github.io/ +
+
+
+
+
+ + ☆ Data-free Distillation with Degradation-prompt Diffusion for + Multi-weather Image Restoration + + +
+ Multi-weather image restoration has witnessed incredible progress, while the +increasing model capacity and expensive data acquisition impair its +applications in memory-limited devices. Data-free distillation provides an +alternative that allows a lightweight student model to be learned from a +pre-trained teacher model without relying on the original training data. The +existing data-free learning methods mainly optimize the models with the pseudo +data generated by GANs or the real data collected from the Internet. However, +they inevitably suffer from the problems of unstable training or domain shifts +with the original data. In this paper, we propose a novel Data-free +Distillation with Degradation-prompt Diffusion framework for multi-weather +Image Restoration (D4IR). It replaces GANs with pre-trained diffusion models to +avoid model collapse and incorporates a degradation-aware prompt adapter to +facilitate content-driven conditional diffusion for generating domain-related +images. Specifically, a contrast-based degradation prompt adapter is first +designed to capture degradation-aware prompts from web-collected degraded +images. Then, the collected unpaired clean images are perturbed to latent +features of stable diffusion, and conditioned with the degradation-aware +prompts to synthesize new domain-related degraded images for knowledge +distillation. Experiments illustrate that our approach achieves comparable +performance to the model distilled with original training data, and is even +superior to other mainstream unsupervised methods. +
+
+
+
+
+ + ☆ Automatic occlusion removal from 3D maps for maritime situational + awareness SP + + +
+ We introduce a novel method for updating 3D geospatial models, specifically +targeting occlusion removal in large-scale maritime environments. Traditional +3D reconstruction techniques often face problems with dynamic objects, like +cars or vessels, that obscure the true environment, leading to inaccurate +models or requiring extensive manual editing. Our approach leverages deep +learning techniques, including instance segmentation and generative inpainting, +to directly modify both the texture and geometry of 3D meshes without the need +for costly reprocessing. By selectively targeting occluding objects and +preserving static elements, the method enhances both geometric and visual +accuracy. This approach not only preserves structural and textural details of +map data but also maintains compatibility with current geospatial standards, +ensuring robust performance across diverse datasets. The results demonstrate +significant improvements in 3D model fidelity, making this method highly +applicable for maritime situational awareness and the dynamic display of +auxiliary information. + +
+
+ comment: Preprint of SPIE Sensor + Imaging 2024 conference paper +
+
+
+
+
+ + ☆ Shuffle Vision Transformer: Lightweight, Fast and Efficient Recognition + of Driver Facial Expression + + +
+ Existing methods for driver facial expression recognition (DFER) are often +computationally intensive, rendering them unsuitable for real-time +applications. In this work, we introduce a novel transfer learning-based dual +architecture, named ShuffViT-DFER, which elegantly combines computational +efficiency and accuracy. This is achieved by harnessing the strengths of two +lightweight and efficient models using convolutional neural network (CNN) and +vision transformers (ViT). We efficiently fuse the extracted features to +enhance the performance of the model in accurately recognizing the facial +expressions of the driver. Our experimental results on two benchmarking and +public datasets, KMU-FED and KDEF, highlight the validity of our proposed +method for real-time application with superior performance when compared to +state-of-the-art methods. + +
+
+ comment: Accepted for publication in The 6th IEEE International Conference on + Artificial Intelligence Circuits and Systems (IEEE AICAS 2024), 5 pages, 3 + figures +
+
+
+
+
+ + ☆ A Key-Driven Framework for Identity-Preserving Face Anonymization NDSS + + +
+ Virtual faces are crucial content in the metaverse. Recently, attempts have +been made to generate virtual faces for privacy protection. Nevertheless, these +virtual faces either permanently remove the identifiable information or map the +original identity into a virtual one, which loses the original identity +forever. In this study, we first attempt to address the conflict between +privacy and identifiability in virtual faces, where a key-driven face +anonymization and authentication recognition (KFAAR) framework is proposed. +Concretely, the KFAAR framework consists of a head posture-preserving virtual +face generation (HPVFG) module and a key-controllable virtual face +authentication (KVFA) module. The HPVFG module uses a user key to project the +latent vector of the original face into a virtual one. Then it maps the virtual +vectors to obtain an extended encoding, based on which the virtual face is +generated. By simultaneously adding a head posture and facial expression +correction module, the virtual face has the same head posture and facial +expression as the original face. During the authentication, we propose a KVFA +module to directly recognize the virtual faces using the correct user key, +which can obtain the original identity without exposing the original face +image. We also propose a multi-task learning objective to train HPVFG and KVFA. +Extensive experiments demonstrate the advantages of the proposed HPVFG and KVFA +modules, which effectively achieve both facial anonymity and identifiability. + +
+
+ comment: Accepted by NDSS Symposium 2025. Please cite this paper as "Miaomiao + Wang, Guang Hua, Sheng Li, and Guorui Feng. A Key-Driven Framework for + Identity-Preserving Face Anonymization. In the 32nd Annual Network and + Distributed System Security Symposium (NDSS 2025)." +
+
+
+
+
+ + ☆ UV-Mamba: A DCN-Enhanced State Space Model for Urban Village Boundary + Identification in High-Resolution Remote Sensing Images + + +
+ Owing to the diverse geographical environments, intricate landscapes, and +high-density settlements, the automatic identification of urban village +boundaries using remote sensing images is a highly challenging task. This paper +proposes a novel and efficient neural network model called UV-Mamba for +accurate boundary detection in high-resolution remote sensing images. UV-Mamba +mitigates the memory loss problem in long sequence modeling, which arises in +state space model (SSM) with increasing image size, by incorporating deformable +convolutions (DCN). Its architecture utilizes an encoder-decoder framework, +includes an encoder with four deformable state space augmentation (DSSA) blocks +for efficient multi-level semantic extraction and a decoder to integrate the +extracted semantic information. We conducted experiments on the Beijing and +Xi'an datasets, and the results show that UV-Mamba achieves state-of-the-art +performance. Specifically, our model achieves 73.3% and 78.1% IoU on the +Beijing and Xi'an datasets, respectively, representing improvements of 1.2% and +3.4% IoU over the previous best model, while also being 6x faster in inference +speed and 40x smaller in parameter count. Source code and pre-trained models +are available in the supplementary material. + +
+
+ comment: 5 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ Weight Conditioning for Smooth Optimization of Neural Networks ECCV 2024 + + +
+ In this article, we introduce a novel normalization technique for neural +network weight matrices, which we term weight conditioning. This approach aims +to narrow the gap between the smallest and largest singular values of the +weight matrices, resulting in better-conditioned matrices. The inspiration for +this technique partially derives from numerical linear algebra, where +well-conditioned matrices are known to facilitate stronger convergence results +for iterative solvers. We provide a theoretical foundation demonstrating that +our normalization technique smoothens the loss landscape, thereby enhancing +convergence of stochastic gradient descent algorithms. Empirically, we validate +our normalization across various neural network architectures, including +Convolutional Neural Networks (CNNs), Vision Transformers (ViT), Neural +Radiance Fields (NeRF), and 3D shape modeling. Our findings indicate that our +normalization method is not only competitive but also outperforms existing +weight normalization techniques from the literature. + +
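+ The general idea of narrowing the gap between the smallest and largest singular values can be sketched with an SVD-based clamp; this is a generic illustration of conditioning only, and not necessarily the paper's exact transform:
+ ```python
+ import torch
+ 
+ def condition_weight(weight, max_condition=10.0):
+     # Raise the smallest singular values so that cond(W) <= max_condition.
+     u, s, vh = torch.linalg.svd(weight, full_matrices=False)
+     s_clamped = s.clamp(min=s.max() / max_condition)
+     return u @ torch.diag(s_clamped) @ vh
+ ```
+ In practice such a transform would be applied as a reparameterization or periodically during training rather than as a one-off correction.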
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ mPLUG-DocOwl2: High-resolution Compressing for OCR-free Multi-page + Document Understanding + + +
+ Multimodal Large Language Models (MLLMs) have achieved promising OCR-free +Document Understanding performance by increasing the supported resolution of +document images. However, this comes at the cost of generating thousands of +visual tokens for a single document image, leading to excessive GPU memory and +slower inference times, particularly in multi-page document comprehension. In +this work, to address these challenges, we propose a High-resolution +DocCompressor module to compress each high-resolution document image into 324 +tokens, guided by low-resolution global visual features. With this compression +module, to strengthen multi-page document comprehension ability and balance +both token efficiency and question-answering performance, we develop the +DocOwl2 under a three-stage training framework: Single-image Pretraining, +Multi-image Continue-pretraining, and Multi-task Finetuning. DocOwl2 sets a new +state-of-the-art across multi-page document understanding benchmarks and +reduces first token latency by more than 50%, demonstrating advanced +capabilities in multi-page question answering, explanation with evidence +pages, and cross-page structure understanding. Additionally, compared to +single-image MLLMs trained on similar data, our DocOwl2 achieves comparable +single-page understanding performance with less than 20% of the visual tokens. +Our codes, models, and data are publicly available at +https://github.com/X-PLUG/mPLUG-DocOwl/tree/main/DocOwl2. +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ☆ TG-LMM: Enhancing Medical Image Segmentation Accuracy through + Text-Guided Large Multi-Modal Model + + +
+ We propose TG-LMM (Text-Guided Large Multi-Modal Model), a novel approach +that leverages textual descriptions of organs to enhance segmentation accuracy +in medical images. Existing medical image segmentation methods face several +challenges: current medical automatic segmentation models do not effectively +utilize prior knowledge, such as descriptions of organ locations; previous +text-visual models focus on identifying the target rather than improving the +segmentation accuracy; prior models attempt to use prior knowledge to enhance +accuracy but do not incorporate pre-trained models. To address these issues, +TG-LMM integrates prior knowledge, specifically expert descriptions of the +spatial locations of organs, into the segmentation process. Our model utilizes +pre-trained image and text encoders to reduce the number of training parameters +and accelerate the training process. Additionally, we designed a comprehensive +image-text information fusion structure to ensure thorough integration of the +two modalities of data. We evaluated TG-LMM on three authoritative medical +image datasets, encompassing the segmentation of various parts of the human +body. Our method demonstrated superior performance compared to existing +approaches, such as MedSAM, SAM and nnUnet. + +
+
+ comment: 11 pages, 2 figures +
+
+
+
+
+ + ☆ KAN See In the Dark + + +
+ Existing low-light image enhancement methods struggle to fit the complex +nonlinear relationship between normal and low-light images due to uneven +illumination and noise effects. The recently proposed Kolmogorov-Arnold +networks (KANs) feature spline-based convolutional layers and learnable +activation functions, which can effectively capture nonlinear dependencies. In +this paper, we design a KAN-Block based on KANs and innovatively apply it to +low-light image enhancement. This method effectively alleviates the limitations +of current methods constrained by linear network structures and lack of +interpretability, further demonstrating the potential of KANs in low-level +vision tasks. Given the poor perception of current low-light image enhancement +methods and the stochastic nature of the inverse diffusion process, we further +introduce frequency-domain perception for visually oriented enhancement. +Extensive experiments demonstrate the competitive performance of our method on +benchmark datasets. The code will be available at: +https://github.com/AXNing/KSID. +
+
+
+
+
+ + ☆ Make Graph-based Referring Expression Comprehension Great Again through + Expression-guided Dynamic Gating and Regression + + +
+ One common belief is that with complex models and pre-training on large-scale +datasets, transformer-based methods for referring expression comprehension +(REC) perform much better than existing graph-based methods. We observe that +since most graph-based methods adopt an off-the-shelf detector to locate +candidate objects (i.e., regions detected by the object detector), they face +two challenges that result in subpar performance: (1) the presence of +significant noise caused by numerous irrelevant objects during reasoning, and +(2) inaccurate localization outcomes attributed to the provided detector. To +address these issues, we introduce a plug-and-adapt module guided by +sub-expressions, called dynamic gate constraint (DGC), which can adaptively +disable irrelevant proposals and their connections in graphs during reasoning. +We further introduce an expression-guided regression strategy (EGR) to refine +location prediction. Extensive experimental results on the RefCOCO, RefCOCO+, +RefCOCOg, Flickr30K, RefClef, and Ref-reasoning datasets demonstrate the +effectiveness of the DGC module and the EGR strategy in consistently boosting +the performance of various graph-based REC methods. Without any pretraining, +the proposed graph-based method achieves better performance than the +state-of-the-art (SOTA) transformer-based methods. +
+
+ comment: 12 pages to appear in IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ TBConvL-Net: A Hybrid Deep Learning Architecture for Robust Medical + Image Segmentation + + +
+ Deep learning has shown great potential for automated medical image +segmentation to improve the precision and speed of disease diagnostics. +However, the task presents significant difficulties due to variations in the +scale, shape, texture, and contrast of the pathologies. Traditional +convolutional neural network (CNN) models have certain limitations when it +comes to effectively modelling multiscale context information and facilitating +information interaction between skip connections across levels. To overcome +these limitations, a novel deep learning architecture is introduced for medical +image segmentation, taking advantage of CNNs and vision transformers. Our +proposed model, named TBConvL-Net, involves a hybrid network that combines the +local features of a CNN encoder-decoder architecture with long-range and +temporal dependencies using biconvolutional long-short-term memory (LSTM) +networks and vision transformers (ViT). This enables the model to capture +contextual channel relationships in the data and account for the uncertainty of +segmentation over time. Additionally, we introduce a novel composite loss +function that considers both the segmentation robustness and the boundary +agreement of the predicted output with the gold standard. Our proposed model +shows consistent improvement over the state of the art on ten publicly +available datasets of seven different medical imaging modalities. + +
+
+
+
+
+ + ☆ MouseSIS: A Frames-and-Events Dataset for Space-Time Instance + Segmentation of Mice ECCV + + +
+ Enabled by large annotated datasets, tracking and segmentation of objects in +videos has made remarkable progress in recent years. Despite these +advancements, algorithms still struggle under degraded conditions and during +fast movements. Event cameras are novel sensors with high temporal resolution +and high dynamic range that offer promising advantages to address these +challenges. However, annotated data for developing learning-based mask-level +tracking algorithms with events is not available. To this end, we introduce: +(i) a new task termed space-time instance segmentation, similar to +video instance segmentation, whose goal is to segment instances throughout the +entire duration of the sensor input (here, the input are quasi-continuous +events and optionally aligned frames); and (ii) MouseSIS, a dataset for +the new task, containing aligned grayscale frames and events. It includes +annotated ground-truth labels (pixel-level instance segmentation masks) of a +group of up to seven freely moving and interacting mice. We also provide two +reference methods, which show that leveraging event data can consistently +improve tracking performance, especially when used in combination with +conventional cameras. The results highlight the potential of event-aided +tracking in difficult scenarios. We hope our dataset opens the field of +event-based video instance segmentation and enables the development of robust +tracking algorithms for challenging conditions. Code and data: +https://github.com/tub-rip/MouseSIS +
+
+ comment: 18 pages, 5 figures, ECCV Workshops +
+
+
+
+
+ + ☆ Few-Shot Continual Learning for Activity Recognition in Classroom + Surveillance Images + + +
+ The application of activity recognition in the "AI + Education" field is +gaining increasing attention. However, current work mainly focuses on the +recognition of activities in manually captured videos and a limited number of +activity types, with little attention given to recognizing activities in +surveillance images from real classrooms. In real classroom settings, normal +teaching activities, such as reading, account for a large proportion of samples, +while rare non-teaching activities, such as eating, continue to appear. This +requires a model that can learn non-teaching activities from few samples +without forgetting the normal teaching activities, which necessitates few-shot +continual learning (FSCL) capability. To address this gap, we constructed a +continual learning dataset focused on classroom surveillance image activity +recognition called ARIC (Activity Recognition in Classroom). The dataset has +advantages such as multiple perspectives, a wide variety of activities, and +real-world scenarios, but it also presents challenges like similar activities +and imbalanced sample distribution. To overcome these challenges, we designed a +few-shot continual learning method that combines supervised contrastive +learning (SCL) and an adaptive covariance classifier (ACC). During the base +phase, we proposed an SCL approach based on feature augmentation to enhance the +model's generalization ability. In the incremental phase, we employed an ACC to +more accurately describe the distribution of new classes. Experimental results +demonstrate that our method outperforms other existing methods on the ARIC +dataset. +
+
+
+
+
+ + ☆ Estimating Indoor Scene Depth Maps from Ultrasonic Echoes ICIP 2024 + + +
+ Measuring 3D geometric structures of indoor scenes requires dedicated depth +sensors, which are not always available. Echo-based depth estimation has +recently been studied as a promising alternative solution. All previous studies +have assumed the use of echoes in the audible range. However, one major problem +is that audible echoes cannot be used in quiet spaces or other situations where +producing audible sounds is prohibited. In this paper, we consider echo-based +depth estimation using inaudible ultrasonic echoes. While ultrasonic waves +provide high measurement accuracy in theory, the actual depth estimation +accuracy when ultrasonic echoes are used has remained unclear, due to its +disadvantage of being sensitive to noise and susceptible to attenuation. We +first investigate the depth estimation accuracy when the frequency of the sound +source is restricted to the high-frequency band, and find that the accuracy +decreases when the frequency is limited to ultrasonic ranges. Based on this +observation, we propose a novel deep learning method to improve the accuracy of +ultrasonic echo-based depth estimation by using audible echoes as auxiliary +data only during training. Experimental results with a public dataset +demonstrate that our method improves the estimation accuracy. +
+
+ comment: ICIP 2024 +
+
+
+
+
+ + ☆ Enhancing User-Centric Privacy Protection: An Interactive Framework + through Diffusion Models and Machine Unlearning + + +
+ In the realm of multimedia data analysis, the extensive use of image datasets +has escalated concerns over privacy protection within such data. Current +research predominantly focuses on privacy protection either in data sharing or +upon the release of trained machine learning models. Our study pioneers a +comprehensive privacy protection framework that safeguards image data privacy +concurrently during data sharing and model publication. We propose an +interactive image privacy protection framework that utilizes generative machine +learning models to modify image information at the attribute level and employs +machine unlearning algorithms for the privacy preservation of model parameters. +This user-interactive framework allows for adjustments in privacy protection +intensity based on user feedback on generated images, striking a balance +between maximal privacy safeguarding and maintaining model performance. Within +this framework, we instantiate two modules: a differential privacy diffusion +model for protecting attribute information in images and a feature unlearning +algorithm for efficient updates of the trained model on the revised image +dataset. Our approach demonstrated superiority over existing methods on facial +datasets across various attribute classifications. + +
+
+
+
+
+ + ☆ YOLO-PPA based Efficient Traffic Sign Detection for Cruise Control in + Autonomous Driving + + +
+ It is very important to detect traffic signs efficiently and accurately in +autonomous driving systems. However, the farther the distance, the smaller the +traffic signs. Existing object detection algorithms can hardly detect these +small-scale signs. In addition, the performance of embedded devices on vehicles +limits the scale of detection models. To address these challenges, a YOLO-PPA +based traffic sign detection algorithm is proposed in this paper. The +experimental results on the GTSDB dataset show that compared to the original +YOLO, the proposed method improves inference efficiency by 11.2%. The mAP 50 is +also improved by 93.2%, which demonstrates the effectiveness of the proposed +YOLO-PPA. +
+
+
+
+
+ + ☆ Improving Robustness to Multiple Spurious Correlations by + Multi-Objective Optimization + + +
+ We study the problem of training an unbiased and accurate model given a +dataset with multiple biases. This problem is challenging since the multiple +biases cause multiple undesirable shortcuts during training, and even worse, +mitigating one may exacerbate the other. We propose a novel training method to +tackle this challenge. Our method first groups training data so that different +groups induce different shortcuts, and then optimizes a linear combination of +group-wise losses while adjusting their weights dynamically to alleviate +conflicts between the groups in performance; this approach, rooted in the +multi-objective optimization theory, encourages to achieve the minimax Pareto +solution. We also present a new benchmark with multiple biases, dubbed +MultiCelebA, for evaluating debiased training methods under realistic and +challenging scenarios. Our method achieved the best on three datasets with +multiple biases, and also showed superior performance on conventional +single-bias datasets. + +
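+ A minimal dynamic group-reweighting loop in the spirit of the description above. This is a GroupDRO-style exponentiated-gradient sketch, not the paper's multi-objective update toward the minimax Pareto solution:
+ ```python
+ import torch
+ 
+ class GroupReweighter:
+     # Groups with higher current loss receive larger weights in the next step.
+     def __init__(self, num_groups, step_size=0.01):
+         self.log_w = torch.zeros(num_groups)
+         self.step_size = step_size
+ 
+     def weighted_loss(self, group_losses):  # group_losses: tensor of shape (num_groups,)
+         with torch.no_grad():
+             self.log_w += self.step_size * group_losses.detach().cpu()
+         weights = torch.softmax(self.log_w, dim=0).to(group_losses.device)
+         return (weights * group_losses).sum()
+ ```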
+
+ comment: International Conference on Machine Learning 2024 +
+
+
+
+
+ + ☆ ChartMoE: Mixture of Expert Connector for Advanced Chart Understanding + + +
+ Automatic chart understanding is crucial for content comprehension and +document parsing. Multimodal large language models (MLLMs) have demonstrated +remarkable capabilities in chart understanding through domain-specific +alignment and fine-tuning. However, the application of alignment training +within the chart domain is still underexplored. To address this, we propose +ChartMoE, which employs the mixture of expert (MoE) architecture to replace the +traditional linear projector to bridge the modality gap. Specifically, we train +multiple linear connectors through distinct alignment tasks, which are utilized +as the foundational initialization parameters for different experts. +Additionally, we introduce ChartMoE-Align, a dataset with over 900K +chart-table-JSON-code quadruples to conduct three alignment tasks +(chart-table/JSON/code). Combined with the vanilla connector, we initialize +different experts in four distinct ways and adopt high-quality knowledge +learning to further refine the MoE connector and LLM parameters. Extensive +experiments demonstrate the effectiveness of the MoE connector and our +initialization strategy, e.g., ChartMoE improves the accuracy of the previous +state-of-the-art from 80.48% to 84.64% on the ChartQA benchmark. + +
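+ A mixture-of-experts projector between a vision encoder and an LLM can be sketched as below; the dimensions, expert count, and top-k router are illustrative assumptions, and ChartMoE additionally initializes its experts from distinct alignment tasks:
+ ```python
+ import torch
+ import torch.nn as nn
+ 
+ class MoEConnector(nn.Module):
+     # Several linear connectors mixed per visual token by a learned router.
+     def __init__(self, vis_dim, llm_dim, num_experts=4, top_k=2):
+         super().__init__()
+         self.experts = nn.ModuleList(nn.Linear(vis_dim, llm_dim) for _ in range(num_experts))
+         self.router = nn.Linear(vis_dim, num_experts)
+         self.top_k = top_k
+ 
+     def forward(self, vis_tokens):                      # (B, N, vis_dim)
+         gate = self.router(vis_tokens).softmax(dim=-1)  # (B, N, num_experts)
+         topv, topi = gate.topk(self.top_k, dim=-1)
+         topv = topv / topv.sum(dim=-1, keepdim=True)    # renormalize the kept gates
+         out = 0.0
+         for e, expert in enumerate(self.experts):
+             weight = (topv * (topi == e)).sum(dim=-1, keepdim=True)  # 0 if not routed here
+             out = out + weight * expert(vis_tokens)
+         return out
+ ```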
+
+
+
+
+ + ☆ OccLLaMA: An Occupancy-Language-Action Generative World Model for + Autonomous Driving + + +
+ The rise of multi-modal large language models (MLLMs) has spurred their +applications in autonomous driving. Recent MLLM-based methods perform actions by +learning a direct mapping from perception to action, neglecting the dynamics of +the world and the relations between action and world dynamics. In contrast, +human beings possess a world model that enables them to simulate the future +states based on 3D internal visual representation and plan actions accordingly. +To this end, we propose OccLLaMA, an occupancy-language-action generative world +model, which uses semantic occupancy as a general visual representation and +unifies vision-language-action (VLA) modalities through an autoregressive model. +Specifically, we introduce a novel VQVAE-like scene tokenizer to efficiently +discretize and reconstruct semantic occupancy scenes, considering its sparsity +and class imbalance. Then, we build a unified multi-modal vocabulary for +vision, language and action. Furthermore, we enhance LLM, specifically LLaMA, +to perform the next token/scene prediction on the unified vocabulary to +complete multiple tasks in autonomous driving. Extensive experiments +demonstrate that OccLLaMA achieves competitive performance across multiple +tasks, including 4D occupancy forecasting, motion planning, and visual question +answering, showcasing its potential as a foundation model in autonomous +driving. +
+
+
+
+
+ + ☆ SVP: Style-Enhanced Vivid Portrait Talking Head Diffusion Model + + +
+ Talking Head Generation (THG), typically driven by audio, is an important and +challenging task with broad application prospects in various fields such as +digital humans, film production, and virtual reality. While diffusion +model-based THG methods present high quality and stable content generation, +they often overlook the intrinsic style which encompasses personalized features +such as speaking habits and facial expressions of a video. As a consequence, the +generated video content lacks diversity and vividness, thus being limited in +real-life scenarios. To address these issues, we propose a novel framework +named Style-Enhanced Vivid Portrait (SVP) which fully leverages style-related +information in THG. Specifically, we first introduce the novel probabilistic +style prior learning to model the intrinsic style as a Gaussian distribution +using facial expressions and audio embedding. The distribution is learned +through a bespoke contrastive objective, effectively capturing the dynamic +style information in each video. Then we finetune a pretrained Stable Diffusion +(SD) model to inject the learned intrinsic style as a controlling signal via +cross attention. Experiments show that our model generates diverse, vivid, and +high-quality videos with flexible control over intrinsic styles, outperforming +existing state-of-the-art methods. +
+
+
+
+
+ + ☆ Bones Can't Be Triangles: Accurate and Efficient Vertebrae Keypoint + Estimation through Collaborative Error Revision ECCV 2024 + + +
+ Recent advances in interactive keypoint estimation methods have enhanced +accuracy while minimizing user intervention. However, these methods require +user input for error correction, which can be costly in vertebrae keypoint +estimation where inaccurate keypoints are densely clustered or overlap. We +introduce a novel approach, KeyBot, specifically designed to identify and +correct significant and typical errors in existing models, akin to user +revision. By characterizing typical error types and using simulated errors for +training, KeyBot effectively corrects these errors and significantly reduces +user workload. Comprehensive quantitative and qualitative evaluations on three +public datasets confirm that KeyBot significantly outperforms existing methods, +achieving state-of-the-art performance in interactive vertebrae keypoint +estimation. The source code and demo video are available at: +https://ts-kim.github.io/KeyBot/ + +
+
+ comment: 33 pages, ECCV 2024, Project Page: https://ts-kim.github.io/KeyBot/ +
+
+
+
+
+ + ☆ Granular-ball Representation Learning for Deep CNN on Learning with + Label Noise + + +
+ In actual scenarios, whether manually or automatically annotated, label noise +is inevitably generated in the training data, which can affect the +effectiveness of deep CNN models. The popular solutions require data cleaning +or designing additional optimizations to penalize mislabeled data, +thereby enhancing the robustness of models. However, these methods come at the +cost of weakening or even losing some data during the training process. As we +know, content is the inherent attribute of an image that does not change with +changes in annotations. In this study, we propose a general granular-ball +computing (GBC) module that can be embedded into a CNN model, where the +classifier finally predicts the label of granular-ball ($gb$) samples instead +of each individual sample. Specifically, considering the classification task: +(1) in the forward process, we split the input samples into $gb$ samples at +the feature level, each of which can correspond to multiple samples with varying +numbers and share one single label; (2) during the backpropagation process, we +modify the gradient allocation strategy of the GBC module to enable it to +propagate normally; and (3) we develop an experience replay policy to ensure +the stability of the training process. Experiments demonstrate that the +proposed method can improve the robustness of CNN models with no additional +data or optimization. +
+
+
+
+
+ + ☆ Gr-IoU: Ground-Intersection over Union for Robust Multi-Object Tracking + with 3D Geometric Constraints ECCV 2024 + + +
+ We propose a Ground IoU (Gr-IoU) to address the data association problem in +multi-object tracking. When tracking objects detected by a camera, it often +occurs that the same object is assigned different IDs in consecutive frames, +especially when objects are close to each other or overlapping. To address this +issue, we introduce Gr-IoU, which takes into account the 3D structure of the +scene. Gr-IoU transforms traditional bounding boxes from the image space to the +ground plane using the vanishing point geometry. The IoU calculated with these +transformed bounding boxes is more sensitive to the front-to-back relationships +of objects, thereby improving data association accuracy and reducing ID +switches. We evaluated our Gr-IoU method on the MOT17 and MOT20 datasets, which +contain diverse tracking scenarios including crowded scenes and sequences with +frequent occlusions. Experimental results demonstrated that Gr-IoU outperforms +conventional real-time methods without appearance features. + +
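+ The gist of comparing boxes on the ground plane can be sketched as follows. The homography H stands in for the vanishing-point-based mapping derived in the paper, and the square-footprint construction is a purely illustrative simplification:
+ ```python
+ import numpy as np
+ 
+ def ground_iou(box_a, box_b, H):
+     # Boxes are (x1, y1, x2, y2) in image space; H maps image points to the ground plane.
+     def to_ground(pts):
+         pts_h = np.c_[pts, np.ones(len(pts))] @ H.T
+         return pts_h[:, :2] / pts_h[:, 2:3]
+ 
+     def footprint(box):
+         (x1, _, x2, y2) = box
+         bl, br = to_ground(np.array([[x1, y2], [x2, y2]], dtype=float))
+         width = np.linalg.norm(br - bl)            # extend a square footprint on the ground
+         x_lo, x_hi = sorted([bl[0], br[0]])
+         y_lo = min(bl[1], br[1])
+         return x_lo, y_lo, x_hi, y_lo + width
+ 
+     ax1, ay1, ax2, ay2 = footprint(box_a)
+     bx1, by1, bx2, by2 = footprint(box_b)
+     iw = max(0.0, min(ax2, bx2) - max(ax1, bx1))
+     ih = max(0.0, min(ay2, by2) - max(ay1, by1))
+     inter = iw * ih
+     union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
+     return inter / union if union > 0 else 0.0
+ ```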
+
+ comment: Accepted for the ECCV 2024 Workshop on Affective Behavior Analysis + in-the-wild(ABAW) +
+
+
+
+
+ + ☆ Multiple weather images restoration using the task transformer and + adaptive mixup strategy + + +
+ The current state-of-the-art in severe weather removal predominantly focuses +on single-task applications, such as rain removal, haze removal, and snow +removal. However, real-world weather conditions often consist of a mixture of +several weather types, and the degree of weather mixing in autonomous driving +scenarios remains unknown. In the presence of complex and diverse weather +conditions, a single weather removal model often encounters challenges in +producing clear images from severe weather images. Therefore, there is a need +for the development of multi-task severe weather removal models that can +effectively handle mixed weather conditions and improve image quality in +autonomous driving scenarios. In this paper, we introduce a novel multi-task +severe weather removal model that can effectively handle complex weather +conditions in an adaptive manner. Our model incorporates a weather task +sequence generator, enabling the self-attention mechanism to selectively focus +on features specific to different weather types. To tackle the challenge of +repairing large areas of weather degradation, we introduce Fast Fourier +Convolution (FFC) to increase the receptive field. Additionally, we propose an +adaptive upsampling technique that effectively processes both the weather task +information and underlying image features by selectively retaining relevant +information. Our proposed model has achieved state-of-the-art performance on +the publicly available dataset. + +
+
+ comment: 10 pages, 5 figures and 2 tables +
+
+
+
+
+ + ☆ UAV (Unmanned Aerial Vehicles): Diverse Applications of UAV Datasets in + Segmentation, Classification, Detection, and Tracking + + +
+ Unmanned Aerial Vehicles (UAVs) have greatly revolutionized the process of +gathering and analyzing data in diverse research domains, providing unmatched +adaptability and effectiveness. This paper presents a thorough examination of +Unmanned Aerial Vehicle (UAV) datasets, emphasizing their wide range of +applications and progress. UAV datasets consist of various types of data, such +as satellite imagery, images captured by drones, and videos. These datasets can +be categorized as either unimodal or multimodal, offering a wide range of +detailed and comprehensive information. These datasets play a crucial role in +disaster damage assessment, aerial surveillance, object recognition, and +tracking. They facilitate the development of sophisticated models for tasks +like semantic segmentation, pose estimation, vehicle re-identification, and +gesture recognition. By leveraging UAV datasets, researchers can significantly +enhance the capabilities of computer vision models, thereby advancing +technology and improving our understanding of complex, dynamic environments +from an aerial perspective. This review aims to encapsulate the multifaceted +utility of UAV datasets, emphasizing their pivotal role in driving innovation +and practical applications in multiple domains. +
+
+
+
+
+ + ☆ Unveiling Context-Related Anomalies: Knowledge Graph Empowered + Decoupling of Scene and Action for Human-Related Video Anomaly Detection + + +
+ Detecting anomalies in human-related videos is crucial for surveillance +applications. Current methods primarily include appearance-based and +action-based techniques. Appearance-based methods rely on low-level visual +features such as color, texture, and shape. They learn a large number of pixel +patterns and features related to known scenes during training, making them +effective in detecting anomalies within these familiar contexts. However, when +encountering new or significantly changed scenes, i.e., unknown scenes, they +often fail because existing SOTA methods do not effectively capture the +relationship between actions and their surrounding scenes, resulting in low +generalization. In contrast, action-based methods focus on detecting anomalies +in human actions but are usually less informative because they tend to overlook +the relationship between actions and their scenes, leading to incorrect +detection. For instance, the normal event of running on the beach and the +abnormal event of running on the street might both be considered normal due to +the lack of scene information. In short, current methods struggle to integrate +low-level visual and high-level action features, leading to poor anomaly +detection in varied and complex scenes. To address this challenge, we propose a +novel decoupling-based architecture for human-related video anomaly detection +(DecoAD). DecoAD significantly improves the integration of visual and action +features through the decoupling and interweaving of scenes and actions, thereby +enabling a more intuitive and accurate understanding of complex behaviors and +scenes. DecoAD supports fully supervised, weakly supervised, and unsupervised +settings. + +
+
+ comment: 13pages, 9 figures +
+
+
+
+
+ + ☆ Labeled-to-Unlabeled Distribution Alignment for Partially-Supervised + Multi-Organ Medical Image Segmentation + + +
+ Partially-supervised multi-organ medical image segmentation aims to develop a +unified semantic segmentation model by utilizing multiple partially-labeled +datasets, with each dataset providing labels for a single class of organs. +However, the limited availability of labeled foreground organs and the absence +of supervision to distinguish unlabeled foreground organs from the background +pose a significant challenge, which leads to a distribution mismatch between +labeled and unlabeled pixels. Although existing pseudo-labeling methods can be +employed to learn from both labeled and unlabeled pixels, they are prone to +performance degradation in this task, as they rely on the assumption that +labeled and unlabeled pixels have the same distribution. In this paper, to +address the problem of distribution mismatch, we propose a labeled-to-unlabeled +distribution alignment (LTUDA) framework that aligns feature distributions and +enhances discriminative capability. Specifically, we introduce a cross-set data +augmentation strategy, which performs region-level mixing between labeled and +unlabeled organs to reduce distribution discrepancy and enrich the training +set. Besides, we propose a prototype-based distribution alignment method that +implicitly reduces intra-class variation and increases the separation between +the unlabeled foreground and background. This can be achieved by encouraging +consistency between the outputs of two prototype classifiers and a linear +classifier. Extensive experimental results on the AbdomenCT-1K dataset and a +union of four benchmark datasets (including LiTS, MSD-Spleen, KiTS, and NIH82) +demonstrate that our method outperforms the state-of-the-art +partially-supervised methods by a considerable margin, and even surpasses the +fully-supervised methods. The source code is publicly available at +https://github.com/xjiangmed/LTUDA. + +
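+ The cross-set region-level mixing can be illustrated with a CutMix-style operation between a labeled and an unlabeled batch; this is a sketch under simplifying assumptions, and LTUDA's actual augmentation and prototype-based alignment are more involved:
+ ```python
+ import torch
+ 
+ def cross_set_region_mix(labeled_img, labeled_mask, unlabeled_img, unlabeled_pseudo, ratio=0.5):
+     # Paste a random box from the labeled image (and its label map) onto the
+     # unlabeled image (and its pseudo-label map) to reduce the distribution gap.
+     # Shapes: (B, C, H, W) images, (B, H, W) label maps.
+     b, _, h, w = labeled_img.shape
+     cut_h, cut_w = int(h * ratio), int(w * ratio)
+     y = torch.randint(0, h - cut_h + 1, (1,)).item()
+     x = torch.randint(0, w - cut_w + 1, (1,)).item()
+     mixed_img = unlabeled_img.clone()
+     mixed_lbl = unlabeled_pseudo.clone()
+     mixed_img[:, :, y:y + cut_h, x:x + cut_w] = labeled_img[:, :, y:y + cut_h, x:x + cut_w]
+     mixed_lbl[:, y:y + cut_h, x:x + cut_w] = labeled_mask[:, y:y + cut_h, x:x + cut_w]
+     return mixed_img, mixed_lbl
+ ```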
+
+ comment: Accepted by Medical Image Analysis +
+
+
+
+
+ + ☆ Why mamba is effective? Exploit Linear Transformer-Mamba Network for + Multi-Modality Image Fusion + + +
+ Multi-modality image fusion aims to integrate the merits of images from +different sources and render high-quality fusion images. However, existing +feature extraction and fusion methods are either constrained by inherent local +reduction bias and static parameters during inference (CNN) or limited by +quadratic computational complexity (Transformers), and cannot effectively +extract and fuse features. To solve this problem, we propose a dual-branch +image fusion network called Tmamba. It consists of linear Transformer and +Mamba, which has global modeling capabilities while maintaining linear +complexity. Due to the difference between the Transformer and Mamba structures, +the features extracted by the two branches carry channel and position +information respectively. T-M interaction structure is designed between the two +branches, using global learnable parameters and convolutional layers to +transfer position and channel information respectively. We further propose +cross-modal interaction at the attention level to obtain cross-modal attention. +Experiments show that our Tmamba achieves promising results in multiple fusion +tasks, including infrared-visible image fusion and medical image fusion. Code +with checkpoints will be available after the peer-review process. + +
+
+
+
+
+ + ☆ Optimizing 3D Gaussian Splatting for Sparse Viewpoint Scene + Reconstruction + + +
+ 3D Gaussian Splatting (3DGS) has emerged as a promising approach for 3D scene +representation, offering a reduction in computational overhead compared to +Neural Radiance Fields (NeRF). However, 3DGS is susceptible to high-frequency +artifacts and demonstrates suboptimal performance under sparse viewpoint +conditions, thereby limiting its applicability in robotics and computer vision. +To address these limitations, we introduce SVS-GS, a novel framework for Sparse +Viewpoint Scene reconstruction that integrates a 3D Gaussian smoothing filter +to suppress artifacts. Furthermore, our approach incorporates a Depth Gradient +Profile Prior (DGPP) loss with a dynamic depth mask to sharpen edges and 2D +diffusion with Score Distillation Sampling (SDS) loss to enhance geometric +consistency in novel view synthesis. Experimental evaluations on the +MipNeRF-360 and SeaThru-NeRF datasets demonstrate that SVS-GS markedly improves +3D reconstruction from sparse viewpoints, offering a robust and efficient +solution for scene understanding in robotics and computer vision applications. + +
+
+
+
+
+ + ☆ Bi-capacity Choquet Integral for Sensor Fusion with Label Uncertainty + + +
+ Sensor fusion combines data from multiple sensor sources to improve +reliability, robustness, and accuracy of data interpretation. The Fuzzy +Integral (FI), in particular, the Choquet integral (ChI), is often used as a +powerful nonlinear aggregator for fusion across multiple sensors. However, +existing supervised ChI learning algorithms typically require precise training +labels for each input data point, which can be difficult or impossible to +obtain. Additionally, prior work on ChI fusion is often based only on the +normalized fuzzy measures, which bounds the fuzzy measure values between [0, +1]. This can be limiting in cases where the underlying scales of input data +sources are bipolar (i.e., between [-1, 1]). To address these challenges, this +paper proposes a novel Choquet integral-based fusion framework, named Bi-MIChI +(pronounced "bi-mi-kee"), which uses bi-capacities to represent the +interactions between pairs of subsets of the input sensor sources on a bi-polar +scale. This allows for extended non-linear interactions between the sensor +sources and can lead to interesting fusion results. Bi-MIChI also addresses +label uncertainty through Multiple Instance Learning, where training labels are +applied to "bags" (sets) of data instead of per-instance. Our proposed Bi-MIChI +framework shows effective classification and detection performance on both +synthetic and real-world experiments for sensor fusion with label uncertainty. +We also provide detailed analyses on the behavior of the fuzzy measures to +demonstrate our fusion process. + +
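+ For readers unfamiliar with the aggregator, a minimal sketch of a discrete
+ Choquet integral with respect to a normalized fuzzy measure follows; the
+ bi-capacity extension on [-1, 1] and the multiple-instance learning are not
+ reproduced here, and the dictionary-based measure is a toy representation:
+
+     import numpy as np
+
+     def choquet_integral(h, g):
+         """h: sensor outputs (length n); g: dict mapping frozensets of sensor
+         indices to measure values in [0, 1], with g[frozenset()] = 0 and
+         g[full set] = 1."""
+         h = np.asarray(h, dtype=float)
+         order = np.argsort(h)                      # ascending sensor values
+         total, prev = 0.0, 0.0
+         for k, idx in enumerate(order):
+             active = frozenset(order[k:].tolist()) # sources still "above" prev
+             total += (h[idx] - prev) * g[active]
+             prev = h[idx]
+         return total
+
+     # toy example with two sensors
+     g = {frozenset(): 0.0, frozenset({0}): 0.4,
+          frozenset({1}): 0.7, frozenset({0, 1}): 1.0}
+     print(choquet_integral([0.2, 0.9], g))         # 0.2*1.0 + 0.7*0.7 = 0.69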
+
+ comment: 10 pages, 7 figures, 7 tables; Accepted to 2024 FUZZ-IEEE and + presented at 2024 IEEE WCCI; Code available at + https://github.com/hvak/Bi-MIChI +
+
+
+
+
+ + ☆ iSeg: An Iterative Refinement-based Framework for Training-free + Segmentation + + +
+ Stable diffusion has demonstrated strong image synthesis ability for given
+ text descriptions, suggesting that it contains strong semantic clues for
+ grouping objects. Inspired by this, researchers have explored employing stable
+ diffusion for training-free segmentation. Most existing approaches either
+ simply employ the cross-attention map or refine it with the self-attention map
+ to generate segmentation masks. We believe that iterative refinement with the
+ self-attention map should lead to better results. However, we empirically
+ demonstrate that such refinement is sub-optimal, likely because the
+ self-attention map contains irrelevant global information that hampers
+ accurately refining the cross-attention map over multiple iterations. To
+ address this, we propose an iterative refinement framework for training-free
+ segmentation, named iSeg, with an entropy-reduced self-attention module that
+ utilizes a gradient descent scheme to reduce the entropy of the self-attention
+ map, thereby suppressing the weak responses corresponding to irrelevant global
+ information. Leveraging the entropy-reduced self-attention module, our iSeg
+ stably improves the refined cross-attention map through iterative refinement.
+ Further, we design a category-enhanced cross-attention module to generate an
+ accurate cross-attention map, providing a better initial input for iterative
+ refinement. Extensive experiments across different datasets and diverse
+ segmentation tasks reveal the merits of the proposed contributions, leading to
+ promising performance on diverse segmentation tasks. For unsupervised semantic
+ segmentation on Cityscapes, our iSeg achieves an absolute gain of 3.8% in terms
+ of mIoU compared to the best existing training-free approach in the literature.
+ Moreover, our proposed iSeg can support segmentation with different kinds of
+ images and interactions.
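+ The entropy-reduction step can be pictured as a few gradient-descent updates
+ that sharpen each row of a self-attention map; an illustrative sketch, not the
+ authors' module (step size and iteration count are arbitrary):
+
+     import torch
+
+     def reduce_attention_entropy(attn, steps=10, lr=0.1):
+         """attn: (N, N) self-attention map whose rows are distributions.
+         Gradient descent on the per-row entropy suppresses weak, spread-out
+         responses that carry irrelevant global information."""
+         logits = attn.clamp_min(1e-8).log().clone().requires_grad_(True)
+         opt = torch.optim.SGD([logits], lr=lr)
+         for _ in range(steps):
+             p = torch.softmax(logits, dim=-1)
+             entropy = -(p * p.clamp_min(1e-8).log()).sum(dim=-1).mean()
+             opt.zero_grad()
+             entropy.backward()
+             opt.step()
+         return torch.softmax(logits, dim=-1).detach()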
+
+
+
+
+ + ☆ TC-LLaVA: Rethinking the Transfer from Image to Video Understanding with + Temporal Considerations + + +
+ Multimodal Large Language Models (MLLMs) have significantly improved +performance across various image-language applications. Recently, there has +been a growing interest in adapting image pre-trained MLLMs for video-related +tasks. However, most efforts concentrate on enhancing the vision encoder and +projector components, while the core part, Large Language Models (LLMs), +remains comparatively under-explored. In this paper, we propose two strategies +to enhance the model's capability in video understanding tasks by improving +inter-layer attention computation in LLMs. Specifically, the first approach +focuses on the enhancement of Rotary Position Embedding (RoPE) with +Temporal-Aware Dual RoPE, which introduces temporal position information to +strengthen the MLLM's temporal modeling capabilities while preserving the +relative position relationships of both visual and text tokens. The second +approach involves enhancing the Attention Mask with the Frame-wise Block Causal +Attention Mask, a simple yet effective method that broadens visual token +interactions within and across video frames while maintaining the causal +inference mechanism. Based on these proposed methods, we adapt LLaVA for video +understanding tasks, naming it Temporal-Considered LLaVA (TC-LLaVA). Our +TC-LLaVA achieves new state-of-the-art performance across various video +understanding benchmarks with only supervised fine-tuning (SFT) on +video-related datasets. + +
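+ The frame-wise block causal mask can be built directly from per-token frame
+ indices: a token attends to every token in its own frame and in earlier frames.
+ A small sketch under that reading (handling of text tokens is omitted):
+
+     import torch
+
+     def frame_block_causal_mask(frame_ids):
+         """frame_ids: (T,) frame index of each visual token, non-decreasing.
+         Returns a (T, T) boolean mask, True = may attend: full attention
+         within a frame, causal across frames."""
+         fi = frame_ids.view(-1, 1)   # query frames
+         fj = frame_ids.view(1, -1)   # key frames
+         return fj <= fi
+
+     print(frame_block_causal_mask(torch.tensor([0, 0, 1, 1, 2])).int())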
+
+
+
+
+ + ☆ Active Fake: DeepFake Camouflage + + +
+ DeepFake technology has gained significant attention due to its ability to
+ manipulate facial attributes with high realism, raising serious societal
+ concerns. Face-Swap DeepFake is the most harmful among these techniques, which
+ fabricates behaviors by swapping original faces with synthesized ones. Existing
+ forensic methods, primarily based on Deep Neural Networks (DNNs), effectively
+ expose these manipulations and have become important authenticity indicators.
+ However, because these methods mainly concentrate on capturing the blending
+ inconsistency in DeepFake faces, a new security issue, termed Active Fake,
+ emerges: individuals can intentionally create blending inconsistencies in their
+ authentic videos to evade responsibility. This tactic is called DeepFake
+ Camouflage. To achieve this, we introduce a new framework for creating DeepFake
+ camouflage that generates blending inconsistencies while ensuring
+ imperceptibility, effectiveness, and transferability. This framework, optimized
+ via an adversarial learning strategy, crafts imperceptible yet effective
+ inconsistencies to mislead forensic detectors. Extensive experiments
+ demonstrate the effectiveness and robustness of our method, highlighting the
+ need for further research in active fake detection.
+
+
+
+
+ + ☆ RoomDiffusion: A Specialized Diffusion Model in the Interior Design + Industry + + +
+ Recent advancements in text-to-image diffusion models have significantly
+ transformed visual content generation, yet their application in specialized
+ fields such as interior design remains underexplored. In this paper, we present
+ RoomDiffusion, a pioneering diffusion model meticulously tailored for the
+ interior design industry. To begin with, we build from scratch a whole data
+ pipeline to update and evaluate data for iterative model optimization.
+ Subsequently, techniques such as multi-aspect training, multi-stage
+ fine-tuning, and model fusion are applied to enhance both the visual appeal and
+ precision of the generated results. Lastly, leveraging latent consistency
+ distillation, we distill and expedite the model for optimal efficiency. Unlike
+ existing models optimized for general scenarios, RoomDiffusion addresses
+ specific challenges in interior design, such as lack of fashion, high furniture
+ duplication rate, and inaccurate style. Through our holistic human evaluation
+ protocol with more than 20 professional human evaluators, RoomDiffusion
+ demonstrates industry-leading performance in terms of aesthetics, accuracy, and
+ efficiency, surpassing all existing open-source models such as Stable Diffusion
+ and SDXL.
+
+
+
+
+ + ☆ PEPL: Precision-Enhanced Pseudo-Labeling for Fine-Grained Image + Classification in Semi-Supervised Learning + + +
+ Fine-grained image classification has witnessed significant advancements with
+ the advent of deep learning and computer vision technologies. However, the
+ scarcity of detailed annotations remains a major challenge, especially in
+ scenarios where obtaining high-quality labeled data is costly or
+ time-consuming. To address this limitation, we introduce the Precision-Enhanced
+ Pseudo-Labeling (PEPL) approach, specifically designed for fine-grained image
+ classification within a semi-supervised learning framework. Our method
+ leverages the abundance of unlabeled data by generating high-quality
+ pseudo-labels that are progressively refined through two key phases: initial
+ pseudo-label generation and semantic-mixed pseudo-label generation. These
+ phases utilize Class Activation Maps (CAMs) to accurately estimate the semantic
+ content and generate refined labels that capture the essential details
+ necessary for fine-grained classification. By focusing on semantic-level
+ information, our approach effectively addresses the limitations of standard
+ data augmentation and image-mixing techniques in preserving critical
+ fine-grained features. We achieve state-of-the-art performance on benchmark
+ datasets, demonstrating significant improvements over existing semi-supervised
+ strategies, with notable boosts in accuracy and robustness. Our code is
+ open-sourced at https://github.com/TianSuya/SemiFG.
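+ The Class Activation Maps used for the semantic estimation above can be
+ computed as a classifier-weighted sum of the final feature maps; an
+ illustrative sketch assuming the standard global-average-pooling CAM setting:
+
+     import torch
+     import torch.nn.functional as F
+
+     def class_activation_map(feature_maps, fc_weight, class_idx, out_size):
+         """feature_maps: (C, h, w) from the last conv block; fc_weight:
+         (num_classes, C) weights of the linear head after global pooling."""
+         cam = torch.einsum("c,chw->hw", fc_weight[class_idx], feature_maps)
+         cam = F.relu(cam)
+         cam = cam / (cam.max() + 1e-8)          # normalize to [0, 1]
+         return F.interpolate(cam[None, None], size=out_size,
+                              mode="bilinear", align_corners=False)[0, 0]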
+
+ comment: Under review +
+
+
+
+
+ + ☆ Perceptual-Distortion Balanced Image Super-Resolution is a + Multi-Objective Optimization Problem + + +
+ Training Single-Image Super-Resolution (SISR) models using pixel-based +regression losses can achieve high distortion metrics scores (e.g., PSNR and +SSIM), but often results in blurry images due to insufficient recovery of +high-frequency details. Conversely, using GAN or perceptual losses can produce +sharp images with high perceptual metric scores (e.g., LPIPS), but may +introduce artifacts and incorrect textures. Balancing these two types of losses +can help achieve a trade-off between distortion and perception, but the +challenge lies in tuning the loss function weights. To address this issue, we +propose a novel method that incorporates Multi-Objective Optimization (MOO) +into the training process of SISR models to balance perceptual quality and +distortion. We conceptualize the relationship between loss weights and image +quality assessment (IQA) metrics as black-box objective functions to be +optimized within our Multi-Objective Bayesian Optimization Super-Resolution +(MOBOSR) framework. This approach automates the hyperparameter tuning process, +reduces overall computational cost, and enables the use of numerous loss +functions simultaneously. Extensive experiments demonstrate that MOBOSR +outperforms state-of-the-art methods in terms of both perceptual quality and +distortion, significantly advancing the perception-distortion Pareto frontier. +Our work points towards a new direction for future research on balancing +perceptual quality and fidelity in nearly all image restoration tasks. The +source code and pretrained models are available at: +https://github.com/ZhuKeven/MOBOSR. + +
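+ The weight-tuning problem above treats the mapping from loss weights to IQA
+ metrics as a black box. The sketch below uses random search with a weighted-sum
+ scalarization as a simple stand-in for the paper's multi-objective Bayesian
+ optimizer; train_and_evaluate is a hypothetical routine returning (PSNR, LPIPS)
+ for a given weighting:
+
+     import random
+
+     def train_and_evaluate(w_pixel, w_perceptual):
+         """Hypothetical: briefly trains an SR model with the given loss
+         weights and returns (psnr, lpips) on a validation split."""
+         raise NotImplementedError
+
+     def search_loss_weights(n_trials=20, alpha=0.5):
+         best, best_score = None, float("-inf")
+         for _ in range(n_trials):
+             w_pix = random.uniform(0.1, 1.0)
+             w_per = random.uniform(0.0, 1.0)
+             psnr, lpips = train_and_evaluate(w_pix, w_per)
+             # reward distortion quality (high PSNR) and perception (low LPIPS)
+             score = alpha * psnr - (1 - alpha) * 100 * lpips
+             if score > best_score:
+                 best, best_score = (w_pix, w_per), score
+         return best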
+
+
+
+
+ + ☆ TropNNC: Structured Neural Network Compression Using Tropical Geometry + + +
+ We present TropNNC, a structured pruning framework for compressing neural +networks with linear and convolutional layers and ReLU activations. Our +approximation is based on a geometrical approach to machine/deep learning, +using tropical geometry and extending the work of Misiakos et al. (2022). We +use the Hausdorff distance of zonotopes in its standard continuous form to +achieve a tighter approximation bound for tropical polynomials compared to +Misiakos et al. (2022). This enhancement allows for superior functional +approximations of neural networks, leading to a more effective compression +algorithm. Our method is significantly easier to implement compared to other +frameworks, and does not depend on the availability of training data samples. +We validate our framework through extensive empirical evaluations on the MNIST, +CIFAR, and ImageNet datasets. Our results demonstrate that TropNNC achieves +performance on par with the state-of-the-art method ThiNet, even surpassing it +in compressing linear layers, and to the best of our knowledge, it is the first +method that achieves this using tropical geometry. + +
+
+
+
+
+ + ☆ HUMOS: Human Motion Model Conditioned on Body Shape ECCV'24 + + +
+ Generating realistic human motion is essential for many computer vision and +graphics applications. The wide variety of human body shapes and sizes greatly +impacts how people move. However, most existing motion models ignore these +differences, relying on a standardized, average body. This leads to uniform +motion across different body types, where movements don't match their physical +characteristics, limiting diversity. To solve this, we introduce a new approach +to develop a generative motion model based on body shape. We show that it's +possible to train this model using unpaired data by applying cycle consistency, +intuitive physics, and stability constraints, which capture the relationship +between identity and movement. The resulting model generates diverse, +physically plausible, and dynamically stable human motions that are both +quantitatively and qualitatively more realistic than current state-of-the-art +methods. More details are available on our project page +https://CarstenEpic.github.io/humos/. + +
+
+ comment: Accepted in ECCV'24. Project page: + https://CarstenEpic.github.io/humos/ +
+
+
+
+
+ + ☆ Deep Clustering of Remote Sensing Scenes through Heterogeneous Transfer + Learning + + +
+ This paper proposes a method for unsupervised whole-image clustering of a +target dataset of remote sensing scenes with no labels. The method consists of +three main steps: (1) finetuning a pretrained deep neural network (DINOv2) on a +labelled source remote sensing imagery dataset and using it to extract a +feature vector from each image in the target dataset, (2) reducing the +dimension of these deep features via manifold projection into a low-dimensional +Euclidean space, and (3) clustering the embedded features using a Bayesian +nonparametric technique to infer the number and membership of clusters +simultaneously. The method takes advantage of heterogeneous transfer learning +to cluster unseen data with different feature and label distributions. We +demonstrate the performance of this approach outperforming state-of-the-art +zero-shot classification methods on several remote sensing scene classification +datasets. + +
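+ Steps (2) and (3) can be approximated with off-the-shelf tools: dimensionality
+ reduction followed by a Dirichlet-process Gaussian mixture that infers the
+ number of clusters. A rough sketch assuming the deep features have already been
+ extracted into an array X (PCA stands in here for the manifold projection):
+
+     from sklearn.decomposition import PCA
+     from sklearn.mixture import BayesianGaussianMixture
+
+     def cluster_deep_features(X, dim=32, max_clusters=50):
+         """X: (n_images, feat_dim) features from a pretrained backbone.
+         Returns cluster labels; the effective number of clusters is inferred
+         and bounded above by max_clusters."""
+         Z = PCA(n_components=dim).fit_transform(X)
+         dpgmm = BayesianGaussianMixture(
+             n_components=max_clusters,
+             weight_concentration_prior_type="dirichlet_process",
+             covariance_type="diag",
+             max_iter=500,
+         )
+         return dpgmm.fit_predict(Z)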
+
+
+
+
+ + ☆ Data-Efficient Generation for Dataset Distillation + + +
+ While deep learning techniques have proven successful in image-related tasks,
+ the exponentially increased data storage and computation costs become a
+ significant challenge. Dataset distillation addresses these challenges by
+ synthesizing only a few images for each class that encapsulate all essential
+ information. Most current methods rely on matching-based optimization, but the
+ resulting synthetic images are not human-readable and the distilled datasets
+ often perform insufficiently on downstream learning tasks. Moreover, the
+ distillation time can quickly get out of bounds when the number of synthetic
+ images per class increases even slightly. To address this, we train a
+ class-conditional latent diffusion model capable of generating realistic
+ synthetic images with labels. Sampling is fast, producing several tens of
+ images per second. We demonstrate that models can be effectively trained using
+ only a small set of synthetic images and evaluated on a large real test set.
+ Our approach achieved rank 1 in the First Dataset Distillation Challenge at
+ ECCV 2024 on the CIFAR100 and TinyImageNet datasets.
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Image Recognition for Garbage Classification Based on Pixel Distribution + Learning + + +
+ The exponential growth in waste production due to rapid economic and +industrial development necessitates efficient waste management strategies to +mitigate environmental pollution and resource depletion. Leveraging +advancements in computer vision, this study proposes a novel approach inspired +by pixel distribution learning techniques to enhance automated garbage +classification. The method aims to address limitations of conventional +convolutional neural network (CNN)-based approaches, including computational +complexity and vulnerability to image variations. We will conduct experiments +using the Kaggle Garbage Classification dataset, comparing our approach with +existing models to demonstrate the strength and efficiency of pixel +distribution learning in automated garbage classification technologies. + +
+
+
+
+
+ + ☆ The Role of Generative Systems in Historical Photography Management: A + Case Study on Catalan Archives ECCV + + +
+ The use of image analysis in automated photography management is an +increasing trend in heritage institutions. Such tools alleviate the human cost +associated with the manual and expensive annotation of new data sources while +facilitating fast access to the citizenship through online indexes and search +engines. However, available tagging and description tools are usually designed +around modern photographs in English, neglecting historical corpora in +minoritized languages, each of which exhibits intrinsic particularities. The +primary objective of this research is to study the quantitative contribution of +generative systems in the description of historical sources. This is done by +contextualizing the task of captioning historical photographs from the Catalan +archives as a case study. Our findings provide practitioners with tools and +directions on transfer learning for captioning models based on visual +adaptation and linguistic proximity. + +
+
+ comment: Accepted at ECCV workshop AI4DH +
+
+
+
+
+ + ☆ On-board Satellite Image Classification for Earth Observation: A + Comparative Study of Pre-Trained Vision Transformer Models + + +
+ Remote sensing image classification is a critical component of Earth
+ observation (EO) systems, traditionally dominated by convolutional neural
+ networks (CNNs) and other deep learning techniques. However, the advent of
+ Transformer-based architectures and large-scale pre-trained models has
+ significantly shifted this landscape, offering enhanced performance and
+ efficiency. This study focuses on identifying the most effective pre-trained
+ model for land use classification in onboard satellite processing, emphasizing
+ high accuracy, computational efficiency, and robustness against noisy data
+ conditions commonly encountered during satellite-based inference. Through
+ extensive experimentation, we compared traditional CNN-based models,
+ ResNet-based models, and various pre-trained vision Transformer models. Our
+ findings demonstrate that pre-trained Transformer models, particularly
+ MobileViTV2 and EfficientViT-M2, outperform models trained from scratch in
+ accuracy and efficiency. These models achieve high performance with reduced
+ computational requirements and exhibit greater resilience during inference
+ under noisy conditions. While MobileViTV2 excelled on clean validation data,
+ EfficientViT-M2 proved more robust when handling noise, making it the most
+ suitable model for onboard satellite Earth observation tasks. In conclusion,
+ EfficientViT-M2 is the optimal choice for reliable and efficient remote sensing
+ image classification in satellite operations, achieving 98.76% accuracy,
+ precision, and recall. Specifically, EfficientViT-M2 delivered the highest
+ performance across all metrics, excelled in training efficiency (1,000s) and
+ inference time (10s), and demonstrated greater robustness (overall robustness
+ score of 0.79).
+
+
+
+
+ + ☆ MVTN: A Multiscale Video Transformer Network for Hand Gesture + Recognition + + +
+ In this paper, we introduce a novel Multiscale Video Transformer Network
+ (MVTN) for dynamic hand gesture recognition, since multiscale features can
+ capture hands of variable size, pose, and shape, which is a key challenge in
+ hand gesture recognition. The proposed model incorporates a multiscale feature
+ hierarchy to capture diverse levels of detail and context within hand gestures,
+ which enhances the model's ability to recognize gestures. This multiscale
+ hierarchy is obtained by extracting attention at different dimensions in
+ different transformer stages, with initial stages modeling high-resolution
+ features and later stages modeling low-resolution features. Our approach also
+ leverages multimodal data, utilizing depth maps, infrared data, and surface
+ normals along with RGB images from the NVGesture and Briareo datasets.
+ Experiments show that the proposed MVTN achieves state-of-the-art results with
+ less computational complexity and fewer parameters. The source code is
+ available at https://github.com/mallikagarg/MVTN.
+
+
+
+
+ + ☆ Recon-all-clinical: Cortical surface reconstruction and analysis of + heterogeneous clinical brain MRI + + +
+ Surface-based analysis of the cerebral cortex is ubiquitous in human +neuroimaging with MRI. It is crucial for cortical registration, parcellation, +and thickness estimation. Traditionally, these analyses require +high-resolution, isotropic scans with good gray-white matter contrast, +typically a 1mm T1-weighted scan. This excludes most clinical MRI scans, which +are often anisotropic and lack the necessary T1 contrast. To enable large-scale +neuroimaging studies using vast clinical data, we introduce recon-all-clinical, +a novel method for cortical reconstruction, registration, parcellation, and +thickness estimation in brain MRI scans of any resolution and contrast. Our +approach employs a hybrid analysis method that combines a convolutional neural +network (CNN) trained with domain randomization to predict signed distance +functions (SDFs) and classical geometry processing for accurate surface +placement while maintaining topological and geometric constraints. The method +does not require retraining for different acquisitions, thus simplifying the +analysis of heterogeneous clinical datasets. We tested recon-all-clinical on +multiple datasets, including over 19,000 clinical scans. The method +consistently produced precise cortical reconstructions and high parcellation +accuracy across varied MRI contrasts and resolutions. Cortical thickness +estimates are precise enough to capture aging effects independently of MRI +contrast, although accuracy varies with slice thickness. Our method is publicly +available at https://surfer.nmr.mgh.harvard.edu/fswiki/recon-all-clinical, +enabling researchers to perform detailed cortical analysis on the huge amounts +of already existing clinical MRI scans. This advancement may be particularly +valuable for studying rare diseases and underrepresented populations where +research-grade MRI data is scarce. + +
+
+ comment: 16 pages in the manuscript with 11 page supplementary material +
+
+
+
+
+ + ☆ The Influence of Faulty Labels in Data Sets on Human Pose Estimation + + +
+ In this study we provide empirical evidence demonstrating that the quality of +training data impacts model performance in Human Pose Estimation (HPE). +Inaccurate labels in widely used data sets, ranging from minor errors to severe +mislabeling, can negatively influence learning and distort performance metrics. +We perform an in-depth analysis of popular HPE data sets to show the extent and +nature of label inaccuracies. Our findings suggest that accounting for the +impact of faulty labels will facilitate the development of more robust and +accurate HPE models for a variety of real-world applications. We show improved +performance with cleansed data. + +
+
+ comment: 15 pages, 7 figures, 5 tables +
+
+
+
+
+ + ☆ Multi-Camera Industrial Open-Set Person Re-Identification and Tracking ECCV 2024 + + +
+ In recent years, the development of deep learning approaches for person
+ re-identification has led to impressive results. However, this comes with
+ limitations for industrial and practical real-world applications. Firstly, most
+ existing works operate in closed-world scenarios, in which the people to
+ re-identify (probes) are compared to a closed set (gallery). Real-world
+ scenarios are often open-set problems in which the gallery is not known a
+ priori, yet the number of open-set approaches in the literature is
+ significantly lower. Secondly, challenges such as multi-camera setups,
+ occlusions, real-time requirements, etc., further constrain the applicability
+ of off-the-shelf methods. This work presents MICRO-TRACK, a Modular Industrial
+ multi-Camera Re-identification and Open-set Tracking system that is real-time,
+ scalable, and easy to integrate into existing industrial surveillance
+ scenarios. Furthermore, we release a novel Re-ID and tracking dataset acquired
+ in an industrial manufacturing facility, dubbed Facility-ReID, consisting of
+ 18-minute videos captured by 8 surveillance cameras.
+
+ comment: Accepted at T-CAP workshop at ECCV 2024 +
+
+
+
+
+ + ☆ Ground-roll Separation From Land Seismic Records Based on Convolutional + Neural Network + + +
+ Ground-roll wave is a common coherent noise in land field seismic data. This
+ Rayleigh-type surface wave usually has low frequency, low apparent velocity,
+ and high amplitude, and therefore obscures the reflection events of seismic
+ shot gathers. Commonly used techniques focus on the differences between
+ ground-roll and reflections in a transformed domain such as the $f-k$ domain,
+ wavelet domain, or curvelet domain. These approaches use a series of fixed
+ atoms or bases to transform the data from the time-space domain into the
+ transformed domain to separate different waveforms, and thus suffer from the
+ complexity of delicately designing the transform-domain filter parameters. To
+ deal with these problems, we propose a convolutional neural network (CNN)-based
+ method that learns to extract the features of ground-roll and reflections
+ automatically from training data. In the proposed method, low-pass-filtered
+ seismic data contaminated by ground-roll is used as the input of the CNN, which
+ then outputs both the ground-roll component and the low-frequency part of the
+ reflection component simultaneously. A discriminative loss is applied together
+ with a similarity loss during training to enhance the similarity of the outputs
+ to their training labels as well as the difference between the two outputs.
+ Experiments are conducted on both synthetic and real data, showing that the
+ CNN-based method can separate ground-roll from reflections effectively and has
+ a certain degree of generalization ability.
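+ The combination of similarity and discriminative losses described above can be
+ written as a reconstruction term for each output plus a term that pushes the
+ two outputs apart; an illustrative sketch only, since the exact losses may
+ differ from the paper's:
+
+     import torch.nn.functional as F
+
+     def separation_loss(pred_ground_roll, pred_reflection,
+                         gt_ground_roll, gt_reflection, lam=0.1):
+         """Similarity: each output should match its training label.
+         Discriminative: the two outputs should not resemble each other."""
+         sim = F.l1_loss(pred_ground_roll, gt_ground_roll) + \
+               F.l1_loss(pred_reflection, gt_reflection)
+         disc = -F.l1_loss(pred_ground_roll, pred_reflection)
+         return sim + lam * disc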
+
+
+
+
+ + ☆ Few-shot Adaptation of Medical Vision-Language Models MICCAI 2024 + + +
+ Integrating image and text data through multi-modal learning has emerged as a
+ new approach in medical imaging research, following its successful deployment
+ in computer vision. While considerable efforts have been dedicated to
+ establishing medical foundation models and their zero-shot transfer to
+ downstream tasks, the popular few-shot setting remains relatively unexplored.
+ Following the currently strong emergence of this setting in computer vision, we
+ introduce the first structured benchmark for adapting medical vision-language
+ models (VLMs) in a strict few-shot regime and investigate various adaptation
+ strategies commonly used in the context of natural images. Furthermore, we
+ evaluate a simple generalization of the linear-probe adaptation baseline, which
+ seeks an optimal blending of the visual prototypes and text embeddings via
+ learnable class-wise multipliers. Surprisingly, such a text-informed linear
+ probe yields competitive performance in comparison to convoluted
+ prompt-learning and adapter-based strategies, while running considerably faster
+ and accommodating the black-box setting. Our extensive experiments span three
+ different medical modalities and specialized foundation models, nine downstream
+ tasks, and several state-of-the-art few-shot adaptation methods. We make our
+ benchmark and code publicly available to trigger further developments in this
+ emergent subject: https://github.com/FereshteShakeri/few-shot-MedVLMs.
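+ The text-informed linear probe can be pictured as class weights that blend each
+ class's text embedding and visual prototype through a learnable per-class
+ multiplier; a minimal torch sketch under that reading, not the authors' exact
+ parameterization:
+
+     import torch
+     import torch.nn as nn
+     import torch.nn.functional as F
+
+     class TextInformedLinearProbe(nn.Module):
+         """Class weight_c = text_c + alpha_c * prototype_c, with alpha learned
+         from the few labeled shots by cross-entropy."""
+         def __init__(self, text_embeds, visual_prototypes):
+             super().__init__()
+             self.register_buffer("text", text_embeds.clone())
+             self.register_buffer("proto", visual_prototypes.clone())
+             self.alpha = nn.Parameter(torch.ones(text_embeds.size(0)))
+
+         def forward(self, image_feats):          # (B, D), L2-normalized
+             w = self.text + self.alpha.unsqueeze(1) * self.proto   # (C, D)
+             w = F.normalize(w, dim=-1)
+             return image_feats @ w.t()           # (B, C) logits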
+
+ comment: MICCAI 2024 (Spotlight) - Code is available at + https://github.com/FereshteShakeri/few-shot-MedVLMs.git +
+
+
+
+
+ + ♻ ☆ Segment Beyond View: Handling Partially Missing Modality for + Audio-Visual Semantic Segmentation AAAI-24 + + +
+ Augmented Reality (AR) devices, emerging as prominent mobile interaction
+ platforms, face challenges in user safety, particularly concerning oncoming
+ vehicles. While some solutions leverage onboard camera arrays, these cameras
+ often have a limited field-of-view (FoV) with front or downward perspectives.
+ Addressing this, we propose a new out-of-view semantic segmentation task and
+ Segment Beyond View (SBV), a novel audio-visual semantic segmentation method.
+ SBV supplements the visual modality, which misses information beyond the FoV,
+ with auditory information using a teacher-student distillation model
+ (Omni2Ego). The model consists of a vision teacher utilising panoramic
+ information, an auditory teacher with 8-channel audio, and an audio-visual
+ student that takes views with limited FoV and binaural audio as input and
+ produces semantic segmentation for objects outside the FoV. SBV outperforms
+ existing models in comparative evaluations and shows consistent performance
+ across varying FoV ranges and in monaural audio settings.
+
+ comment: AAAI-24 (Fixed some errors)
+
+
+
+
+ + ♻ ☆ Mesh2NeRF: Direct Mesh Supervision for Neural Radiance Field + Representation and Generation ECCV 2024 + + +
+ We present Mesh2NeRF, an approach to derive ground-truth radiance fields from +textured meshes for 3D generation tasks. Many 3D generative approaches +represent 3D scenes as radiance fields for training. Their ground-truth +radiance fields are usually fitted from multi-view renderings from a +large-scale synthetic 3D dataset, which often results in artifacts due to +occlusions or under-fitting issues. In Mesh2NeRF, we propose an analytic +solution to directly obtain ground-truth radiance fields from 3D meshes, +characterizing the density field with an occupancy function featuring a defined +surface thickness, and determining view-dependent color through a reflection +function considering both the mesh and environment lighting. Mesh2NeRF extracts +accurate radiance fields which provides direct supervision for training +generative NeRFs and single scene representation. We validate the effectiveness +of Mesh2NeRF across various tasks, achieving a noteworthy 3.12dB improvement in +PSNR for view synthesis in single scene representation on the ABO dataset, a +0.69 PSNR enhancement in the single-view conditional generation of ShapeNet +Cars, and notably improved mesh extraction from NeRF in the unconditional +generation of Objaverse Mugs. + +
+
+ comment: Accepted to ECCV 2024, Project page: + https://terencecyj.github.io/projects/Mesh2NeRF/ Video: + https://youtu.be/SsFkhSuQYGM +
+
+
+
+
+ + ♻ ☆ UniMERNet: A Universal Network for Real-World Mathematical Expression + Recognition + + +
+ The paper introduces the UniMER dataset, marking the first study on +Mathematical Expression Recognition (MER) targeting complex real-world +scenarios. The UniMER dataset includes a large-scale training set, UniMER-1M, +which offers unprecedented scale and diversity with one million training +instances to train high-quality, robust models. Additionally, UniMER features a +meticulously designed, diverse test set, UniMER-Test, which covers a variety of +formula distributions found in real-world scenarios, providing a more +comprehensive and fair evaluation. To better utilize the UniMER dataset, the +paper proposes a Universal Mathematical Expression Recognition Network +(UniMERNet), tailored to the characteristics of formula recognition. UniMERNet +consists of a carefully designed encoder that incorporates detail-aware and +local context features, and an optimized decoder for accelerated performance. +Extensive experiments conducted using the UniMER-1M dataset and UniMERNet +demonstrate that training on the large-scale UniMER-1M dataset can produce a +more generalizable formula recognition model, significantly outperforming all +previous datasets. Furthermore, the introduction of UniMERNet enhances the +model's performance in formula recognition, achieving higher accuracy and +speeds. All data, models, and code are available at +https://github.com/opendatalab/UniMERNet. + +
+
+ comment: Project Website: https://github.com/opendatalab/UniMERNet +
+
+
+
+
+ + ♻ ☆ FDNet: Feature Decoupled Segmentation Network for Tooth CBCT Image + + +
+ Precise Tooth Cone Beam Computed Tomography (CBCT) image segmentation is +crucial for orthodontic treatment planning. In this paper, we propose FDNet, a +Feature Decoupled Segmentation Network, to excel in the face of the variable +dental conditions encountered in CBCT scans, such as complex artifacts and +indistinct tooth boundaries. The Low-Frequency Wavelet Transform (LF-Wavelet) +is employed to enrich the semantic content by emphasizing the global structural +integrity of the teeth, while the SAM encoder is leveraged to refine the +boundary delineation, thus improving the contrast between adjacent dental +structures. By integrating these dual aspects, FDNet adeptly addresses the +semantic gap, providing a detailed and accurate segmentation. The framework's +effectiveness is validated through rigorous benchmarks, achieving the top Dice +and IoU scores of 85.28% and 75.23%, respectively. This innovative decoupling +of semantic and boundary features capitalizes on the unique strengths of each +element to elevate the quality of segmentation performance. + +
+
+ comment: IEEE ISBI 2024, Oral +
+
+
+
+
+ + ♻ ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning
+ community that does not require the collection of raw training data and does
+ not require expensive computation. As model merging becomes increasingly
+ prevalent across various fields, it is crucial to understand the available
+ model merging techniques comprehensively. However, there is a significant gap
+ in the literature regarding a systematic and thorough review of these
+ techniques. This survey provides a comprehensive overview of model merging
+ methods and theories, their applications in various domains and settings, and
+ future research directions. Specifically, we first propose a new taxonomic
+ approach that exhaustively discusses existing model merging methods. Secondly,
+ we discuss the application of model merging techniques in large language
+ models, multimodal large language models, and 10+ machine learning subfields,
+ including continual learning, multi-task learning, few-shot learning, etc.
+ Finally, we highlight the remaining challenges of model merging and discuss
+ future research directions. A comprehensive list of papers about model merging
+ is available at
+ https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications.
+
+
+
+
+ + ♻ ☆ HiVG: Hierarchical Multimodal Fine-grained Modulation for Visual + Grounding ACM MM 2024 + + +
+ Visual grounding, which aims to ground a visual region via natural language,
+ is a task that heavily relies on cross-modal alignment. Existing works utilized
+ uni-modal pre-trained models to transfer visual or linguistic knowledge
+ separately while ignoring the multimodal corresponding information. Motivated
+ by recent advancements in contrastive language-image pre-training and low-rank
+ adaptation (LoRA) methods, we aim to solve the grounding task based on
+ multimodal pre-training. However, there exist significant task gaps between
+ pre-training and grounding. Therefore, to address these gaps, we propose a
+ concise and efficient hierarchical multimodal fine-grained modulation
+ framework, namely HiVG. Specifically, HiVG consists of a multi-layer adaptive
+ cross-modal bridge and a hierarchical multimodal low-rank adaptation (HiLoRA)
+ paradigm. The cross-modal bridge can address the inconsistency between visual
+ features and those required for grounding, and establish a connection between
+ multi-level visual and text features. HiLoRA prevents the accumulation of
+ perceptual errors by adapting the cross-modal features from shallow to deep
+ layers in a hierarchical manner. Experimental results on five datasets
+ demonstrate the effectiveness of our approach and showcase significant
+ grounding capabilities as well as promising energy efficiency advantages. The
+ project page: https://github.com/linhuixiao/HiVG.
+
+ comment: Accepted by ACM MM 2024. The project page: + https://github.com/linhuixiao/HiVG +
+
+
+
+
+ + ♻ ☆ Triple-domain Feature Learning with Frequency-aware Memory Enhancement + for Moving Infrared Small Target Detection + + +
+ As a sub-field of object detection, moving infrared small target detection
+ presents significant challenges due to tiny target sizes and low contrast
+ against backgrounds. Existing methods rely primarily on features extracted from
+ the spatio-temporal domain, while the frequency domain has received little
+ attention, although it has been widely applied in image processing. To extend
+ the feature source domains and enhance feature representation, we propose a new
+ Triple-domain Strategy (Tridos) with frequency-aware memory enhancement on the
+ spatio-temporal domain for infrared small target detection. The scheme
+ effectively detaches and enhances frequency features via a local-global
+ frequency-aware module based on the Fourier transform. Inspired by the human
+ visual system, our memory enhancement is designed to capture the spatial
+ relations of infrared targets among video frames. Furthermore, it encodes
+ temporal dynamic motion features via differential learning and residual
+ enhancement. Additionally, we design a residual compensation to reconcile
+ possible cross-domain feature mismatches. To the best of our knowledge, the
+ proposed Tridos is the first work to comprehensively explore infrared target
+ feature learning across the spatio-temporal-frequency domains. Extensive
+ experiments on three datasets (i.e., DAUB, ITSDT-15K and IRDST) validate that
+ our triple-domain infrared feature learning scheme is consistently superior to
+ state-of-the-art methods. Source code is available at
+ https://github.com/UESTC-nnLab/Tridos.
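+ The frequency-feature extraction can be illustrated with a fixed Fourier-domain
+ split into low- and high-frequency components; this is a hand-crafted stand-in
+ for the learned local-global frequency-aware module, with an assumed circular
+ cutoff:
+
+     import torch
+
+     def split_frequency_features(x, radius=0.25):
+         """x: (B, C, H, W). Returns (low_freq, high_freq) components obtained
+         with a circular mask in the centered 2D Fourier spectrum."""
+         B, C, H, W = x.shape
+         freq = torch.fft.fftshift(torch.fft.fft2(x), dim=(-2, -1))
+         yy, xx = torch.meshgrid(torch.arange(H), torch.arange(W),
+                                 indexing="ij")
+         dist = ((yy - H / 2) ** 2 + (xx - W / 2) ** 2).float().sqrt()
+         mask = (dist <= radius * min(H, W)).to(x.device)
+         low = torch.fft.ifft2(
+             torch.fft.ifftshift(freq * mask, dim=(-2, -1))).real
+         high = torch.fft.ifft2(
+             torch.fft.ifftshift(freq * ~mask, dim=(-2, -1))).real
+         return low, high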
+
+ comment: Accepted by IEEE TGRS
+
+
+
+
+ + ♻ ☆ GarmentCodeData: A Dataset of 3D Made-to-Measure Garments With Sewing + Patterns ECCV 2024 + + +
+ Recent research interest in the learning-based processing of garments, from +virtual fitting to generation and reconstruction, stumbles on a scarcity of +high-quality public data in the domain. We contribute to resolving this need by +presenting the first large-scale synthetic dataset of 3D made-to-measure +garments with sewing patterns, as well as its generation pipeline. +GarmentCodeData contains 115,000 data points that cover a variety of designs in +many common garment categories: tops, shirts, dresses, jumpsuits, skirts, +pants, etc., fitted to a variety of body shapes sampled from a custom +statistical body model based on CAESAR, as well as a standard reference body +shape, applying three different textile materials. To enable the creation of +datasets of such complexity, we introduce a set of algorithms for automatically +taking tailor's measures on sampled body shapes, sampling strategies for sewing +pattern design, and propose an automatic, open-source 3D garment draping +pipeline based on a fast XPBD simulator, while contributing several solutions +for collision resolution and drape correctness to enable scalability. + Project Page: https://igl.ethz.ch/projects/GarmentCodeData/ + +
+
+ comment: Accepted to ECCV 2024. Sept 4th, 2024: release of GarmentCodeData(v2) +
+
+
+
+
+ + ♻ ☆ EaDeblur-GS: Event assisted 3D Deblur Reconstruction with Gaussian + Splatting + + +
+ 3D deblurring reconstruction techniques have recently seen significant +advancements with the development of Neural Radiance Fields (NeRF) and 3D +Gaussian Splatting (3DGS). Although these techniques can recover relatively +clear 3D reconstructions from blurry image inputs, they still face limitations +in handling severe blurring and complex camera motion. To address these issues, +we propose Event-assisted 3D Deblur Reconstruction with Gaussian Splatting +(EaDeblur-GS), which integrates event camera data to enhance the robustness of +3DGS against motion blur. By employing an Adaptive Deviation Estimator (ADE) +network to estimate Gaussian center deviations and using novel loss functions, +EaDeblur-GS achieves sharp 3D reconstructions in real-time, demonstrating +performance comparable to state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Shapley Values-enabled Progressive Pseudo Bag Augmentation for Whole + Slide Image Classification + + +
+ In computational pathology, whole-slide image (WSI) classification presents a +formidable challenge due to its gigapixel resolution and limited fine-grained +annotations. Multiple-instance learning (MIL) offers a weakly supervised +solution, yet refining instance-level information from bag-level labels remains +challenging. While most of the conventional MIL methods use attention scores to +estimate instance importance scores (IIS) which contribute to the prediction of +the slide labels, these often lead to skewed attention distributions and +inaccuracies in identifying crucial instances. To address these issues, we +propose a new approach inspired by cooperative game theory: employing Shapley +values to assess each instance's contribution, thereby improving IIS +estimation. The computation of the Shapley value is then accelerated using +attention, meanwhile retaining the enhanced instance identification and +prioritization. We further introduce a framework for the progressive assignment +of pseudo bags based on estimated IIS, encouraging more balanced attention +distributions in MIL models. Our extensive experiments on CAMELYON-16, BRACS, +TCGA-LUNG, and TCGA-BRCA datasets show our method's superiority over existing +state-of-the-art approaches, offering enhanced interpretability and class-wise +insights. Our source code is available at https://github.com/RenaoYan/PMIL. + +
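+ Shapley-value-based instance importance can be approximated by Monte Carlo
+ sampling over permutations, measuring the marginal change in the bag score when
+ an instance joins a coalition. A generic sketch; the attention-based
+ acceleration is not reproduced, and bag_score is a hypothetical bag-level
+ classifier:
+
+     import random
+
+     def shapley_instance_importance(instances, bag_score, n_permutations=100):
+         """instances: list of instance features; bag_score: callable mapping a
+         list of instances to a bag probability. Returns one Shapley estimate
+         per instance."""
+         n = len(instances)
+         values = [0.0] * n
+         for _ in range(n_permutations):
+             order = list(range(n))
+             random.shuffle(order)
+             coalition, prev = [], bag_score([])
+             for idx in order:
+                 coalition.append(instances[idx])
+                 cur = bag_score(coalition)
+                 values[idx] += (cur - prev) / n_permutations
+                 prev = cur
+         return values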
+
+ comment: IEEE TRANSACTIONS ON MEDICAL IMAGING 2024 +
+
+
+
+
+ + ♻ ☆ EgoHDM: An Online Egocentric-Inertial Human Motion Capture, + Localization, and Dense Mapping System + + +
+ We present EgoHDM, an online egocentric-inertial human motion capture +(mocap), localization, and dense mapping system. Our system uses 6 inertial +measurement units (IMUs) and a commodity head-mounted RGB camera. EgoHDM is the +first human mocap system that offers dense scene mapping in near real-time. +Further, it is fast and robust to initialize and fully closes the loop between +physically plausible map-aware global human motion estimation and mocap-aware +3D scene reconstruction. Our key idea is integrating camera localization and +mapping information with inertial human motion capture bidirectionally in our +system. To achieve this, we design a tightly coupled mocap-aware dense bundle +adjustment and physics-based body pose correction module leveraging a local +body-centric elevation map. The latter introduces a novel terrain-aware contact +PD controller, which enables characters to physically contact the given local +elevation map thereby reducing human floating or penetration. We demonstrate +the performance of our system on established synthetic and real-world +benchmarks. The results show that our method reduces human localization, camera +pose, and mapping accuracy error by 41%, 71%, 46%, respectively, compared to +the state of the art. Our qualitative evaluations on newly captured data +further demonstrate that EgoHDM can cover challenging scenarios in non-flat +terrain including stepping over stairs and outdoor scenes in the wild. + +
+
+ comment: Project Page: https://handiyin.github.io/EgoHDM/ +
+
+
+
+
+ + ♻ ☆ Enhancing Facial Expression Recognition through Dual-Direction Attention + Mixed Feature Networks: Application to 7th ABAW Challenge + + +
+ We present our contribution to the 7th ABAW challenge at ECCV 2024. By
+ utilizing a Dual-Direction Attention Mixed Feature Network (DDAMFN) for
+ multitask facial expression recognition, we achieve results far beyond the
+ proposed baseline for the Multi-Task ABAW challenge. Our proposal uses the
+ well-known DDAMFN architecture as a base to effectively predict
+ valence-arousal, emotion recognition, and facial action units. We demonstrate
+ the architecture's ability to handle these tasks simultaneously, providing
+ insights into its architecture and the rationale behind its design.
+ Additionally, we compare our results for the multitask solution with
+ independent single-task performance.
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ Human-AI Collaborative Multi-modal Multi-rater Learning for + Endometriosis Diagnosis + + +
+ Endometriosis, affecting about 10% of individuals assigned female at birth, is
+ challenging to diagnose and manage. Diagnosis typically involves the
+ identification of various signs of the disease using either laparoscopic
+ surgery or the analysis of T1/T2 MRI images, with the latter being quicker and
+ cheaper but less accurate. A key diagnostic sign of endometriosis is the
+ obliteration of the Pouch of Douglas (POD). However, even experienced
+ clinicians struggle with accurately classifying POD obliteration from MRI
+ images, which complicates the training of reliable AI models. In this paper, we
+ introduce the Human-AI Collaborative Multi-modal Multi-rater Learning (HAICOMM)
+ methodology to address the challenge above. HAICOMM is the first method that
+ explores three important aspects of this problem: 1) multi-rater learning to
+ extract a cleaner label from the multiple "noisy" labels available per training
+ sample; 2) multi-modal learning to leverage the presence of T1/T2 MRI images
+ for training and testing; and 3) human-AI collaboration to build a system that
+ leverages the predictions from clinicians and the AI model to provide more
+ accurate classification than standalone clinicians and AI models. Presenting
+ results on the multi-rater T1/T2 MRI endometriosis dataset that we collected to
+ validate our methodology, the proposed HAICOMM model outperforms an ensemble of
+ clinicians, noisy-label learning models, and multi-rater learning methods.
+
+
+
+
+ + ♻ ☆ Giving each task what it needs -- leveraging structured sparsity for + tailored multi-task learning ECCV 2024 + + +
+ In the Multi-task Learning (MTL) framework, every task demands distinct
+ feature representations, ranging from low-level to high-level attributes. It is
+ vital to address the specific (feature/parameter) needs of each task,
+ especially in computationally constrained environments. This work, therefore,
+ introduces Layer-Optimized Multi-Task (LOMT) models that utilize structured
+ sparsity to refine feature selection for individual tasks and enhance the
+ performance of all tasks in a multi-task scenario. Structured or group sparsity
+ systematically eliminates parameters from trivial channels and sometimes,
+ eventually, entire layers within a convolutional neural network during
+ training. Consequently, the remaining layers provide the most suitable features
+ for a given task. In this two-step approach, we subsequently leverage this
+ sparsity-induced optimal layer information to build the LOMT models by
+ connecting task-specific decoders to these strategically identified layers,
+ deviating from conventional approaches that uniformly connect decoders at the
+ end of the network. This tailored architecture optimizes the network, focusing
+ on essential features while reducing redundancy. We validate the efficacy of
+ the proposed approach on two datasets, NYU-v2 and CelebAMask-HD, for multiple
+ heterogeneous tasks. A detailed performance analysis of the LOMT models, in
+ contrast to the conventional MTL models, reveals that the LOMT models
+ outperform the conventional models for most task combinations. The excellent
+ qualitative and quantitative outcomes highlight the effectiveness of employing
+ structured sparsity for optimal layer (or feature) selection.
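+ The structured (group) sparsity that drives the layer selection can be
+ expressed as a group-lasso penalty over the output channels of each
+ convolution; a short sketch of such a regularizer (the grouping granularity and
+ weight are assumptions):
+
+     import torch.nn as nn
+
+     def group_sparsity_penalty(model, weight=1e-4):
+         """Sum of L2 norms of each conv output channel's weights: channels
+         whose group norm is driven to ~0 can be pruned, and layers whose
+         channels all vanish drop out entirely."""
+         penalty = 0.0
+         for m in model.modules():
+             if isinstance(m, nn.Conv2d):
+                 # (out_channels, in_channels * kH * kW): one group per channel
+                 w = m.weight.view(m.weight.size(0), -1)
+                 penalty = penalty + w.norm(dim=1).sum()
+         return weight * penalty
+
+     # usage: loss = task_loss + group_sparsity_penalty(model)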
+
+ comment: Accepted at ECCV 2024 workshop - Computational Aspects of Deep + Learning +
+
+
+
+
+ + ♻ ☆ Kernel Adversarial Learning for Real-world Image Super-resolution + + +
+ Current deep image super-resolution (SR) approaches aim to restore +high-resolution images from down-sampled images or by assuming degradation from +simple Gaussian kernels and additive noises. However, these techniques only +assume crude approximations of the real-world image degradation process, which +should involve complex kernels and noise patterns that are difficult to model +using simple assumptions. In this paper, we propose a more realistic process to +synthesise low-resolution images for real-world image SR by introducing a new +Kernel Adversarial Learning Super-resolution (KASR) framework. In the proposed +framework, degradation kernels and noises are adaptively modelled rather than +explicitly specified. Moreover, we also propose a high-frequency selective +objective and an iterative supervision process to further boost the model SR +reconstruction accuracy. Extensive experiments validate the effectiveness of +the proposed framework on real-world datasets. + +
+
+
+
+
+ + ♻ ☆ Loopy: Taming Audio-Driven Portrait Avatar with Long-Term Motion + Dependency + + +
+ With the introduction of diffusion-based video generation techniques, +audio-conditioned human video generation has recently achieved significant +breakthroughs in both the naturalness of motion and the synthesis of portrait +details. Due to the limited control of audio signals in driving human motion, +existing methods often add auxiliary spatial signals to stabilize movements, +which may compromise the naturalness and freedom of motion. In this paper, we +propose an end-to-end audio-only conditioned video diffusion model named Loopy. +Specifically, we designed an inter- and intra-clip temporal module and an +audio-to-latents module, enabling the model to leverage long-term motion +information from the data to learn natural motion patterns and improving +audio-portrait movement correlation. This method removes the need for manually +specified spatial motion templates used in existing methods to constrain motion +during inference. Extensive experiments show that Loopy outperforms recent +audio-driven portrait diffusion models, delivering more lifelike and +high-quality results across various scenarios. + +
+
+ comment: Homepage: https://loopyavatar.github.io/ +
+
+
+
+
+ + ♻ ☆ G-Style: Stylized Gaussian Splatting + + +
+ We introduce G-Style, a novel algorithm designed to transfer the style of an +image onto a 3D scene represented using Gaussian Splatting. Gaussian Splatting +is a powerful 3D representation for novel view synthesis, as -- compared to +other approaches based on Neural Radiance Fields -- it provides fast scene +renderings and user control over the scene. Recent pre-prints have demonstrated +that the style of Gaussian Splatting scenes can be modified using an image +exemplar. However, since the scene geometry remains fixed during the +stylization process, current solutions fall short of producing satisfactory +results. Our algorithm aims to address these limitations by following a +three-step process: In a pre-processing step, we remove undesirable Gaussians +with large projection areas or highly elongated shapes. Subsequently, we +combine several losses carefully designed to preserve different scales of the +style in the image, while maintaining as much as possible the integrity of the +original scene content. During the stylization process and following the +original design of Gaussian Splatting, we split Gaussians where additional +detail is necessary within our scene by tracking the gradient of the stylized +color. Our experiments demonstrate that G-Style generates high-quality +stylizations within just a few minutes, outperforming existing methods both +qualitatively and quantitatively. + +
+
+
+
+
+ + ♻ ☆ MCDS-VSS: Moving Camera Dynamic Scene Video Semantic Segmentation by + Filtering with Self-Supervised Geometry and Motion BMVC 2024 + + +
+ Autonomous systems, such as self-driving cars, rely on reliable semantic +environment perception for decision making. Despite great advances in video +semantic segmentation, existing approaches ignore important inductive biases +and lack structured and interpretable internal representations. In this work, +we propose MCDS-VSS, a structured filter model that learns in a self-supervised +manner to estimate scene geometry and ego-motion of the camera, while also +estimating the motion of external objects. Our model leverages these +representations to improve the temporal consistency of semantic segmentation +without sacrificing segmentation accuracy. MCDS-VSS follows a prediction-fusion +approach in which scene geometry and camera motion are first used to compensate +for ego-motion, then residual flow is used to compensate motion of dynamic +objects, and finally the predicted scene features are fused with the current +features to obtain a temporally consistent scene segmentation. Our model parses +automotive scenes into multiple decoupled interpretable representations such as +scene geometry, ego-motion, and object motion. Quantitative evaluation shows +that MCDS-VSS achieves superior temporal consistency on video sequences while +retaining competitive segmentation performance. + +
+
+ comment: Accepted for publication at BMVC 2024 +
+
+
+
+
+
+ ♻ ☆ BEVal: A Cross-dataset Evaluation Study of BEV Segmentation Models for
+ Autonomous Driving
+
+
+ Current research in semantic bird's-eye view segmentation for autonomous
+driving focuses solely on optimizing neural network models using a single
+dataset, typically nuScenes. This practice leads to the development of highly
+specialized models that may fail when faced with different environments or
+sensor setups, a problem known as domain shift. In this paper, we conduct a
+comprehensive cross-dataset evaluation of state-of-the-art BEV segmentation
+models to assess their performance across different training and testing
+datasets and setups, as well as different semantic categories. We investigate
+the influence of different sensors, such as cameras and LiDAR, on the models'
+ability to generalize to diverse conditions and scenarios. Additionally, we
+conduct multi-dataset training experiments that improve models' BEV
+segmentation performance compared to single-dataset training. Our work
+addresses the gap in evaluating BEV segmentation models under cross-dataset
+validation, and our findings underscore the importance of enhancing model
+generalizability and adaptability to ensure more robust and reliable BEV
+segmentation approaches for autonomous driving applications. The code for this
+paper is available at https://github.com/manueldiaz96/beval.
+
+
+
+
+
+ + ♻ ☆ Adaptive Explicit Knowledge Transfer for Knowledge Distillation + + +
+ Logit-based knowledge distillation (KD) for classification is cost-efficient
+compared to feature-based KD but is often subject to inferior performance.
+Recently, it was shown that the performance of logit-based KD can be improved
+by effectively delivering the probability distribution over the non-target
+classes from the teacher model, known as 'implicit (dark) knowledge', to the
+student model. Through gradient analysis, we first show that this in fact
+adaptively controls the learning of implicit knowledge. Then, we propose a new
+loss that enables the student to learn explicit knowledge (i.e., the teacher's
+confidence about the target class) along with implicit knowledge in an
+adaptive manner. Furthermore, we propose to separate the classification and
+distillation tasks for effective distillation and inter-class relationship
+modeling. Experimental results demonstrate that the proposed adaptive explicit
+knowledge transfer (AEKT) method achieves improved performance compared to
+state-of-the-art KD methods on the CIFAR-100 and ImageNet datasets.
+
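+ The explicit/implicit split above can be illustrated with a decoupled logit
+distillation loss. The sketch below is a minimal, generic version of that idea
+(not the authors' exact AEKT loss or its adaptive weighting): the teacher's
+target-class confidence is matched separately from the renormalized
+non-target distribution.
+```python
+import torch
+import torch.nn.functional as F
+
+def decoupled_kd_loss(student_logits, teacher_logits, labels, T=4.0):
+    # "Explicit" part: match the binary split [p_target, 1 - p_target]
+    # between teacher and student.
+    p_s = F.softmax(student_logits / T, dim=1)
+    p_t = F.softmax(teacher_logits / T, dim=1)
+    idx = labels.unsqueeze(1)
+    pt_s = p_s.gather(1, idx)              # student prob. of target class
+    pt_t = p_t.gather(1, idx)              # teacher prob. of target class
+    bin_s = torch.cat([pt_s, 1 - pt_s], dim=1).clamp_min(1e-8)
+    bin_t = torch.cat([pt_t, 1 - pt_t], dim=1).clamp_min(1e-8)
+    explicit = F.kl_div(bin_s.log(), bin_t, reduction="batchmean")
+
+    # "Implicit" part: KL over the renormalized distribution restricted
+    # to the non-target classes.
+    mask = torch.ones_like(p_s).scatter_(1, idx, 0.0)
+    nt_s = (p_s * mask) / (1 - pt_s).clamp_min(1e-8)
+    nt_t = (p_t * mask) / (1 - pt_t).clamp_min(1e-8)
+    nt_s = nt_s + (1 - mask)               # dummy 1.0 at the target slot
+    nt_t = nt_t + (1 - mask)               # contributes zero to the KL
+    implicit = F.kl_div(nt_s.clamp_min(1e-8).log(), nt_t,
+                        reduction="batchmean")
+    return (explicit + implicit) * T * T
+
+s, t = torch.randn(4, 10), torch.randn(4, 10)
+y = torch.randint(0, 10, (4,))
+print(decoupled_kd_loss(s, t, y).item())
+```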
+
+ comment: 19 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Boosting Adversarial Transferability for Skeleton-based Action + Recognition via Exploring the Model Posterior Space + + +
+ Skeletal motion plays a pivotal role in human activity recognition (HAR).
+Recently, attack methods have been proposed to identify the universal
+vulnerability of skeleton-based HAR (S-HAR). However, research on adversarial
+transferability for S-HAR is largely missing. More importantly, existing
+attacks all struggle to transfer across unknown S-HAR models. We observe that
+the key reason is that the loss landscape of the action recognizers is rugged
+and sharp. Given the established correlation in prior
+studies~\cite{qin2022boosting,wu2020towards} between loss landscape and
+adversarial transferability, we assume and empirically validate that smoothing
+the loss landscape could potentially improve adversarial transferability on
+S-HAR. This is achieved by proposing a new post-train Dual Bayesian strategy,
+which can effectively explore the model posterior space for a collection of
+surrogates without the need for re-training. Furthermore, to craft adversarial
+examples along the motion manifold, we incorporate the attack gradient with
+information of the motion dynamics in a Bayesian manner. Evaluated on benchmark
+datasets, e.g., HDM05 and NTU 60, the average transfer success rate reaches as
+high as 35.9% and 45.5%, respectively. In comparison, current state-of-the-art
+skeletal attacks achieve only 3.6% and 9.8%. The high adversarial
+transferability remains consistent across various surrogate, victim, and even
+defense models. Through a comprehensive analysis of the results, we provide
+insights into which surrogates are more likely to exhibit transferability,
+shedding light on future research.
+
+
+ comment: We have submitted a new version of our work at arXiv:2409.02483. This + version, arXiv:2407.08572, is no longer valid. Any update for this work will + be conducted in arXiv:2409.02483 +
+
+
+
+
+ + ♻ ☆ 3D Single-object Tracking in Point Clouds with High Temporal Variation ECCV24 + + +
+ The high temporal variation of the point clouds is the key challenge of 3D +single-object tracking (3D SOT). Existing approaches rely on the assumption +that the shape variation of the point clouds and the motion of the objects +across neighboring frames are smooth, failing to cope with high temporal +variation data. In this paper, we present a novel framework for 3D SOT in point +clouds with high temporal variation, called HVTrack. HVTrack proposes three +novel components to tackle the challenges in the high temporal variation +scenario: 1) A Relative-Pose-Aware Memory module to handle temporal point cloud +shape variations; 2) a Base-Expansion Feature Cross-Attention module to deal +with similar object distractions in expanded search areas; 3) a Contextual +Point Guided Self-Attention module for suppressing heavy background noise. We +construct a dataset with high temporal variation (KITTI-HV) by setting +different frame intervals for sampling in the KITTI dataset. On the KITTI-HV +with 5 frame intervals, our HVTrack surpasses the state-of-the-art tracker +CXTracker by 11.3%/15.7% in Success/Precision. + +
+
+ comment: Accepted by ECCV24 +
+
+
+
+
+ + ♻ ☆ More Text, Less Point: Towards 3D Data-Efficient Point-Language + Understanding + + +
+ Enabling Large Language Models (LLMs) to comprehend the 3D physical world
+remains a significant challenge. Due to the lack of large-scale 3D-text pair
+datasets, the success of LLMs has yet to be replicated in 3D understanding. In
+this paper, we rethink this issue and propose a new task: 3D Data-Efficient
+Point-Language Understanding. The goal is to enable LLMs to achieve robust 3D
+object understanding with minimal 3D point cloud and text data pairs. To
+address this task, we introduce GreenPLM, which leverages more text data to
+compensate for the lack of 3D data. First, inspired by using CLIP to align
+images and text, we utilize a pre-trained point cloud-text encoder to map the
+3D point cloud space to the text space. This mapping allows us to seamlessly
+connect the text space with LLMs. Once the point-text-LLM connection is
+established, we further enhance text-LLM alignment by expanding the
+intermediate text space, thereby reducing the reliance on 3D point cloud data.
+Specifically, we generate 6M free-text descriptions of 3D objects, and design a
+three-stage training strategy to help LLMs better explore the intrinsic
+connections between different modalities. To achieve efficient modality
+alignment, we design a zero-parameter cross-attention module for token pooling.
+Extensive experimental results show that GreenPLM requires only 12% of the 3D
+training data used by existing state-of-the-art models to achieve superior 3D
+understanding. Remarkably, GreenPLM also achieves competitive performance using
+text-only data. The code and weights are available at:
+https://github.com/TangYuan96/GreenPLM.
+
+
+
+
+
+ + ♻ ☆ Prediction of soil fertility parameters using USB-microscope imagery and + portable X-ray fluorescence spectrometry + + +
+ This study investigated the use of portable X-ray fluorescence (PXRF) +spectrometry and soil image analysis for rapid soil fertility assessment, with +a focus on key indicators such as available boron (B), organic carbon (OC), +available manganese (Mn), available sulfur (S), and the sulfur availability +index (SAI). A total of 1,133 soil samples from diverse agro-climatic zones in +Eastern India were analyzed. The research integrated color and texture features +from microscopic soil images, PXRF data, and auxiliary soil variables (AVs) +using a Random Forest model. Results showed that combining image features (IFs) +with AVs significantly improved prediction accuracy for available B (R2 = 0.80) +and OC (R2 = 0.88). A data fusion approach, incorporating IFs, AVs, and PXRF +data, further enhanced predictions for available Mn and SAI, with R2 values of +0.72 and 0.70, respectively. The study highlights the potential of integrating +these technologies to offer rapid, cost-effective soil testing methods, paving +the way for more advanced predictive models and a deeper understanding of soil +fertility. Future work should explore the application of deep learning models +on a larger dataset, incorporating soils from a wider range of agro-climatic +zones under field conditions. + +
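+ A minimal sketch of the data-fusion setup described above, with synthetic
+stand-in features (the actual study uses color/texture image features, PXRF
+readings, and auxiliary soil variables): concatenate the feature blocks and
+fit a Random Forest per target soil property.
+```python
+import numpy as np
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import cross_val_score
+
+rng = np.random.default_rng(0)
+n = 200
+image_feats = rng.normal(size=(n, 12))   # e.g., color/texture statistics (IFs)
+pxrf_feats = rng.normal(size=(n, 8))     # e.g., elemental concentrations
+aux_vars = rng.normal(size=(n, 3))       # e.g., auxiliary soil variables (AVs)
+y = rng.normal(size=n)                   # stand-in target, e.g. available B
+
+# Data fusion: simple feature-level concatenation of the three blocks.
+X_fused = np.hstack([image_feats, pxrf_feats, aux_vars])
+model = RandomForestRegressor(n_estimators=300, random_state=0)
+r2_scores = cross_val_score(model, X_fused, y, cv=5, scoring="r2")
+print("cross-validated R2:", r2_scores.mean())
+```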
+
+ comment: Published in 'Soil Advances' +
+
+
+
+
+ + ♻ ☆ D-SCo: Dual-Stream Conditional Diffusion for Monocular Hand-Held Object + Reconstruction ECCV 2024 + + +
+ Reconstructing hand-held objects from a single RGB image is a challenging
+task in computer vision. In contrast to prior works that utilize deterministic
+modeling paradigms, we employ a point cloud denoising diffusion model to
+account for the probabilistic nature of this problem. At its core, we introduce
+centroid-fixed dual-stream conditional diffusion for monocular hand-held object
+reconstruction (D-SCo), tackling two predominant challenges. First, to prevent
+the object centroid from deviating, we utilize a novel hand-constrained
+centroid fixing paradigm, enhancing the stability of the diffusion and reverse
+processes and the precision of feature projection. Second, we introduce a
+dual-stream denoiser to semantically and geometrically model hand-object
+interactions with a novel unified hand-object semantic embedding, enhancing the
+reconstruction performance of the hand-occluded region of the object.
+Experiments on the synthetic ObMan dataset and three real-world datasets, HO3D,
+MOW, and DexYCB, demonstrate that our approach can surpass all other
+state-of-the-art methods.
+
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ AICAttack: Adversarial Image Captioning Attack with Attention-Based + Optimization + + +
+ Recent advances in deep learning research have shown remarkable achievements +across many tasks in computer vision (CV) and natural language processing +(NLP). At the intersection of CV and NLP is the problem of image captioning, +where the related models' robustness against adversarial attacks has not been +well studied. This paper presents a novel adversarial attack strategy, +AICAttack (Attention-based Image Captioning Attack), designed to attack image +captioning models through subtle perturbations on images. Operating within a +black-box attack scenario, our algorithm requires no access to the target +model's architecture, parameters, or gradient information. We introduce an +attention-based candidate selection mechanism that identifies the optimal +pixels to attack, followed by a customised differential evolution method to +optimise the perturbations of pixels' RGB values. We demonstrate AICAttack's +effectiveness through extensive experiments on benchmark datasets against +multiple victim models. The experimental results demonstrate that our method +outperforms current leading-edge techniques by achieving consistently higher +attack success rates. + +
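+ A toy sketch of the black-box search loop: perturb the RGB values of a few
+pre-selected pixels and optimize the perturbation with differential evolution
+using only the victim's score. The random pixel picker and victim_score below
+are stand-ins; the paper selects pixels with an attention mechanism and scores
+generated captions against references.
+```python
+import numpy as np
+from scipy.optimize import differential_evolution
+
+H, W, k = 32, 32, 5
+image = np.random.rand(H, W, 3)
+# Stand-in for the attention-based pixel selector.
+pixels = np.random.randint(0, H * W, size=k)
+
+def victim_score(img):
+    # Stand-in for a caption-quality score returned by the black-box victim
+    # (e.g., similarity between the generated and reference captions).
+    return float(img.mean())
+
+def objective(x):
+    perturbed = image.copy().reshape(-1, 3)
+    perturbed[pixels] = np.clip(perturbed[pixels] + x.reshape(k, 3), 0, 1)
+    # Minimizing the victim's score degrades the caption.
+    return victim_score(perturbed.reshape(H, W, 3))
+
+bounds = [(-0.3, 0.3)] * (k * 3)          # bounded RGB perturbations
+result = differential_evolution(objective, bounds, maxiter=20, seed=0)
+print("best objective:", result.fun)
+```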
+
+
+
+
+ + ♻ ☆ LinFusion: 1 GPU, 1 Minute, 16K Image + + +
+ Modern diffusion models, particularly those utilizing a Transformer-based
+UNet for denoising, rely heavily on self-attention operations to manage complex
+spatial relationships, thus achieving impressive generation performance.
+However, this existing paradigm faces significant challenges in generating
+high-resolution visual content due to its quadratic time and memory complexity
+with respect to the number of spatial tokens. To address this limitation, we
+propose a novel linear attention mechanism as an alternative in this paper.
+Specifically, we begin our exploration with recently introduced models with
+linear complexity, e.g., Mamba2, RWKV6, and Gated Linear Attention, and
+identify two key features, attention normalization and non-causal inference,
+that enhance high-resolution visual generation performance. Building on these
+insights, we introduce a generalized linear attention paradigm, which serves as
+a low-rank approximation of a wide spectrum of popular linear token mixers. To
+reduce training cost and better leverage pre-trained models, we initialize our
+models with, and distill knowledge from, pre-trained StableDiffusion (SD). We
+find that the distilled model, termed LinFusion, achieves performance on par
+with or superior to the original SD after only modest training, while
+significantly reducing time and memory complexity. Extensive experiments on
+SD-v1.5, SD-v2.1, and SD-XL demonstrate that LinFusion delivers satisfactory
+zero-shot cross-resolution generation performance, generating high-resolution
+images at up to 16K resolution. Moreover, it is highly compatible with
+pre-trained SD components, such as ControlNet and IP-Adapter, requiring no
+adaptation efforts. Code is available at https://github.com/Huage001/LinFusion.
+
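+ The normalization and non-causal inference identified above are the two
+ingredients of standard kernelized linear attention; the sketch below is that
+generic form (not LinFusion's generalized low-rank mixer), with cost linear
+rather than quadratic in the number of tokens.
+```python
+import torch
+import torch.nn.functional as F
+
+def linear_attention(q, k, v, eps=1e-6):
+    # q, k, v: (batch, tokens, dim); non-causal, normalized linear attention.
+    phi_q = F.elu(q) + 1.0                            # positive feature map
+    phi_k = F.elu(k) + 1.0
+    kv = torch.einsum("bnd,bne->bde", phi_k, v)       # (B, d, d_v) summary
+    z = torch.einsum("bnd,bd->bn", phi_q, phi_k.sum(dim=1)) + eps
+    out = torch.einsum("bnd,bde->bne", phi_q, kv) / z.unsqueeze(-1)
+    return out
+
+q = torch.randn(2, 4096, 64)
+k = torch.randn(2, 4096, 64)
+v = torch.randn(2, 4096, 64)
+print(linear_attention(q, k, v).shape)   # torch.Size([2, 4096, 64])
+```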
+
+ comment: Work in Progress. Codes are available at + https://github.com/Huage001/LinFusion +
+
+
+
+
+ + ♻ ☆ UVL2: A Unified Framework for Video Tampering Localization + + +
+ With the advancement of deep learning-driven video editing technology,
+security risks have emerged. Malicious video tampering can lead to public
+misunderstanding, property losses, and legal disputes. Current detection
+methods are mostly limited to specific datasets, offer limited detection
+performance on unknown forgeries, and lack robustness to processed data. This
+paper proposes an effective video tampering localization network that
+significantly improves the detection performance of video inpainting and
+splicing by extracting more generalized features of forgery traces. Considering
+the inherent differences between tampered videos and original videos, such as
+edge artifacts, pixel distribution, texture features, and compression
+information, we specifically design four modules to independently extract these
+features. Furthermore, to seamlessly integrate these features, we employ a
+two-stage approach utilizing both a Convolutional Neural Network and a Vision
+Transformer, enabling us to learn these features in a local-to-global manner.
+Experimental results demonstrate that the method significantly outperforms
+existing state-of-the-art methods and exhibits robustness.
+
+
+
+
+
+
+ ♻ ☆ LuSNAR: A Lunar Segmentation, Navigation and Reconstruction Dataset based
+ on Multi-sensor Data for Autonomous Exploration
+
+
+ As lunar exploration missions become more complex, lunar rovers need a higher
+level of autonomy. Environmental perception and navigation algorithms are the
+foundation for lunar rovers to achieve autonomous exploration. The development
+and verification of algorithms require highly reliable data support. Most
+existing lunar datasets are targeted at a single task, lacking diverse scenes
+and high-precision ground truth labels. To address this issue, we propose a
+multi-task, multi-scene, and multi-label lunar benchmark dataset, LuSNAR. This
+dataset can be used for comprehensive evaluation of autonomous perception and
+navigation systems, and includes high-resolution stereo image pairs, panoramic
+semantic labels, dense depth maps, LiDAR point clouds, and the rover's
+position. In order to provide richer scene data, we built 9 lunar simulation
+scenes based on Unreal Engine. Each scene is divided according to topographic
+relief and the density of objects. To verify the usability of the dataset, we
+evaluated and analyzed algorithms for semantic segmentation, 3D reconstruction,
+and autonomous navigation. The experimental results show that the dataset
+proposed in this paper can be used for ground verification of tasks such as
+autonomous environment perception and navigation, and it provides a lunar
+benchmark dataset for evaluating algorithm metrics. We make LuSNAR publicly
+available at: https://github.com/autumn999999/LuSNAR-dataset.
+
+
+ comment: 19 pages, 13 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ A Survey for Foundation Models in Autonomous Driving + + +
+ The advent of foundation models has revolutionized the fields of natural +language processing and computer vision, paving the way for their application +in autonomous driving (AD). This survey presents a comprehensive review of more +than 40 research papers, demonstrating the role of foundation models in +enhancing AD. Large language models contribute to planning and simulation in +AD, particularly through their proficiency in reasoning, code generation and +translation. In parallel, vision foundation models are increasingly adapted for +critical tasks such as 3D object detection and tracking, as well as creating +realistic driving scenarios for simulation and testing. Multi-modal foundation +models, integrating diverse inputs, exhibit exceptional visual understanding +and spatial reasoning, crucial for end-to-end AD. This survey not only provides +a structured taxonomy, categorizing foundation models based on their modalities +and functionalities within the AD domain but also delves into the methods +employed in current research. It identifies the gaps between existing +foundation models and cutting-edge AD approaches, thereby charting future +research directions and proposing a roadmap for bridging these gaps. + +
+
+
+
+
+ + ♻ ☆ CyberHost: Taming Audio-driven Avatar Diffusion Model with Region + Codebook Attention + + +
+ Diffusion-based video generation technology has advanced significantly,
+catalyzing a proliferation of research in human animation. However, the
+majority of these studies are confined to same-modality driving settings, with
+cross-modality human body animation remaining relatively underexplored. In this
+paper, we introduce CyberHost, an end-to-end audio-driven human animation
+framework that ensures hand integrity, identity consistency, and natural
+motion. The key design of CyberHost is the Region Codebook Attention mechanism,
+which improves the generation quality of facial and hand animations by
+integrating fine-grained local features with learned motion pattern priors.
+Furthermore, we have developed a suite of human-prior-guided training
+strategies, including a body movement map, a hand clarity score, pose-aligned
+reference features, and local enhancement supervision, to improve synthesis
+results. To our knowledge, CyberHost is the first end-to-end audio-driven human
+diffusion model capable of facilitating zero-shot video generation within the
+scope of the human body. Extensive experiments demonstrate that CyberHost
+surpasses previous works in both quantitative and qualitative aspects.
+
+
+ comment: Homepage: https://cyberhost.github.io/ +
+
+
+
+
+ + ♻ ☆ PointCloud-Text Matching: Benchmark Datasets and a Baseline + + +
+ In this paper, we present and study a new instance-level retrieval task:
+PointCloud-Text Matching (PTM), which aims to find the exact cross-modal
+instance that matches a given point-cloud query or text query. PTM could be
+applied to various scenarios, such as indoor/urban-canyon localization and
+scene retrieval. However, there exists no suitable and targeted dataset for PTM
+in practice. Therefore, we construct three new PTM benchmark datasets, namely
+3D2T-SR, 3D2T-NR, and 3D2T-QA. We observe that the data is challenging and
+exhibits noisy correspondence due to the sparsity, noise, or disorder of point
+clouds and the ambiguity, vagueness, or incompleteness of texts, which makes
+existing cross-modal matching methods ineffective for PTM. To tackle these
+challenges, we propose a PTM baseline, named Robust PointCloud-Text Matching
+method (RoMa). RoMa consists of two modules: a Dual Attention Perception module
+(DAP) and a Robust Negative Contrastive Learning module (RNCL). Specifically,
+DAP leverages token-level and feature-level attention to adaptively focus on
+useful local and global features, and aggregates them into common
+representations, thereby reducing the adverse impact of noise and ambiguity. To
+handle noisy correspondence, RNCL divides negative pairs, which are much less
+error-prone than positive pairs, into clean and noisy subsets, and assigns them
+forward and reverse optimization directions respectively, thus enhancing
+robustness against noisy correspondence. We conduct extensive experiments on
+our benchmarks and demonstrate the superiority of our RoMa.
+
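+ A rough sketch of the clean/noisy negative split with opposite optimization
+directions. The similarity threshold and loss form below are illustrative
+assumptions, not the paper's exact RNCL formulation.
+```python
+import torch
+import torch.nn.functional as F
+
+def robust_negative_loss(pc_emb, txt_emb, margin=0.2, tau=0.7):
+    # pc_emb, txt_emb: (B, d) L2-normalized embeddings of matched pairs.
+    sim = pc_emb @ txt_emb.t()                        # (B, B) similarities
+    pos = sim.diag().unsqueeze(1)                     # positive similarities
+    neg_mask = ~torch.eye(sim.size(0), dtype=torch.bool, device=sim.device)
+    neg = sim[neg_mask].view(sim.size(0), -1)         # (B, B-1) negatives
+
+    clean = neg < tau        # likely true negatives -> push apart (forward)
+    noisy = ~clean           # suspiciously similar -> reverse direction
+    hinge = F.relu(margin + neg - pos)                # triplet-style term
+    loss_clean = (hinge * clean).sum() / clean.sum().clamp_min(1)
+    # "Noisy" negatives are optimized in the reverse direction: instead of
+    # pushing them away, encourage similarity.
+    loss_noisy = ((1.0 - neg) * noisy).sum() / noisy.sum().clamp_min(1)
+    return loss_clean + loss_noisy
+
+pc = F.normalize(torch.randn(8, 32), dim=1)
+tx = F.normalize(torch.randn(8, 32), dim=1)
+print(robust_negative_loss(pc, tx))
+```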
+
+ comment: Upon further consideration, we have concluded that the current + version requires significant revision and may not yet be ready for + publication. We plan to conduct additional experiments and make the necessary + improvements to ensure the paper meets the standards for future submission +
+
+
+
+
+ + ♻ ☆ Zero-Shot Character Identification and Speaker Prediction in Comics via + Iterative Multimodal Fusion + + +
+ Recognizing characters and predicting speakers of dialogue are critical for
+comic processing tasks, such as voice generation or translation. However,
+because characters vary by comic title, supervised learning approaches like
+training character classifiers, which require specific annotations for each
+comic title, are infeasible. This motivates us to propose a novel zero-shot
+approach, allowing machines to identify characters and predict speaker names
+based solely on unannotated comic images. In spite of their importance in
+real-world applications, these tasks have largely remained unexplored due to
+challenges in story comprehension and multimodal integration. Recent large
+language models (LLMs) have shown great capability for text understanding and
+reasoning, while their application to multimodal content analysis is still an
+open problem. To address this problem, we propose an iterative multimodal
+framework, the first to employ multimodal information for both character
+identification and speaker prediction tasks. Our experiments demonstrate the
+effectiveness of the proposed framework, establishing a robust baseline for
+these tasks. Furthermore, since our method requires no training data or
+annotations, it can be used as-is on any comic series.
+
+
+ comment: Accepted to ACM Multimedia 2024. Project page: + https://liyingxuan1012.github.io/zeroshot-speaker-prediction ; Github repo: + https://github.com/liyingxuan1012/zeroshot-speaker-prediction +
+
+
+
+
+ + ♻ ☆ Hypergraph Multi-modal Large Language Model: Exploiting EEG and + Eye-tracking Modalities to Evaluate Heterogeneous Responses for Video + Understanding + + +
+ Understanding of video creativity and content often varies among individuals, +with differences in focal points and cognitive levels across different ages, +experiences, and genders. There is currently a lack of research in this area, +and most existing benchmarks suffer from several drawbacks: 1) a limited number +of modalities and answers with restrictive length; 2) the content and scenarios +within the videos are excessively monotonous, transmitting allegories and +emotions that are overly simplistic. To bridge the gap to real-world +applications, we introduce a large-scale Subjective Response Indicators for +Advertisement Videos dataset, namely SRI-ADV. Specifically, we collected real +changes in Electroencephalographic (EEG) and eye-tracking regions from +different demographics while they viewed identical video content. Utilizing +this multi-modal dataset, we developed tasks and protocols to analyze and +evaluate the extent of cognitive understanding of video content among different +users. Along with the dataset, we designed a Hypergraph Multi-modal Large +Language Model (HMLLM) to explore the associations among different +demographics, video elements, EEG, and eye-tracking indicators. HMLLM could +bridge semantic gaps across rich modalities and integrate information beyond +different modalities to perform logical reasoning. Extensive experimental +evaluations on SRI-ADV and other additional video-based generative performance +benchmarks demonstrate the effectiveness of our method. The codes and dataset +will be released at https://github.com/mininglamp-MLLM/HMLLM. + +
+
+ comment: Accepted by ACM MULTIMEDIA 2024 +
+
+
+
+
+ + ♻ ☆ Q-Seg: Quantum Annealing-Based Unsupervised Image Segmentation + + +
+ We present Q-Seg, a novel unsupervised image segmentation method based on +quantum annealing, tailored for existing quantum hardware. We formulate the +pixel-wise segmentation problem, which assimilates spectral and spatial +information of the image, as a graph-cut optimization task. Our method +efficiently leverages the interconnected qubit topology of the D-Wave Advantage +device, offering superior scalability over existing quantum approaches and +outperforming several tested state-of-the-art classical methods. Empirical +evaluations on synthetic datasets have shown that Q-Seg has better runtime +performance than the state-of-the-art classical optimizer Gurobi. The method +has also been tested on earth observation image segmentation, a critical area +with noisy and unreliable annotations. In the era of noisy intermediate-scale +quantum, Q-Seg emerges as a reliable contender for real-world applications in +comparison to advanced techniques like Segment Anything. Consequently, Q-Seg +offers a promising solution using available quantum hardware, especially in +situations constrained by limited labeled data and the need for efficient +computational runtime. + +
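+ The graph-cut formulation above can be written as a QUBO, the form an
+annealer samples. The toy example below builds such a QUBO for a 1D "image"
+and solves it by brute force in place of quantum hardware; the similarity
+bias is an illustrative choice, not the paper's exact weighting.
+```python
+import itertools
+import numpy as np
+
+pixels = np.array([0.10, 0.15, 0.80, 0.90])      # toy 1D "image" intensities
+n = len(pixels)
+bias = 0.7                                       # shifts similarities so that
+# cutting dissimilar edges (negative weight) lowers the cost, avoiding the
+# trivial no-cut solution.
+w = {(i, i + 1): np.exp(-abs(pixels[i] - pixels[i + 1])) - bias
+     for i in range(n - 1)}
+
+# Cut cost: w_ij is paid when x_i != x_j, i.e. w_ij * (x_i + x_j - 2 x_i x_j).
+Q = np.zeros((n, n))
+for (i, j), wij in w.items():
+    Q[i, i] += wij
+    Q[j, j] += wij
+    Q[i, j] -= 2 * wij
+
+best = min(itertools.product([0, 1], repeat=n),
+           key=lambda x: np.array(x) @ Q @ np.array(x))
+print("segmentation labels:", best)    # e.g. (0, 0, 1, 1)
+```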
+
+ comment: 12 pages, 9 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Open-Vocabulary SAM3D: Towards Training-free Open-Vocabulary 3D Scene + Understanding + + +
+ Open-vocabulary 3D scene understanding presents a significant challenge in +the field. Recent works have sought to transfer knowledge embedded in +vision-language models from 2D to 3D domains. However, these approaches often +require prior knowledge from specific 3D scene datasets, limiting their +applicability in open-world scenarios. The Segment Anything Model (SAM) has +demonstrated remarkable zero-shot segmentation capabilities, prompting us to +investigate its potential for comprehending 3D scenes without training. In this +paper, we introduce OV-SAM3D, a training-free method that contains a universal +framework for understanding open-vocabulary 3D scenes. This framework is +designed to perform understanding tasks for any 3D scene without requiring +prior knowledge of the scene. Specifically, our method is composed of two key +sub-modules: First, we initiate the process by generating superpoints as the +initial 3D prompts and refine these prompts using segment masks derived from +SAM. Moreover, we then integrate a specially designed overlapping score table +with open tags from the Recognize Anything Model (RAM) to produce final 3D +instances with open-world labels. Empirical evaluations on the ScanNet200 and +nuScenes datasets demonstrate that our approach surpasses existing +open-vocabulary methods in unknown open-world environments. + +
+
+ comment: Project page: https://hithqd.github.io/projects/OV-SAM3D +
+
+
+
+
+ + ♻ ☆ An Efficient Instance Segmentation Framework Using Segmentation + Foundation Models with Oriented Bounding Box Prompts + + +
+ Instance segmentation in unmanned aerial vehicle measurement is a +long-standing challenge. Since horizontal bounding boxes introduce many +interference objects, oriented bounding boxes (OBBs) are usually used for +instance identification. However, based on ``segmentation within bounding box'' +paradigm, current instance segmentation methods using OBBs are overly dependent +on bounding box detection performance. To tackle this, this paper proposes +OBSeg, an efficient instance segmentation framework using OBBs. OBSeg is based +on box prompt-based segmentation foundation models (BSMs), e.g., Segment +Anything Model. Specifically, OBSeg first detects OBBs to distinguish instances +and provide coarse localization information. Then, it predicts OBB +prompt-related masks for fine segmentation. Since OBBs only serve as prompts, +OBSeg alleviates the over-dependence on bounding box detection performance of +current instance segmentation methods using OBBs. In addition, to enable BSMs +to handle OBB prompts, we propose a novel OBB prompt encoder. To make OBSeg +more lightweight and further improve the performance of lightweight distilled +BSMs, a Gaussian smoothing-based knowledge distillation method is introduced. +Experiments demonstrate that OBSeg outperforms current instance segmentation +methods on multiple public datasets. The code is available at +https://github.com/zhen6618/OBBInstanceSegmentation. + +
+
+
+
+
+ + ♻ ☆ Diff-IP2D: Diffusion-Based Hand-Object Interaction Prediction on + Egocentric Videos + + +
+ Understanding how humans would behave during hand-object interaction is vital
+for applications in service robot manipulation and extended reality. To achieve
+this, some recent works have been proposed to simultaneously forecast hand
+trajectories and object affordances on human egocentric videos. The joint
+prediction serves as a comprehensive representation of future hand-object
+interactions in 2D space, indicating potential human motion and motivation.
+However, the existing approaches mostly adopt the autoregressive paradigm for
+unidirectional prediction, which lacks mutual constraints within the holistic
+future sequence and accumulates errors along the time axis. Meanwhile, these
+works largely overlook the effect of camera egomotion on first-person view
+predictions. To address these limitations, we propose a novel diffusion-based
+interaction prediction method, namely Diff-IP2D, to forecast future hand
+trajectories and object affordances concurrently in an iterative
+non-autoregressive manner. We transform the sequential 2D images into latent
+feature space and design a denoising diffusion model to predict future latent
+interaction features conditioned on past ones. Motion features are further
+integrated into the conditional denoising process to make Diff-IP2D aware of
+the camera wearer's dynamics for more accurate interaction prediction.
+Extensive experiments demonstrate that our method significantly outperforms the
+state-of-the-art baselines on both the off-the-shelf metrics and our newly
+proposed evaluation protocol. This highlights the efficacy of leveraging a
+generative paradigm for 2D hand-object interaction prediction. The code of
+Diff-IP2D will be released at https://github.com/IRMVLab/Diff-IP2D.
+
+
+
+
+
+ + ♻ ☆ TransKD: Transformer Knowledge Distillation for Efficient Semantic + Segmentation + + +
+ Semantic segmentation benchmarks in the realm of autonomous driving are +dominated by large pre-trained transformers, yet their widespread adoption is +impeded by substantial computational costs and prolonged training durations. To +lift this constraint, we look at efficient semantic segmentation from a +perspective of comprehensive knowledge distillation and aim to bridge the gap +between multi-source knowledge extractions and transformer-specific patch +embeddings. We put forward the Transformer-based Knowledge Distillation +(TransKD) framework which learns compact student transformers by distilling +both feature maps and patch embeddings of large teacher transformers, bypassing +the long pre-training process and reducing the FLOPs by >85.0%. Specifically, +we propose two fundamental modules to realize feature map distillation and +patch embedding distillation, respectively: (1) Cross Selective Fusion (CSF) +enables knowledge transfer between cross-stage features via channel attention +and feature map distillation within hierarchical transformers; (2) Patch +Embedding Alignment (PEA) performs dimensional transformation within the +patchifying process to facilitate the patch embedding distillation. +Furthermore, we introduce two optimization modules to enhance the patch +embedding distillation from different perspectives: (1) Global-Local Context +Mixer (GL-Mixer) extracts both global and local information of a representative +embedding; (2) Embedding Assistant (EA) acts as an embedding method to +seamlessly bridge teacher and student models with the teacher's number of +channels. Experiments on Cityscapes, ACDC, NYUv2, and Pascal VOC2012 datasets +show that TransKD outperforms state-of-the-art distillation frameworks and +rivals the time-consuming pre-training method. The source code is publicly +available at https://github.com/RuipingL/TransKD. + +
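+ A simplified sketch of the two distillation targets, feature maps and patch
+embeddings: project the student's tensors to the teacher's dimensions and
+penalize the mismatch. TransKD's CSF and PEA modules involve more structure
+(channel attention, patchify-stage transforms); the 1x1 projections below are
+minimal stand-ins.
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SimpleDistillHead(nn.Module):
+    def __init__(self, c_student, c_teacher, d_student, d_teacher):
+        super().__init__()
+        self.feat_proj = nn.Conv2d(c_student, c_teacher, kernel_size=1)
+        self.embed_proj = nn.Linear(d_student, d_teacher)  # PEA-like alignment
+
+    def forward(self, s_feat, t_feat, s_embed, t_embed):
+        # s_feat: (B, Cs, H, W), t_feat: (B, Ct, H, W) stage feature maps
+        feat_loss = F.mse_loss(self.feat_proj(s_feat), t_feat)
+        # s_embed: (B, N, Ds), t_embed: (B, N, Dt) patch embeddings
+        embed_loss = F.mse_loss(self.embed_proj(s_embed), t_embed)
+        return feat_loss + embed_loss
+
+head = SimpleDistillHead(64, 256, 64, 256)
+loss = head(torch.randn(2, 64, 32, 32), torch.randn(2, 256, 32, 32),
+            torch.randn(2, 1024, 64), torch.randn(2, 1024, 256))
+print(loss.item())
+```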
+
+ comment: Accepted to IEEE Transactions on Intelligent Transportation Systems + (T-ITS). The source code is publicly available at + https://github.com/RuipingL/TransKD +
+
+
+
+
+ + ♻ ☆ Refusing Safe Prompts for Multi-modal Large Language Models + + +
+ Multimodal large language models (MLLMs) have become the cornerstone of +today's generative AI ecosystem, sparking intense competition among tech giants +and startups. In particular, an MLLM generates a text response given a prompt +consisting of an image and a question. While state-of-the-art MLLMs use safety +filters and alignment techniques to refuse unsafe prompts, in this work, we +introduce MLLM-Refusal, the first method that induces refusals for safe +prompts. In particular, our MLLM-Refusal optimizes a nearly-imperceptible +refusal perturbation and adds it to an image, causing target MLLMs to likely +refuse a safe prompt containing the perturbed image and a safe question. +Specifically, we formulate MLLM-Refusal as a constrained optimization problem +and propose an algorithm to solve it. Our method offers competitive advantages +for MLLM model providers by potentially disrupting user experiences of +competing MLLMs, since competing MLLM's users will receive unexpected refusals +when they unwittingly use these perturbed images in their prompts. We evaluate +MLLM-Refusal on four MLLMs across four datasets, demonstrating its +effectiveness in causing competing MLLMs to refuse safe prompts while not +affecting non-competing MLLMs. Furthermore, we explore three potential +countermeasures-adding Gaussian noise, DiffPure, and adversarial training. Our +results show that though they can mitigate MLLM-Refusal's effectiveness, they +also sacrifice the accuracy and/or efficiency of the competing MLLM. The code +is available at https://github.com/Sadcardation/MLLM-Refusal. + +
+
+
+
+
+ + ♻ ☆ Visual Prompting Upgrades Neural Network Sparsification: A Data-Model + Perspective + + +
+ The rapid development of large-scale deep learning models questions the
+affordability of hardware platforms, which necessitates pruning to reduce their
+computational and memory footprints. The resulting sparse neural networks have
+demonstrated numerous favorable benefits, such as low complexity and undamaged
+generalization. Most prominent pruning strategies are designed from a
+model-centric perspective, focusing on searching and preserving crucial weights
+by analyzing network topologies. However, the role of data and its interplay
+with model-centric pruning has remained relatively unexplored. In this
+research, we introduce a novel data-model co-design perspective: to promote
+superior weight sparsity by learning important model topology and adequate
+input data in a synergetic manner. Specifically, customized Visual Prompts are
+mounted to upgrade neural Network sparsification in our proposed VPNs
+framework. As a pioneering effort, this paper conducts systematic
+investigations into the impact of different visual prompts on model pruning and
+suggests an effective joint optimization approach. Extensive experiments with 3
+network architectures and 8 datasets evidence the substantial performance
+improvements from VPNs over existing state-of-the-art pruning algorithms.
+Furthermore, we find that subnetworks discovered by VPNs from pre-trained
+models enjoy better transferability across diverse downstream scenarios. These
+insights shed light on new promising possibilities of data-model co-designs for
+vision model sparsification.
+
+
+
+
+
+ + ♻ ☆ UnsafeBench: Benchmarking Image Safety Classifiers on Real-World and + AI-Generated Images + + +
+ With the advent of text-to-image models and concerns about their misuse, +developers are increasingly relying on image safety classifiers to moderate +their generated unsafe images. Yet, the performance of current image safety +classifiers remains unknown for both real-world and AI-generated images. In +this work, we propose UnsafeBench, a benchmarking framework that evaluates the +effectiveness and robustness of image safety classifiers, with a particular +focus on the impact of AI-generated images on their performance. First, we +curate a large dataset of 10K real-world and AI-generated images that are +annotated as safe or unsafe based on a set of 11 unsafe categories of images +(sexual, violent, hateful, etc.). Then, we evaluate the effectiveness and +robustness of five popular image safety classifiers, as well as three +classifiers that are powered by general-purpose visual language models. Our +assessment indicates that existing image safety classifiers are not +comprehensive and effective enough to mitigate the multifaceted problem of +unsafe images. Also, there exists a distribution shift between real-world and +AI-generated images in image qualities, styles, and layouts, leading to +degraded effectiveness and robustness. Motivated by these findings, we build a +comprehensive image moderation tool called PerspectiveVision, which addresses +the main drawbacks of existing classifiers with improved effectiveness and +robustness, especially on AI-generated images. UnsafeBench and +PerspectiveVision can aid the research community in better understanding the +landscape of image safety classification in the era of generative AI. + +
+
+
+
+
+ + ♻ ☆ Less is More: Fewer Interpretable Region via Submodular Subset Selection ICLR 2024 + + +
+ Image attribution algorithms aim to identify important regions that are +highly relevant to model decisions. Although existing attribution solutions can +effectively assign importance to target elements, they still face the following +challenges: 1) existing attribution methods generate inaccurate small regions +thus misleading the direction of correct attribution, and 2) the model cannot +produce good attribution results for samples with wrong predictions. To address +the above challenges, this paper re-models the above image attribution problem +as a submodular subset selection problem, aiming to enhance model +interpretability using fewer regions. To address the lack of attention to local +regions, we construct a novel submodular function to discover more accurate +small interpretation regions. To enhance the attribution effect for all +samples, we also impose four different constraints on the selection of +sub-regions, i.e., confidence, effectiveness, consistency, and collaboration +scores, to assess the importance of various subsets. Moreover, our theoretical +analysis substantiates that the proposed function is in fact submodular. +Extensive experiments show that the proposed method outperforms SOTA methods on +two face datasets (Celeb-A and VGG-Face2) and one fine-grained dataset +(CUB-200-2011). For correctly predicted samples, the proposed method improves +the Deletion and Insertion scores with an average of 4.9% and 2.5% gain +relative to HSIC-Attribution. For incorrectly predicted samples, our method +achieves gains of 81.0% and 18.4% compared to the HSIC-Attribution algorithm in +the average highest confidence and Insertion score respectively. The code is +released at https://github.com/RuoyuChen10/SMDL-Attribution. + +
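+ The optimization pattern behind the method is greedy maximization of a
+monotone submodular set function. The generic sketch below uses a toy coverage
+objective standing in for the paper's combined confidence, effectiveness,
+consistency, and collaboration score.
+```python
+# Greedy selection: repeatedly add the candidate with the largest marginal gain.
+def greedy_submodular(candidates, score_fn, k):
+    selected = []
+    for _ in range(k):
+        best_gain, best_c = 0.0, None
+        for c in candidates:
+            if c in selected:
+                continue
+            gain = score_fn(selected + [c]) - score_fn(selected)
+            if gain > best_gain:
+                best_gain, best_c = gain, c
+        if best_c is None:          # no positive marginal gain left
+            break
+        selected.append(best_c)
+    return selected
+
+# Toy example: each "sub-region" covers a set of salient pixels; the coverage
+# function |union of covered pixels| is monotone and submodular.
+regions = {"r1": {1, 2, 3}, "r2": {3, 4}, "r3": {5}, "r4": {1, 2}}
+coverage = lambda S: len(set().union(*[regions[r] for r in S])) if S else 0
+print(greedy_submodular(list(regions), coverage, k=2))   # ['r1', 'r2']
+```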
+
+ comment: Accepted to ICLR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ Continual Learning Approaches for Anomaly Detection + + +
+ Anomaly Detection is a relevant problem that arises in numerous real-world +applications, especially when dealing with images. However, there has been +little research for this task in the Continual Learning setting. In this work, +we introduce a novel approach called SCALE (SCALing is Enough) to perform +Compressed Replay in a framework for Anomaly Detection in Continual Learning +setting. The proposed technique scales and compresses the original images using +a Super Resolution model which, to the best of our knowledge, is studied for +the first time in the Continual Learning setting. SCALE can achieve a high +level of compression while maintaining a high level of image reconstruction +quality. In conjunction with other Anomaly Detection approaches, it can achieve +optimal results. To validate the proposed approach, we use a real-world dataset +of images with pixel-based anomalies, with the scope to provide a reliable +benchmark for Anomaly Detection in the context of Continual Learning, serving +as a foundation for further advancements in the field. + +
+
+
+
+
+ + ♻ ☆ RadCLIP: Enhancing Radiologic Image Analysis through Contrastive + Language-Image Pre-training + + +
+ The integration of artificial intelligence (AI) with radiology marks a
+transformative era in medicine. Vision foundation models have been adopted to
+enhance radiologic imaging analysis. However, the distinct complexities of 2D
+and 3D radiologic data pose unique challenges that existing models, pre-trained
+on general non-medical images, fail to address adequately. To bridge this gap
+and capitalize on the diagnostic precision required in radiologic imaging, we
+introduce Radiologic Contrastive Language-Image Pre-training (RadCLIP): a
+cross-modal vision-language foundational model that harnesses the Vision
+Language Pre-training (VLP) framework to improve radiologic image analysis.
+Building upon Contrastive Language-Image Pre-training (CLIP), RadCLIP
+incorporates a slice pooling mechanism tailored for volumetric image analysis
+and is pre-trained using a large and diverse dataset of radiologic image-text
+pairs. RadCLIP was pre-trained to effectively align radiologic images with
+their corresponding text annotations, creating a robust vision backbone for
+radiologic images. Extensive experiments demonstrate RadCLIP's superior
+performance in both uni-modal radiologic image classification and cross-modal
+image-text matching, highlighting its significant promise for improving
+diagnostic accuracy and efficiency in clinical settings. Our key contributions
+include curating a large dataset of diverse 2D/3D radiologic image-text pairs,
+a slice pooling adapter using an attention mechanism for integrating 2D images,
+and comprehensive evaluations of RadCLIP on various radiologic downstream
+tasks.
+
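+ A hedged sketch of attention-based slice pooling: per-slice 2D embeddings of
+a volume are aggregated into a single vector with a learned query. RadCLIP's
+actual adapter may differ in its details.
+```python
+import torch
+import torch.nn as nn
+
+class SlicePooling(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.query = nn.Parameter(torch.randn(1, 1, dim))
+        self.attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
+
+    def forward(self, slice_embeds):
+        # slice_embeds: (B, num_slices, dim) from a 2D image encoder
+        q = self.query.expand(slice_embeds.size(0), -1, -1)
+        pooled, _ = self.attn(q, slice_embeds, slice_embeds)
+        return pooled.squeeze(1)                 # (B, dim) volume embedding
+
+pool = SlicePooling(dim=512)
+print(pool(torch.randn(2, 64, 512)).shape)       # torch.Size([2, 512])
+```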
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild + + +
+ The increasing availability of real-world conversation data offers exciting +opportunities for researchers to study user-chatbot interactions. However, the +sheer volume of this data makes manually examining individual conversations +impractical. To overcome this challenge, we introduce WildVis, an interactive +tool that enables fast, versatile, and large-scale conversation analysis. +WildVis provides search and visualization capabilities in the text and +embedding spaces based on a list of criteria. To manage million-scale datasets, +we implemented optimizations including search index construction, embedding +precomputation and compression, and caching to ensure responsive user +interactions within seconds. We demonstrate WildVis's utility through three +case studies: facilitating chatbot misuse research, visualizing and comparing +topic distributions across datasets, and characterizing user-specific +conversation patterns. WildVis is open-source and designed to be extendable, +supporting additional datasets and customized search and visualization +functionalities. + +
+
+
+
+
+ + ☆ RAG based Question-Answering for Contextual Response Prediction System CIKM'24 + + +
+ Large Language Models (LLMs) have shown versatility in various Natural +Language Processing (NLP) tasks, including their potential as effective +question-answering systems. However, to provide precise and relevant +information in response to specific customer queries in industry settings, LLMs +require access to a comprehensive knowledge base to avoid hallucinations. +Retrieval Augmented Generation (RAG) emerges as a promising technique to +address this challenge. Yet, developing an accurate question-answering +framework for real-world applications using RAG entails several challenges: 1) +data availability issues, 2) evaluating the quality of generated content, and +3) the costly nature of human evaluation. In this paper, we introduce an +end-to-end framework that employs LLMs with RAG capabilities for industry use +cases. Given a customer query, the proposed system retrieves relevant knowledge +documents and leverages them, along with previous chat history, to generate +response suggestions for customer service agents in the contact centers of a +major retail company. Through comprehensive automated and human evaluations, we +show that this solution outperforms the current BERT-based algorithms in +accuracy and relevance. Our findings suggest that RAG-based LLMs can be an +excellent support to human customer service representatives by lightening their +workload. + +
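+ A minimal, generic sketch of the retrieve-then-generate flow described above
+(not the paper's production system): retrieve top-k knowledge snippets for the
+customer query and assemble a prompt. Here TF-IDF stands in for the retriever,
+and call_llm is a hypothetical placeholder for whatever LLM API is used.
+```python
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+docs = [
+    "Refunds are processed within 5-7 business days.",
+    "Orders can be cancelled within 30 minutes of purchase.",
+    "Store hours are 9am to 9pm, Monday through Saturday.",
+]
+
+def retrieve(query, k=2):
+    vec = TfidfVectorizer().fit(docs + [query])
+    sims = cosine_similarity(vec.transform([query]), vec.transform(docs))[0]
+    return [docs[i] for i in sims.argsort()[::-1][:k]]
+
+def build_prompt(query, history):
+    context = "\n".join(retrieve(query))
+    return (f"Knowledge:\n{context}\n\nChat history:\n{history}\n\n"
+            f"Customer: {query}\nSuggested agent response:")
+
+print(build_prompt("How long do refunds take?", "Customer: Hi"))
+# suggestion = call_llm(build_prompt(...))   # hypothetical LLM call
+```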
+
+ comment: Accepted at the 1st Workshop on GenAI and RAG Systems for Enterprise, + CIKM'24. 6 pages +
+
+
+
+
+ + ☆ HGAMN: Heterogeneous Graph Attention Matching Network for Multilingual + POI Retrieval at Baidu Maps KDD'21 + + +
+ The increasing interest in international travel has raised the demand for
+retrieving points of interest (POIs) in multiple languages. This is especially
+important for finding local venues, such as restaurants and scenic spots, in
+unfamiliar languages when traveling abroad. Multilingual POI retrieval,
+enabling users to find desired POIs in a desired language using queries in
+numerous languages, has become an indispensable feature of today's global map
+applications such as Baidu Maps. This task is non-trivial because of two key
+challenges: (1) visiting sparsity and (2) multilingual query-POI matching. To
+this end, we propose a Heterogeneous Graph Attention Matching Network (HGAMN)
+to concurrently address both challenges. Specifically, we construct a
+heterogeneous graph that contains two types of nodes, POI nodes and query
+nodes, using the search logs of Baidu Maps. To alleviate challenge #1, we
+construct edges between different POI nodes to link the low-frequency POIs with
+the high-frequency ones, which enables the transfer of knowledge from the
+latter to the former. To mitigate challenge #2, we construct edges between POI
+and query nodes based on the co-occurrences between queries and POIs, where
+queries in different languages and formulations can be aggregated for
+individual POIs. Moreover, we develop an attention-based network to jointly
+learn node representations of the heterogeneous graph and further design a
+cross-attention module to fuse the representations of both types of nodes for
+query-POI relevance scoring. Extensive experiments conducted on large-scale
+real-world datasets from Baidu Maps demonstrate the superiority and
+effectiveness of HGAMN. In addition, HGAMN has already been deployed in
+production at Baidu Maps, and it successfully keeps serving hundreds of
+millions of requests every day.
+
+
+ comment: Accepted by KDD'21 +
+
+
+
+
+ + ☆ MOBIUS: Towards the Next Generation of Query-Ad Matching in Baidu's + Sponsored Search KDD'19 + + +
+ Baidu runs the largest commercial web search engine in China, serving +hundreds of millions of online users every day in response to a great variety +of queries. In order to build a high-efficiency sponsored search engine, we +used to adopt a three-layer funnel-shaped structure to screen and sort hundreds +of ads from billions of ad candidates subject to the requirement of low +response latency and the restraints of computing resources. Given a user query, +the top matching layer is responsible for providing semantically relevant ad +candidates to the next layer, while the ranking layer at the bottom concerns +more about business indicators (e.g., CPM, ROI, etc.) of those ads. The clear +separation between the matching and ranking objectives results in a lower +commercial return. The Mobius project has been established to address this +serious issue. It is our first attempt to train the matching layer to consider +CPM as an additional optimization objective besides the query-ad relevance, via +directly predicting CTR (click-through rate) from billions of query-ad pairs. +Specifically, this paper will elaborate on how we adopt active learning to +overcome the insufficiency of click history at the matching layer when training +our neural click networks offline, and how we use the SOTA ANN search technique +for retrieving ads more efficiently (Here ``ANN'' stands for approximate +nearest neighbor search). We contribute the solutions to Mobius-V1 as the first +version of our next generation query-ad matching system. + +
+
+ comment: Accepted by KDD'19 +
+
+
+
+
+ + ☆ Federated Prototype-based Contrastive Learning for Privacy-Preserving + Cross-domain Recommendation + + +
+ Cross-domain recommendation (CDR) aims to improve recommendation accuracy in +sparse domains by transferring knowledge from data-rich domains. However, +existing CDR methods often assume the availability of user-item interaction +data across domains, overlooking user privacy concerns. Furthermore, these +methods suffer from performance degradation in scenarios with sparse +overlapping users, as they typically depend on a large number of fully shared +users for effective knowledge transfer. To address these challenges, we propose +a Federated Prototype-based Contrastive Learning (CL) method for +Privacy-Preserving CDR, named FedPCL-CDR. This approach utilizes +non-overlapping user information and prototypes to improve multi-domain +performance while protecting user privacy. FedPCL-CDR comprises two modules: +local domain (client) learning and global server aggregation. In the local +domain, FedPCL-CDR clusters all user data to learn representative prototypes, +effectively utilizing non-overlapping user information and addressing the +sparse overlapping user issue. It then facilitates knowledge transfer by +employing both local and global prototypes returned from the server in a CL +manner. Simultaneously, the global server aggregates representative prototypes +from local domains to learn both local and global prototypes. The combination +of prototypes and federated learning (FL) ensures that sensitive user data +remains decentralized, with only prototypes being shared across domains, +thereby protecting user privacy. Extensive experiments on four CDR tasks using +two real-world datasets demonstrate that FedPCL-CDR outperforms the +state-of-the-art baselines. + +
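+ A hedged sketch of the prototype exchange only: each client clusters its
+private user embeddings into prototypes and shares just those with the server,
+which aggregates them into a global set. The contrastive losses and the
+CDR-specific model details are omitted.
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+
+def local_prototypes(user_embeddings, n_prototypes=4, seed=0):
+    # Runs entirely on the client; raw embeddings never leave the device.
+    km = KMeans(n_clusters=n_prototypes, random_state=seed, n_init=10)
+    km.fit(user_embeddings)
+    return km.cluster_centers_                 # only these are shared
+
+def server_aggregate(prototype_lists):
+    # One simple choice: stack client prototypes into a global prototype set.
+    return np.vstack(prototype_lists)
+
+clients = [np.random.rand(100, 16) for _ in range(3)]   # private embeddings
+global_protos = server_aggregate([local_prototypes(c) for c in clients])
+print(global_protos.shape)                               # (12, 16)
+```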
+
+
+
+
+ + ☆ iText2KG: Incremental Knowledge Graphs Construction Using Large Language + Models + + +
+ Most available data is unstructured, making it challenging to access valuable +information. Automatically building Knowledge Graphs (KGs) is crucial for +structuring data and making it accessible, allowing users to search for +information effectively. KGs also facilitate insights, inference, and +reasoning. Traditional NLP methods, such as named entity recognition and +relation extraction, are key in information retrieval but face limitations, +including the use of predefined entity types and the need for supervised +learning. Current research leverages large language models' capabilities, such +as zero- or few-shot learning. However, unresolved and semantically duplicated +entities and relations still pose challenges, leading to inconsistent graphs +and requiring extensive post-processing. Additionally, most approaches are +topic-dependent. In this paper, we propose iText2KG, a method for incremental, +topic-independent KG construction without post-processing. This plug-and-play, +zero-shot method is applicable across a wide range of KG construction scenarios +and comprises four modules: Document Distiller, Incremental Entity Extractor, +Incremental Relation Extractor, and Graph Integrator and Visualization. Our +method demonstrates superior performance compared to baseline methods across +three scenarios: converting scientific papers to graphs, websites to graphs, +and CVs to graphs. + +
+
+ comment: Accepted at The International Web Information Systems Engineering + conference (the WISE conference) 2024 +
+
+
+
+
+ + ☆ GraphEx: A Graph-based Extraction Method for Advertiser Keyphrase + Recommendation + + +
+ Online sellers and advertisers are recommended keyphrases for their listed +products, which they bid on to enhance their sales. One popular paradigm that +generates such recommendations is Extreme Multi-Label Classification (XMC), +which involves tagging/mapping keyphrases to items. We outline the limitations +of using traditional item-query based tagging or mapping techniques for +keyphrase recommendations on E-Commerce platforms. We introduce GraphEx, an +innovative graph-based approach that recommends keyphrases to sellers using +extraction of token permutations from item titles. Additionally, we demonstrate +that relying on traditional metrics such as precision/recall can be misleading +in practical applications, thereby necessitating a combination of metrics to +evaluate performance in real-world scenarios. These metrics are designed to +assess the relevance of keyphrases to items and the potential for buyer +outreach. GraphEx outperforms production models at eBay, achieving the +objectives mentioned above. It supports near real-time inferencing in +resource-constrained production environments and scales effectively for +billions of items. + +
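+ A toy sketch of the candidate-generation idea only: enumerate short token
+permutations of an item title as potential keyphrases. GraphEx's actual graph
+construction and ranking are not reproduced here.
+```python
+from itertools import permutations
+
+def candidate_keyphrases(title, max_len=2):
+    # Generate keyphrase candidates as permutations of up to max_len title tokens.
+    tokens = [t.lower() for t in title.split()]
+    cands = set()
+    for n in range(1, max_len + 1):
+        for combo in permutations(tokens, n):
+            cands.add(" ".join(combo))
+    return sorted(cands)
+
+print(candidate_keyphrases("Wireless Bluetooth Headphones"))
+```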
+
+
+
+
+ + ☆ RETAIN: Interactive Tool for Regression Testing Guided LLM Migration + + +
+ Large Language Models (LLMs) are increasingly integrated into diverse +applications. The rapid evolution of LLMs presents opportunities for developers +to enhance applications continuously. However, this constant adaptation can +also lead to performance regressions during model migrations. While several +interactive tools have been proposed to streamline the complexity of prompt +engineering, few address the specific requirements of regression testing for +LLM Migrations. To bridge this gap, we introduce RETAIN (REgression Testing +guided LLM migrAtIoN), a tool designed explicitly for regression testing in LLM +Migrations. RETAIN comprises two key components: an interactive interface +tailored to regression testing needs during LLM migrations, and an error +discovery module that facilitates understanding of differences in model +behaviors. The error discovery module generates textual descriptions of various +errors or differences between model outputs, providing actionable insights for +prompt refinement. Our automatic evaluation and empirical user studies +demonstrate that RETAIN, when compared to manual evaluation, enabled +participants to identify twice as many errors, facilitated experimentation with +75% more prompts, and achieves 12% higher metric scores in a given time frame. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Understanding Fairness Metrics in Recommender Systems: A Healthcare + Perspective + + +
+ Fairness in AI-driven decision-making systems has become a critical concern, +especially when these systems directly affect human lives. This paper explores +the public's comprehension of fairness in healthcare recommendations. We +conducted a survey where participants selected from four fairness metrics -- +Demographic Parity, Equal Accuracy, Equalized Odds, and Positive Predictive +Value -- across different healthcare scenarios to assess their understanding of +these concepts. Our findings reveal that fairness is a complex and often +misunderstood concept, with a generally low level of public understanding +regarding fairness metrics in recommender systems. This study highlights the +need for enhanced information and education on algorithmic fairness to support +informed decision-making in using these systems. Furthermore, the results +suggest that a one-size-fits-all approach to fairness may be insufficient, +pointing to the importance of context-sensitive designs in developing equitable +AI systems. + +
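+ For concreteness, the four surveyed metrics computed per group on toy
+predictions, using their standard definitions (binary labels y, predictions p,
+group ids g); the numbers are illustrative only.
+```python
+import numpy as np
+
+y = np.array([1, 0, 1, 1, 0, 0, 1, 0])   # true outcomes
+p = np.array([1, 0, 1, 0, 0, 1, 1, 0])   # model predictions
+g = np.array([0, 0, 0, 0, 1, 1, 1, 1])   # group membership
+
+def per_group(metric):
+    return {grp: metric(y[g == grp], p[g == grp]) for grp in np.unique(g)}
+
+demographic_parity = per_group(lambda yt, yp: yp.mean())            # P(pred=1)
+equal_accuracy     = per_group(lambda yt, yp: (yt == yp).mean())    # accuracy
+equalized_odds     = per_group(lambda yt, yp: (yp[yt == 1].mean(),  # TPR
+                                               yp[yt == 0].mean())) # FPR
+ppv                = per_group(lambda yt, yp: yt[yp == 1].mean())   # precision
+
+print(demographic_parity, equal_accuracy, equalized_odds, ppv)
+```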
+
+ comment: Accepted to the 18th ACM Conference on Recommender Systems +
+
+
+
+
+ + ♻ ☆ Pooling And Attention: What Are Effective Designs For LLM-Based + Embedding Models? + + +
+ The significant advancements of Large Language Models (LLMs) in generative +tasks have led to a growing body of work exploring LLM-based embedding models. +While these models, employing different pooling and attention strategies, have +achieved state-of-the-art performance on public embedding benchmarks, questions +still arise about what constitutes an effective design for LLM-based embedding +models. However, these models are often trained on different datasets, using +different LLM base models or training settings. Moreover, evaluations on public +embedding benchmarks often fail to report statistical significance, making it +difficult to determine which designs truly contribute to final performance. +This complicates the process for practitioners seeking optimal training recipes +for LLM-based embedding models. In this study, we conduct a large-scale +experiment by training a series of LLM-based embedding models using the same +training data and base model but differing in their pooling and attention +strategies. The results show that there is no one-size-fits-all solution: while +bidirectional attention and an additional trainable pooling layer outperform in +text similarity and information retrieval tasks, they do not significantly +surpass simpler designs like EOS-last token pooling and default causal +attention in clustering and classification tasks. Furthermore, we propose a new +pooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs +of all hidden layers, rather than just the last layer, using a cross-attention +network. This method proves to be statistically superior in text similarity and +retrieval tasks compared to existing pooling methods. Overall, this paper sheds +light on effective training strategies for LLM-based embedding models. + +
+
+ comment: https://github.com/yixuantt/PoolingAndAttn +
+
+
+
+
+ + ♻ ☆ Behavior-Dependent Linear Recurrent Units for Efficient Sequential + Recommendation CIKM 2024 + + +
+ Sequential recommender systems aim to predict users' next interactions
+through user behavior modeling with various operators such as RNNs and
+attention mechanisms. However, existing models generally fail to achieve the
+three golden principles for sequential recommendation simultaneously, i.e.,
+training efficiency, low-cost inference, and strong performance. To this end,
+we propose RecBLR, an Efficient Sequential Recommendation Model based on
+Behavior-Dependent Linear Recurrent Units to accomplish the impossible triangle
+of the three principles. By incorporating gating mechanisms and
+behavior-dependent designs into linear recurrent units, our model significantly
+enhances user behavior modeling and recommendation performance. Furthermore, we
+unlock parallelizable training as well as inference efficiency for our model by
+designing a hardware-aware scanning acceleration algorithm with a customized
+CUDA kernel. Extensive experiments on real-world datasets with varying lengths
+of user behavior sequences demonstrate RecBLR's remarkable effectiveness in
+simultaneously achieving all three golden principles - strong recommendation
+performance, training efficiency, and low-cost inference, while exhibiting
+excellent scalability to datasets with long user interaction histories.
+
+
+
+ comment: Accepted to CIKM 2024 +
+
+
+
+
+
+
+
+ + Machine Learning 160 + +
+
+
+ + ☆ Lexicon3D: Probing Visual Foundation Models for Complex 3D Scene + Understanding + + +
+ Complex 3D scene understanding has gained increasing attention, with scene +encoding strategies playing a crucial role in this success. However, the +optimal scene encoding strategies for various scenarios remain unclear, +particularly compared to their image-based counterparts. To address this issue, +we present a comprehensive study that probes various visual encoding models for +3D scene understanding, identifying the strengths and limitations of each model +across different scenarios. Our evaluation spans seven vision foundation +encoders, including image-based, video-based, and 3D foundation models. We +evaluate these models in four tasks: Vision-Language Scene Reasoning, Visual +Grounding, Segmentation, and Registration, each focusing on different aspects +of scene understanding. Our evaluations yield key findings: DINOv2 demonstrates +superior performance, video models excel in object-level tasks, diffusion +models benefit geometric tasks, and language-pretrained models show unexpected +limitations in language-related tasks. These insights challenge some +conventional understandings, provide novel perspectives on leveraging visual +foundation models, and highlight the need for more flexible encoder selection +in future vision-language and scene-understanding tasks. + +
+
+ comment: Project page: https://yunzeman.github.io/lexicon3d , Github: + https://github.com/YunzeMan/Lexicon3D +
+
+
+
+
+ + ☆ WildVis: Open Source Visualizer for Million-Scale Chat Logs in the Wild + + +
+ The increasing availability of real-world conversation data offers exciting +opportunities for researchers to study user-chatbot interactions. However, the +sheer volume of this data makes manually examining individual conversations +impractical. To overcome this challenge, we introduce WildVis, an interactive +tool that enables fast, versatile, and large-scale conversation analysis. +WildVis provides search and visualization capabilities in the text and +embedding spaces based on a list of criteria. To manage million-scale datasets, +we implemented optimizations including search index construction, embedding +precomputation and compression, and caching to ensure responsive user +interactions within seconds. We demonstrate WildVis's utility through three +case studies: facilitating chatbot misuse research, visualizing and comparing +topic distributions across datasets, and characterizing user-specific +conversation patterns. WildVis is open-source and designed to be extendable, +supporting additional datasets and customized search and visualization +functionalities. + +
+
+
+
+
+ + ☆ Dynamics of Supervised and Reinforcement Learning in the Non-Linear + Perceptron + + +
+ The ability of a brain or a neural network to efficiently learn depends +crucially on both the task structure and the learning rule. Previous works have +analyzed the dynamical equations describing learning in the relatively +simplified context of the perceptron under assumptions of a student-teacher +framework or a linearized output. While these assumptions have facilitated +theoretical understanding, they have precluded a detailed understanding of the +roles of the nonlinearity and input-data distribution in determining the +learning dynamics, limiting the applicability of the theories to real +biological or artificial neural networks. Here, we use a stochastic-process +approach to derive flow equations describing learning, applying this framework +to the case of a nonlinear perceptron performing binary classification. We +characterize the effects of the learning rule (supervised or reinforcement +learning, SL/RL) and input-data distribution on the perceptron's learning curve +and the forgetting curve as subsequent tasks are learned. In particular, we +find that the input-data noise differently affects the learning speed under SL +vs. RL, as well as determines how quickly learning of a task is overwritten by +subsequent learning. Additionally, we verify our approach with real data using +the MNIST dataset. This approach points a way toward analyzing learning +dynamics for more-complex circuit architectures. + +
+
+
+
+
+ + ☆ Understanding Data Importance in Machine Learning Attacks: Does Valuable + Data Pose Greater Harm? NDSS + + +
+ Machine learning has revolutionized numerous domains, playing a crucial role +in driving advancements and enabling data-centric processes. The significance +of data in training models and shaping their performance cannot be overstated. +Recent research has highlighted the heterogeneous impact of individual data +samples, particularly the presence of valuable data that significantly +contributes to the utility and effectiveness of machine learning models. +However, a critical question remains unanswered: are these valuable data +samples more vulnerable to machine learning attacks? In this work, we +investigate the relationship between data importance and machine learning +attacks by analyzing five distinct attack types. Our findings reveal notable +insights. For example, we observe that high importance data samples exhibit +increased vulnerability in certain attacks, such as membership inference and +model stealing. By analyzing the linkage between membership inference +vulnerability and data importance, we demonstrate that sample characteristics +can be integrated into membership metrics by introducing sample-specific +criteria, therefore enhancing the membership inference performance. These +findings emphasize the urgent need for innovative defense mechanisms that +strike a balance between maximizing utility and safeguarding valuable data +against potential exploitation. + +
+
+ comment: To Appear in Network and Distributed System Security (NDSS) Symposium + 2025 +
+
+
+
+
+ + ☆ Differentiable Discrete Event Simulation for Queuing Network Control + + +
+ Queuing network control is essential for managing congestion in
+job-processing systems such as service systems, communication networks, and
+manufacturing processes. Despite growing interest in applying reinforcement
+learning (RL) techniques, queueing network control poses distinct challenges,
+including high stochasticity, large state and action spaces, and lack of
+stability. To tackle these challenges, we propose a scalable framework for
+policy optimization based on differentiable discrete event simulation. Our main
+insight is that by implementing a well-designed smoothing technique for
+discrete event dynamics, we can compute pathwise policy gradients for
+large-scale queueing networks using auto-differentiation software (e.g.,
+TensorFlow, PyTorch) and GPU parallelization. Through extensive empirical
+experiments, we observe that our policy gradient estimators are several orders
+of magnitude more accurate than typical REINFORCE-based estimators. In
+addition, we propose a new policy architecture, which drastically improves
+stability while maintaining the flexibility of neural-network policies. In a
+wide variety of scheduling and admission control tasks, we demonstrate that
+training control policies with pathwise gradients leads to a 50-1000x
+improvement in sample efficiency over state-of-the-art RL methods. Unlike prior
+tailored approaches to queueing, our methods can flexibly handle realistic
+scenarios, including systems operating in non-stationary environments and those
+with non-exponential interarrival/service times.
+
+
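+ As an illustration of the core idea only (not the paper's framework), the
+sketch below smooths the max(0, .) in a single-queue Lindley recursion with
+softplus so that PyTorch autodiff yields a pathwise gradient of the average
+waiting time with respect to a service-rate parameter; the arrival rate,
+horizon, and smoothing sharpness are arbitrary assumptions.
+```python
+import torch
+
+torch.manual_seed(0)
+
+# Pathwise gradient through a smoothed G/G/1 waiting-time recursion:
+#   W_{t+1} = max(0, W_t + S_t - A_t)  ->  softplus(W_t + S_t - A_t)
+log_rate = torch.tensor(0.0, requires_grad=True)  # service rate = exp(log_rate)
+T, beta = 1000, 20.0                              # horizon, smoothing sharpness
+
+arrivals = torch.distributions.Exponential(0.8).sample((T,))  # interarrival times
+u = torch.rand(T)                                             # reparameterized noise
+services = -torch.log(u) / torch.exp(log_rate)                # Exp(rate) via inverse CDF
+
+w, waits = torch.tensor(0.0), []
+for t in range(T):
+    w = torch.nn.functional.softplus(w + services[t] - arrivals[t], beta=beta)
+    waits.append(w)
+
+avg_wait = torch.stack(waits).mean()
+avg_wait.backward()   # pathwise derivative w.r.t. the service-rate parameter
+print(avg_wait.item(), log_rate.grad.item())
+```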
+
+
+
+
+ + ☆ LLM-CI: Assessing Contextual Integrity Norms in Language Models + + +
+ Large language models (LLMs), while memorizing parts of their training data +scraped from the Internet, may also inadvertently encode societal preferences +and norms. As these models are integrated into sociotechnical systems, it is +crucial that the norms they encode align with societal expectations. These +norms could vary across models, hyperparameters, optimization techniques, and +datasets. This is especially challenging due to prompt sensitivity$-$small +variations in prompts yield different responses, rendering existing assessment +methodologies unreliable. There is a need for a comprehensive framework +covering various models, optimization, and datasets, along with a reliable +methodology to assess encoded norms. + We present LLM-CI, the first open-sourced framework to assess privacy norms +encoded in LLMs. LLM-CI uses a Contextual Integrity-based factorial vignette +methodology to assess the encoded norms across different contexts and LLMs. We +propose the multi-prompt assessment methodology to address prompt sensitivity +by assessing the norms from only the prompts that yield consistent responses +across multiple variants. Using LLM-CI and our proposed methodology, we +comprehensively evaluate LLMs using IoT and COPPA vignettes datasets from prior +work, examining the impact of model properties (e.g., hyperparameters, +capacity) and optimization strategies (e.g., alignment, quantization). + +
+
+ comment: 20 pages, 8 Figures, 4 Tables +
+
+
+
+
+ + ☆ Safety vs. Performance: How Multi-Objective Learning Reduces Barriers to + Market Entry + + +
+ Emerging marketplaces for large language models and other large-scale machine +learning (ML) models appear to exhibit market concentration, which has raised +concerns about whether there are insurmountable barriers to entry in such +markets. In this work, we study this issue from both an economic and an +algorithmic point of view, focusing on a phenomenon that reduces barriers to +entry. Specifically, an incumbent company risks reputational damage unless its +model is sufficiently aligned with safety objectives, whereas a new company can +more easily avoid reputational damage. To study this issue formally, we define +a multi-objective high-dimensional regression framework that captures +reputational damage, and we characterize the number of data points that a new +company needs to enter the market. Our results demonstrate how multi-objective +considerations can fundamentally reduce barriers to entry -- the required +number of data points can be significantly smaller than the incumbent company's +dataset size. En route to proving these results, we develop scaling laws for +high-dimensional linear regression in multi-objective environments, showing +that the scaling rate becomes slower when the dataset size is large, which +could be of independent interest. + +
+
+
+
+
+ + ☆ Planning In Natural Language Improves LLM Search For Code Generation + + +
+ While scaling training compute has led to remarkable improvements in large +language models (LLMs), scaling inference compute has not yet yielded analogous +gains. We hypothesize that a core missing component is a lack of diverse LLM +outputs, leading to inefficient search due to models repeatedly sampling highly +similar, yet incorrect generations. We empirically demonstrate that this lack +of diversity can be mitigated by searching over candidate plans for solving a +problem in natural language. Based on this insight, we propose PLANSEARCH, a +novel search algorithm which shows strong results across HumanEval+, MBPP+, and +LiveCodeBench (a contamination-free benchmark for competitive coding). +PLANSEARCH generates a diverse set of observations about the problem and then +uses these observations to construct plans for solving the problem. By +searching over plans in natural language rather than directly over code +solutions, PLANSEARCH explores a significantly more diverse range of potential +solutions compared to baseline search methods. Using PLANSEARCH on top of +Claude 3.5 Sonnet achieves a state-of-the-art pass@200 of 77.0% on +LiveCodeBench, outperforming both the best score achieved without search +(pass@1 = 41.4%) and using standard repeated sampling (pass@200 = 60.6%). +Finally, we show that, across all models, search algorithms, and benchmarks +analyzed, we can accurately predict performance gains due to search as a direct +function of the diversity over generated ideas. + +
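+ A schematic sketch of the plan-level search loop described above; the llm
+helper is hypothetical and must be backed by an actual completion client, and
+the prompts and sample counts are illustrative, not the paper's exact
+procedure.
+```python
+import random
+
+def llm(prompt: str) -> str:
+    """Hypothetical LLM call; replace with any chat-completion client."""
+    raise NotImplementedError
+
+def plan_search(problem: str, n_obs: int = 4, n_plans: int = 8) -> list:
+    # 1) Sample diverse natural-language observations about the problem.
+    observations = [llm(f"State one non-obvious observation about:\n{problem}")
+                    for _ in range(n_obs)]
+    # 2) Combine subsets of observations into candidate plans.
+    plans = []
+    for _ in range(n_plans):
+        subset = random.sample(observations, k=min(2, len(observations)))
+        plans.append(llm("Write a step-by-step plan using these observations:\n"
+                         + "\n".join(subset) + f"\nProblem:\n{problem}"))
+    # 3) Only then translate each diverse plan into code to be tested.
+    return [llm(f"Implement this plan as a Python function:\n{p}") for p in plans]
+```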
+
+
+
+
+ + ☆ A Deep Generative Learning Approach for Two-stage Adaptive Robust + Optimization + + +
+ Two-stage adaptive robust optimization is a powerful approach for planning +under uncertainty that aims to balance costs of "here-and-now" first-stage +decisions with those of "wait-and-see" recourse decisions made after +uncertainty is realized. To embed robustness against uncertainty, modelers +typically assume a simple polyhedral or ellipsoidal set over which +contingencies may be realized. However, these simple uncertainty sets tend to +yield highly conservative decision-making when uncertainties are +high-dimensional. In this work, we introduce AGRO, a column-and-constraint +generation algorithm that performs adversarial generation for two-stage +adaptive robust optimization using a variational autoencoder. AGRO identifies +realistic and cost-maximizing contingencies by optimizing over spherical +uncertainty sets in a latent space using a projected gradient ascent approach +that differentiates the optimal recourse cost with respect to the latent +variable. To demonstrate the cost- and time-efficiency of our approach +experimentally, we apply AGRO to an adaptive robust capacity expansion problem +for a regional power system and show that AGRO is able to reduce costs by up to +7.8% and runtimes by up to 77% in comparison to the conventional +column-and-constraint generation algorithm. + +
+
+
+
+
+ + ☆ Iterative thresholding for non-linear learning in the strong + $\varepsilon$-contamination model + + +
+ We derive approximation bounds for learning single neuron models using +thresholded gradient descent when both the labels and the covariates are +possibly corrupted adversarially. We assume the data follows the model $y = +\sigma(\mathbf{w}^{*} \cdot \mathbf{x}) + \xi,$ where $\sigma$ is a nonlinear +activation function, the noise $\xi$ is Gaussian, and the covariate vector +$\mathbf{x}$ is sampled from a sub-Gaussian distribution. We study sigmoidal, +leaky-ReLU, and ReLU activation functions and derive a +$O(\nu\sqrt{\epsilon\log(1/\epsilon)})$ approximation bound in $\ell_{2}$-norm, +with sample complexity $O(d/\epsilon)$ and failure probability +$e^{-\Omega(d)}$. + We also study the linear regression problem, where $\sigma(\mathbf{x}) = +\mathbf{x}$. We derive a $O(\nu\epsilon\log(1/\epsilon))$ approximation bound, +improving upon the previous $O(\nu)$ approximation bounds for the +gradient-descent based iterative thresholding algorithms of Bhatia et al. +(NeurIPS 2015) and Shen and Sanghavi (ICML 2019). Our algorithm has a +$O(\textrm{polylog}(N,d)\log(R/\epsilon))$ runtime complexity when +$\|\mathbf{w}^{*}\|_2 \leq R$, improving upon the +$O(\text{polylog}(N,d)/\epsilon^2)$ runtime complexity of Awasthi et al. +(NeurIPS 2022). + +
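+ For intuition only, the snippet below implements a generic trimmed
+(thresholded) gradient-descent loop for the linear case $\sigma(z)=z$: at each
+step the samples with the largest residuals are treated as corruptions and
+excluded from the gradient. This is a simplified sketch in the spirit of such
+algorithms, not the paper's exact procedure or constants.
+```python
+import numpy as np
+
+def thresholded_gd(X, y, corrupt_frac=0.1, lr=0.5, iters=200):
+    """Gradient descent that, at each step, drops the k samples with the
+    largest residuals (suspected corruptions) before forming the gradient."""
+    n, d = X.shape
+    k = int(corrupt_frac * n)
+    w = np.zeros(d)
+    for _ in range(iters):
+        r = X @ w - y
+        keep = np.argsort(np.abs(r))[: n - k]     # trim the k largest residuals
+        w -= lr * X[keep].T @ r[keep] / len(keep)
+    return w
+
+rng = np.random.default_rng(0)
+n, d = 500, 5
+X = rng.standard_normal((n, d))
+w_star = rng.standard_normal(d)
+y = X @ w_star + 0.1 * rng.standard_normal(n)
+y[:25] += 20.0                                    # adversarial label corruption
+print(np.linalg.norm(thresholded_gd(X, y) - w_star))
+```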
+
+ comment: 35 pages +
+
+
+
+
+ + ☆ Classification and Prediction of Heart Diseases using Machine Learning + Algorithms + + +
+ Heart disease is a serious worldwide health issue because it claims the lives
+of many people who might have been treated if the disease had been identified
+earlier. The leading cause of death in the world is cardiovascular disease,
+usually referred to as heart disease. Creating reliable, effective, and precise
+predictions for these diseases is one of the biggest issues facing the medical
+world today. Although there are tools for predicting heart diseases, they are
+either expensive or challenging to apply when determining a patient's risk.
+Finding the best classifier for predicting and detecting heart disease was the
+aim of this research. This experiment examined a range of machine learning
+approaches, including Logistic Regression, K-Nearest Neighbor, Support Vector
+Machine, and Artificial Neural Networks, to determine which machine learning
+algorithm was most effective at predicting heart diseases. The data set for
+this study was provided by the UCI heart disease repository, one of the most
+frequently used data sets for this purpose. The K-Nearest Neighbor technique
+was shown to be the most effective machine learning algorithm for determining
+whether a patient has heart disease. It would be beneficial to conduct further
+studies on the application of additional machine learning algorithms for heart
+disease prediction.
+
+
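+ A comparison of this kind can be reproduced in a few lines with scikit-learn;
+the CSV path and the binary "target" column below are assumptions about a local
+export of the UCI data, not the paper's exact preprocessing or splits.
+```python
+import pandas as pd
+from sklearn.model_selection import cross_val_score
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.neural_network import MLPClassifier
+
+# Assumes a local CSV export of the UCI heart-disease data with a binary
+# "target" column; the path and column name are illustrative.
+df = pd.read_csv("heart.csv")
+X, y = df.drop(columns="target"), df["target"]
+
+models = {
+    "logistic_regression": LogisticRegression(max_iter=1000),
+    "knn": KNeighborsClassifier(n_neighbors=7),
+    "svm": SVC(),
+    "ann": MLPClassifier(max_iter=2000),
+}
+for name, model in models.items():
+    acc = cross_val_score(make_pipeline(StandardScaler(), model), X, y, cv=5).mean()
+    print(f"{name}: {acc:.3f}")
+```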
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ View-Invariant Policy Learning via Zero-Shot Novel View Synthesis + + +
+ Large-scale visuomotor policy learning is a promising approach toward +developing generalizable manipulation systems. Yet, policies that can be +deployed on diverse embodiments, environments, and observational modalities +remain elusive. In this work, we investigate how knowledge from large-scale +visual data of the world may be used to address one axis of variation for +generalizable manipulation: observational viewpoint. Specifically, we study +single-image novel view synthesis models, which learn 3D-aware scene-level +priors by rendering images of the same scene from alternate camera viewpoints +given a single input image. For practical application to diverse robotic data, +these models must operate zero-shot, performing view synthesis on unseen tasks +and environments. We empirically analyze view synthesis models within a simple +data-augmentation scheme that we call View Synthesis Augmentation (VISTA) to +understand their capabilities for learning viewpoint-invariant policies from +single-viewpoint demonstration data. Upon evaluating the robustness of policies +trained with our method to out-of-distribution camera viewpoints, we find that +they outperform baselines in both simulated and real-world manipulation tasks. +Videos and additional visualizations are available at +https://s-tian.github.io/projects/vista. + +
+
+ comment: Accepted to CoRL 2024 +
+
+
+
+
+ + ☆ Predicting quantum channels over general product distributions + + +
+ We investigate the problem of predicting the output behavior of unknown +quantum channels. Given query access to an $n$-qubit channel $E$ and an +observable $O$, we aim to learn the mapping \begin{equation*} + \rho \mapsto \mathrm{Tr}(O E[\rho]) \end{equation*} to within a small error +for most $\rho$ sampled from a distribution $D$. Previously, Huang, Chen, and +Preskill proved a surprising result that even if $E$ is arbitrary, this task +can be solved in time roughly $n^{O(\log(1/\epsilon))}$, where $\epsilon$ is +the target prediction error. However, their guarantee applied only to input +distributions $D$ invariant under all single-qubit Clifford gates, and their +algorithm fails for important cases such as general product distributions over +product states $\rho$. + In this work, we propose a new approach that achieves accurate prediction +over essentially any product distribution $D$, provided it is not "classical" +in which case there is a trivial exponential lower bound. Our method employs a +"biased Pauli analysis," analogous to classical biased Fourier analysis. +Implementing this approach requires overcoming several challenges unique to the +quantum setting, including the lack of a basis with appropriate orthogonality +properties. The techniques we develop to address these issues may have broader +applications in quantum information. + +
+
+ comment: 20 pages, comments welcome +
+
+
+
+
+ + ☆ A New First-Order Meta-Learning Algorithm with Convergence Guarantees + + +
+ Learning new tasks by drawing on prior experience gathered from other +(related) tasks is a core property of any intelligent system. Gradient-based +meta-learning, especially MAML and its variants, has emerged as a viable +solution to accomplish this goal. One problem MAML encounters is its +computational and memory burdens needed to compute the meta-gradients. We +propose a new first-order variant of MAML that we prove converges to a +stationary point of the MAML objective, unlike other first-order variants. We +also show that the MAML objective does not satisfy the smoothness assumption +assumed in previous works; we show instead that its smoothness constant grows +with the norm of the meta-gradient, which theoretically suggests the use of +normalized or clipped-gradient methods compared to the plain gradient method +used in previous works. We validate our theory on a synthetic experiment. + +
+
+
+
+
+ + ☆ Practical Forecasting of Cryptocoins Timeseries using Correlation + Patterns + + +
+ Cryptocoins (e.g., Bitcoin, Ether, Litecoin) are tradable digital assets.
+Ownership of cryptocoins is registered on distributed ledgers (i.e.,
+blockchains). Secure encryption techniques guarantee the security of the
+transactions (transfers of coins among owners) registered in the ledger.
+Cryptocoins are exchanged at specific trading prices. The extreme volatility of
+such trading prices across all different sets of crypto-assets remains
+undisputed. However, the relations between the trading prices of different
+cryptocoins remain largely unexplored. Major coin exchanges indicate trend
+correlation to advise on sells or buys. However, price correlations remain
+largely unexplored. We shed some light on the trend correlations across a large
+variety of cryptocoins by investigating their coin/price correlation trends
+over the past two years. We study the causality between the trends and exploit
+the derived correlations to understand the accuracy of state-of-the-art
+forecasting techniques for time series modeling (e.g., GBMs, LSTM and GRU) of
+correlated cryptocoins. Our evaluation shows (i) strong correlation patterns
+between the most traded coins (e.g., Bitcoin and Ether) and other types of
+cryptocurrencies, and (ii) that state-of-the-art time series forecasting
+algorithms can be used to forecast cryptocoin price trends. We released the
+datasets and code to the research community to allow reproducing our analysis.
+
+
+
+
+
+
+ + ☆ Wind turbine condition monitoring based on intra- and inter-farm + federated learning + + +
+ As wind energy adoption is growing, ensuring the efficient operation and +maintenance of wind turbines becomes essential for maximizing energy production +and minimizing costs and downtime. Many AI applications in wind energy, such as +in condition monitoring and power forecasting, may benefit from using +operational data not only from individual wind turbines but from multiple +turbines and multiple wind farms. Collaborative distributed AI which preserves +data privacy holds a strong potential for these applications. Federated +learning has emerged as a privacy-preserving distributed machine learning +approach in this context. We explore federated learning in wind turbine +condition monitoring, specifically for fault detection using normal behaviour +models. We investigate various federated learning strategies, including +collaboration across different wind farms and turbine models, as well as +collaboration restricted to the same wind farm and turbine model. Our case +study results indicate that federated learning across multiple wind turbines +consistently outperforms models trained on a single turbine, especially when +training data is scarce. Moreover, the amount of historical data necessary to +train an effective model can be significantly reduced by employing a +collaborative federated learning strategy. Finally, our findings show that +extending the collaboration to multiple wind farms may result in inferior +performance compared to restricting learning within a farm, specifically when +faced with statistical heterogeneity and imbalanced datasets. + +
+
+
+
+
+ + ☆ A method to benchmark high-dimensional process drift detection + + +
+ Process curves are multivariate finite time series data coming from
+manufacturing processes. This paper studies machine learning methods for
+detecting drifts in process curves. A theoretical framework to synthetically
+generate process curves in a controlled way is introduced in order to benchmark
+machine learning algorithms for process drift detection. An evaluation score,
+called the temporal area under the curve, is introduced, which quantifies how
+well machine learning models unveil curves belonging to drift segments.
+Finally, a benchmark study comparing popular machine learning approaches on
+synthetic data generated with the introduced framework is presented.
+
+
+
+
+
+
+ + ☆ A Fused Large Language Model for Predicting Startup Success + + +
+ Investors are continuously seeking profitable investment opportunities in +startups and, hence, for effective decision-making, need to predict a startup's +probability of success. Nowadays, investors can use not only various +fundamental information about a startup (e.g., the age of the startup, the +number of founders, and the business sector) but also textual description of a +startup's innovation and business model, which is widely available through +online venture capital (VC) platforms such as Crunchbase. To support the +decision-making of investors, we develop a machine learning approach with the +aim of locating successful startups on VC platforms. Specifically, we develop, +train, and evaluate a tailored, fused large language model to predict startup +success. Thereby, we assess to what extent self-descriptions on VC platforms +are predictive of startup success. Using 20,172 online profiles from +Crunchbase, we find that our fused large language model can predict startup +success, with textual self-descriptions being responsible for a significant +part of the predictive power. Our work provides a decision support tool for +investors to find profitable investment opportunities. + +
+
+
+
+
+ + ☆ Threat Classification on Deployed Optical Networks Using MIMO Digital + Fiber Sensing, Wavelets, and Machine Learning + + +
+ We demonstrate classification of mechanical threats, including jackhammers
+and excavators, leveraging the wavelet transform of MIMO-DFS output data across
+a 57-km operational network link. Our machine learning framework incorporates
+transfer learning and shows 93% classification accuracy on field data, with
+benefits for optical network supervision.
+
+
+
+
+
+
+ + ☆ Weather-Adaptive Multi-Step Forecasting of State of Polarization Changes + in Aerial Fibers Using Wavelet Neural Networks + + +
+ We introduce a novel weather-adaptive approach for multi-step forecasting of +multi-scale SOP changes in aerial fiber links. By harnessing the discrete +wavelet transform and incorporating weather data, our approach improves +forecasting accuracy by over 65% in RMSE and 63% in MAPE compared to baselines. + +
+
+ comment: ECOC 2024 +
+
+
+
+
+ + ☆ The representation landscape of few-shot learning and fine-tuning in + large language models + + +
+ In-context learning (ICL) and supervised fine-tuning (SFT) are two common +strategies for improving the performance of modern large language models (LLMs) +on specific tasks. Despite their different natures, these strategies often lead +to comparable performance gains. However, little is known about whether they +induce similar representations inside LLMs. We approach this problem by +analyzing the probability landscape of their hidden representations in the two +cases. More specifically, we compare how LLMs solve the same question-answering +task, finding that ICL and SFT create very different internal structures, in +both cases undergoing a sharp transition in the middle of the network. In the +first half of the network, ICL shapes interpretable representations +hierarchically organized according to their semantic content. In contrast, the +probability landscape obtained with SFT is fuzzier and semantically mixed. In +the second half of the model, the fine-tuned representations develop +probability modes that better encode the identity of answers, while the +landscape of ICL representations is characterized by less defined peaks. Our +approach reveals the diverse computational strategies developed inside LLMs to +solve the same task across different conditions, allowing us to make a step +towards designing optimal methods to extract information from language models. + +
+
+
+
+
+ + ☆ A DNN Biophysics Model with Topological and Electrostatic Features + + +
+ In this project, we provide a deep neural network (DNN) based biophysics
+model to predict protein properties. The model uses multi-scale and uniform
+topological and electrostatic features generated with protein structural
+information and the force field, which governs the molecular mechanics. The
+topological features are generated using element-specific persistent homology
+(ESPH), while the electrostatic features are computed rapidly using a Cartesian
+treecode. These features are uniform in number for proteins of various sizes,
+so the broadly available protein structure databases can be used in training
+the network. These features are also multi-scale, so the resolution and
+computational cost can be balanced by the users. The machine learning
+simulation on over 4000 protein structures shows the efficiency and fidelity of
+these features in representing the protein structure and force field for the
+prediction of biophysical properties such as the electrostatic solvation
+energy. Tests on topological or electrostatic features alone and on the
+combination of both showed optimal performance when both features are used.
+This model shows its potential as a general tool for assisting biophysical
+property and function prediction for a broad range of biomolecules, using data
+from both theoretical computation and experiments.
+
+
+
+
+
+
+ + ☆ Unsupervised Anomaly Detection and Localization with Generative + Adversarial Networks + + +
+ We propose a novel unsupervised anomaly detection approach using generative +adversarial networks and SOP-derived spectrograms. Demonstrating remarkable +efficacy, our method achieves over 97% accuracy on SOP datasets from both +submarine and terrestrial fiber links, all achieved without the need for +labelled data. + +
+
+ comment: ECOC 2024 +
+
+
+
+
+ + ☆ Privacy versus Emotion Preservation Trade-offs in Emotion-Preserving + Speaker Anonymization + + +
+ Advances in speech technology now allow unprecedented access to personally
+identifiable information through speech. To protect such information, the
+differential privacy field has explored ways to anonymize speech while
+preserving its utility, including linguistic and paralinguistic aspects.
+However, anonymizing speech while maintaining emotional state remains
+challenging. We explore this problem in the context of the VoicePrivacy 2024
+challenge. Specifically, we developed various speaker anonymization pipelines
+and found that approaches excel at either anonymization or emotion
+preservation, but not both simultaneously. Achieving both would require an
+in-domain emotion recognizer. Additionally, we found that it is feasible to
+train a semi-effective speaker verification system using only emotion
+representations, demonstrating the challenge of separating these two
+modalities.
+
+
+
+ comment: accepted by 2024 IEEE Spoken Language Technology Workshop +
+
+
+
+
+ + ☆ On the Limited Generalization Capability of the Implicit Reward Model + Induced by Direct Preference Optimization + + +
+ Reinforcement Learning from Human Feedback (RLHF) is an effective approach +for aligning language models to human preferences. Central to RLHF is learning +a reward function for scoring human preferences. Two main approaches for +learning a reward model are 1) training an EXplicit Reward Model (EXRM) as in +RLHF, and 2) using an implicit reward learned from preference data through +methods such as Direct Preference Optimization (DPO). Prior work has shown that +the implicit reward model of DPO (denoted as DPORM) can approximate an EXRM in +the limit. DPORM's effectiveness directly implies the optimality of the learned +policy, and also has practical implication for LLM alignment methods including +iterative DPO. However, it is unclear how well DPORM empirically matches the +performance of EXRM. This work studies the accuracy at distinguishing preferred +and rejected answers for both DPORM and EXRM. Our findings indicate that even +though DPORM fits the training dataset comparably, it generalizes less +effectively than EXRM, especially when the validation datasets contain +distribution shifts. Across five out-of-distribution settings, DPORM has a mean +drop in accuracy of 3% and a maximum drop of 7%. These findings highlight that +DPORM has limited generalization ability and substantiates the integration of +an explicit reward model in iterative DPO approaches. + +
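+ For readers unfamiliar with DPORM, the implicit reward that DPO induces is
+commonly written as follows in the DPO literature; the notation here is
+generic rather than copied from this paper.
+```latex
+% Implicit reward recovered from a DPO-trained policy \pi_\theta and its
+% reference policy \pi_{\mathrm{ref}} (up to a prompt-dependent constant):
+r_\theta(x, y) = \beta \, \log \frac{\pi_\theta(y \mid x)}{\pi_{\mathrm{ref}}(y \mid x)}
+% DPORM scores a preference pair (y_w, y_l) by the sign of
+% r_\theta(x, y_w) - r_\theta(x, y_l), which is what the comparison against an
+% explicit reward model (EXRM) evaluates.
+```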
+
+ comment: 12 pages, 8 tables, 2 figures +
+
+
+
+
+ + ☆ Limited but consistent gains in adversarial robustness by co-training + object recognition models with human EEG + + +
+ In contrast to human vision, artificial neural networks (ANNs) remain
+relatively susceptible to adversarial attacks. To address this vulnerability,
+efforts have been made to transfer inductive bias from human brains to ANNs,
+often by training the ANN representations to match their biological
+counterparts. Previous works relied on brain data acquired in rodents or
+primates using invasive techniques, from specific regions of the brain, under
+non-natural conditions (anesthetized animals), and with stimulus datasets
+lacking diversity and naturalness. In this work, we explored whether aligning
+model representations to human EEG responses to a rich set of real-world images
+increases the adversarial robustness of ANNs. Specifically, we trained
+ResNet50-backbone models on a dual task of classification and EEG prediction,
+and evaluated their EEG prediction accuracy and robustness to adversarial
+attacks. We observed significant correlation between the networks' EEG
+prediction accuracy, often highest around 100 ms post stimulus onset, and their
+gains in adversarial robustness. Although the effect size was limited, effects
+were consistent across different random initializations and robust across
+architectural variants. We further teased apart the data from individual EEG
+channels and observed the strongest contribution from electrodes in the
+parieto-occipital regions. The demonstrated utility of human EEG for such tasks
+opens up avenues for future efforts that scale to larger datasets under diverse
+stimulus conditions with the promise of stronger effects.
+
+
+
+
+
+
+ + ☆ Beyond Model Interpretability: Socio-Structural Explanations in Machine + Learning + + +
+ What does it mean to interpret the outputs of an opaque machine learning
+model? One approach is to develop interpretable machine learning techniques.
+These techniques aim to show how machine learning models function by providing
+either model-centric local or global explanations, which can be based on
+mechanistic interpretations revealing the inner working mechanisms of models or
+on non-mechanistic approximations showing relationships between input features
+and output data. In this paper, we draw on social philosophy to argue that
+interpreting machine learning outputs in certain normatively salient domains
+could require appealing to a third type of explanation that we call
+sociostructural explanation. The relevance of this explanation type is
+motivated by the fact that machine learning models are not isolated entities
+but are embedded within and shaped by social structures. Sociostructural
+explanations aim to illustrate how social structures contribute to and
+partially explain the outputs of machine learning models. We demonstrate the
+importance of sociostructural explanations by examining a racially biased
+healthcare allocation algorithm. Our proposal highlights the need for
+transparency beyond model interpretability: understanding the outputs of
+machine learning systems could require a broader analysis that extends beyond
+the understanding of the machine learning model itself.
+
+
+
+
+
+
+ + ☆ DART2: a robust multiple testing method to smartly leverage helpful or + misleading ancillary information + + +
+ In many applications of multiple testing, ancillary information is available,
+reflecting the null or alternative status of each hypothesis. Several methods
+have been developed to leverage this ancillary information to enhance testing
+power, typically requiring that the ancillary information be helpful enough to
+ensure favorable performance. In this paper, we develop a robust and effective
+distance-assisted multiple testing procedure named DART2, designed to be
+powerful and robust regardless of the quality of ancillary information. When
+the ancillary information is helpful, DART2 can asymptotically control FDR
+while improving power; otherwise, DART2 can still control FDR and maintain
+power at least as high as when ignoring the ancillary information. We
+demonstrated DART2's superior performance compared to existing methods through
+numerical studies under various settings. In addition, DART2 has been applied
+to a gene association study, where we have shown its superior accuracy and
+robustness under two different types of ancillary information.
+
+
+
+ comment: 26 pages, 6 figures +
+
+
+
+
+ + ☆ 1 Modular Parallel Manipulator for Long-Term Soft Robotic Data + Collection + + +
+ Performing long-term experimentation or large-scale data collection for +machine learning in the field of soft robotics is challenging, due to the +hardware robustness and experimental flexibility required. In this work, we +propose a modular parallel robotic manipulation platform suitable for such +large-scale data collection and compatible with various soft-robotic +fabrication methods. Considering the computational and theoretical difficulty +of replicating the high-fidelity, faster-than-real-time simulations that enable +large-scale data collection in rigid robotic systems, a robust soft-robotic +hardware platform becomes a high priority development task for the field. + The platform's modules consist of a pair of off-the-shelf electrical motors +which actuate a customizable finger consisting of a compliant parallel +structure. The parallel mechanism of the finger can be as simple as a single +3D-printed urethane or molded silicone bulk structure, due to the motors being +able to fully actuate a passive structure. This design flexibility allows +experimentation with soft mechanism varied geometries, bulk properties and +surface properties. Additionally, while the parallel mechanism does not require +separate electronics or additional parts, these can be included, and it can be +constructed using multi-functional soft materials to study compatible soft +sensors and actuators in the learning process. In this work, we validate the +platform's ability to be used for policy gradient reinforcement learning +directly on hardware in a benchmark 2D manipulation task. We additionally +demonstrate compatibility with multiple fingers and characterize the design +constraints for compatible extensions. + +
+
+
+
+
+ + ☆ VFLGAN-TS: Vertical Federated Learning-based Generative Adversarial + Networks for Publication of Vertically Partitioned Time-Series Data + + +
+ In the current artificial intelligence (AI) era, the scale and quality of the
+dataset play a crucial role in training a high-quality AI model. However, the
+original data often cannot be shared due to privacy concerns and regulations. A
+potential solution is to release a synthetic dataset with a similar
+distribution to the private dataset. Nevertheless, in some scenarios, the
+attributes required to train an AI model are distributed among different
+parties, and the parties cannot share the local data for synthetic data
+construction due to privacy regulations. In PETS 2024, we recently introduced
+the first Vertical Federated Learning-based Generative Adversarial Network
+(VFLGAN) for publishing vertically partitioned static data. However, VFLGAN
+cannot effectively handle time-series data, which presents both temporal and
+attribute dimensions. In this article, we propose VFLGAN-TS, which combines the
+ideas of an attribute discriminator and vertical federated learning to generate
+synthetic time-series data in the vertically partitioned scenario. The
+performance of VFLGAN-TS is close to that of its counterpart, which is trained
+in a centralized manner and represents the upper limit for VFLGAN-TS. To
+further protect privacy, we apply a Gaussian mechanism to make VFLGAN-TS
+satisfy $(\epsilon,\delta)$-differential privacy. Besides, we develop an
+enhanced privacy auditing scheme to evaluate the potential privacy breach
+through the framework of VFLGAN-TS and the synthetic datasets.
+
+
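+ The Gaussian mechanism mentioned above is the classical
+$(\epsilon,\delta)$-DP construction; a generic sketch follows, where the
+clipping threshold and the idea of applying it per update are assumptions
+about usage, not VFLGAN-TS's exact integration.
+```python
+import numpy as np
+
+def gaussian_mechanism(update, l2_clip, epsilon, delta, rng=None):
+    """Classical (epsilon, delta) Gaussian mechanism: clip the L2 norm of an
+    update to bound its sensitivity, then add calibrated Gaussian noise
+    (valid for epsilon < 1 with sigma = sqrt(2 ln(1.25/delta)) * clip / eps)."""
+    rng = rng or np.random.default_rng()
+    norm = np.linalg.norm(update)
+    clipped = update * min(1.0, l2_clip / (norm + 1e-12))
+    sigma = np.sqrt(2.0 * np.log(1.25 / delta)) * l2_clip / epsilon
+    return clipped + rng.normal(0.0, sigma, size=update.shape)
+
+noisy = gaussian_mechanism(np.ones(10), l2_clip=1.0, epsilon=0.5, delta=1e-5)
+```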
+
+
+
+
+ + ☆ A practical approach to evaluating the adversarial distance for machine + learning classifiers + + +
+ Robustness is critical for machine learning (ML) classifiers to ensure +consistent performance in real-world applications where models may encounter +corrupted or adversarial inputs. In particular, assessing the robustness of +classifiers to adversarial inputs is essential to protect systems from +vulnerabilities and thus ensure safety in use. However, methods to accurately +compute adversarial robustness have been challenging for complex ML models and +high-dimensional data. Furthermore, evaluations typically measure adversarial +accuracy on specific attack budgets, limiting the informative value of the +resulting metrics. This paper investigates the estimation of the more +informative adversarial distance using iterative adversarial attacks and a +certification approach. Combined, the methods provide a comprehensive +evaluation of adversarial robustness by computing estimates for the upper and +lower bounds of the adversarial distance. We present visualisations and +ablation studies that provide insights into how this evaluation method should +be applied and parameterised. We find that our adversarial attack approach is +effective compared to related implementations, while the certification method +falls short of expectations. The approach in this paper should encourage a more +informative way of evaluating the adversarial robustness of ML classifiers. + +
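+ A minimal sketch of the attack-based half of such an evaluation: run a
+PGD-style attack at increasing L-infinity budgets on a single input and report
+the smallest budget that flips the prediction as an upper bound on the
+adversarial distance. The step count and budget grid are illustrative, and the
+certification-based lower bound is not shown.
+```python
+import torch
+import torch.nn.functional as F
+
+def adversarial_distance_upper_bound(model, x, y, steps=20):
+    """Upper-bound the L-infinity adversarial distance of one input (batch of
+    size 1) by attacking at increasing budgets; illustrative sketch only."""
+    for eps in torch.linspace(0.005, 0.2, 40):
+        x_adv = x.clone()
+        for _ in range(steps):
+            x_adv.requires_grad_(True)
+            loss = F.cross_entropy(model(x_adv), y)
+            grad = torch.autograd.grad(loss, x_adv)[0]
+            with torch.no_grad():
+                x_adv = x_adv + (eps / steps) * grad.sign()
+                x_adv = x + (x_adv - x).clamp(-eps, eps)  # project onto the ball
+        if model(x_adv).argmax(dim=1) != y:
+            return eps.item()          # smallest budget that flipped the label
+    return float("inf")                # attack failed on the whole grid
+```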
+
+ comment: Accepted manuscript at International Mechanical Engineering Congress + and Exposition IMECE2024 +
+
+
+
+
+ + ☆ Costs Estimation in Unit Commitment Problems using Simulation-Based + Inference + + +
+ The Unit Commitment (UC) problem is a key optimization task in power systems +to forecast the generation schedules of power units over a finite time period +by minimizing costs while meeting demand and technical constraints. However, +many parameters required by the UC problem are unknown, such as the costs. In +this work, we estimate these unknown costs using simulation-based inference on +an illustrative UC problem, which provides an approximated posterior +distribution of the parameters given observed generation schedules and demands. +Our results highlight that the learned posterior distribution effectively +captures the underlying distribution of the data, providing a range of possible +values for the unknown parameters given a past observation. This posterior +allows for the estimation of past costs using observed past generation +schedules, enabling operators to better forecast future costs and make more +robust generation scheduling forecasts. We present avenues for future research +to address overconfidence in posterior estimation, enhance the scalability of +the methodology and apply it to more complex UC problems modeling the network +constraints and renewable energy sources. + +
+
+
+
+
+ + ☆ CHIRPs: Change-Induced Regret Proxy metrics for Lifelong Reinforcement + Learning + + +
+ Reinforcement learning agents can achieve superhuman performance in static +tasks but are costly to train and fragile to task changes. This limits their +deployment in real-world scenarios where training experience is expensive or +the context changes through factors like sensor degradation, environmental +processes or changing mission priorities. Lifelong reinforcement learning aims +to improve sample efficiency and adaptability by studying how agents perform in +evolving problems. The difficulty that these changes pose to an agent is rarely +measured directly, however. Agent performances can be compared across a change, +but this is often prohibitively expensive. We propose Change-Induced Regret +Proxy (CHIRP) metrics, a class of metrics for approximating a change's +difficulty while avoiding the high costs of using trained agents. A +relationship between a CHIRP metric and agent performance is identified in two +environments, a simple grid world and MetaWorld's suite of robotic arm tasks. +We demonstrate two uses for these metrics: for learning, an agent that clusters +MDPs based on a CHIRP metric achieves $17\%$ higher average returns than three +existing agents in a sequence of MetaWorld tasks. We also show how a CHIRP can +be calibrated to compare the difficulty of changes across distinctly different +environments. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ 100 instances is all you need: predicting the success of a new LLM on + unseen data by testing on a few instances KDD + + +
+ Predicting the performance of LLMs on individual task instances is essential
+to ensure their reliability in high-stakes applications. One possibility is to
+evaluate the considered LLM on a set of task instances and train an assessor to
+predict its performance based on features of the instances. However, this
+approach requires evaluating each new LLM on a sufficiently large set of task
+instances to train an assessor specific to it. In this work, we leverage the
+evaluation results of previously tested LLMs to reduce the number of
+evaluations required to predict the performance of a new LLM. In practice, we
+propose to test the new LLM on a small set of reference instances and train a
+generic assessor which predicts the performance of the LLM on an instance based
+on the performance of the former on the reference set and features of the
+instance of interest. We conduct empirical studies on HELM-Lite and
+KindsOfReasoning, a collection of existing reasoning datasets that we
+introduce, where we evaluate all instruction-fine-tuned OpenAI models up to the
+January 2024 version of GPT-4. When predicting performance on instances with
+the same distribution as those used to train the generic assessor, we find this
+achieves performance comparable to the LLM-specific assessors trained on the
+full set of instances. Additionally, we find that randomly selecting the
+reference instances performs as well as some advanced selection methods we
+tested. For out-of-distribution instances, however, no clear winner emerges and
+the overall performance is worse, suggesting that the inherent predictability
+of LLMs is low.
+
+
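+ A minimal sketch of the generic-assessor idea under stated assumptions:
+binary per-instance success labels, a fixed reference set of roughly 100
+instances, and logistic regression as the assessor. None of these specifics
+are claimed to match the paper's exact setup.
+```python
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def fit_generic_assessor(inst_feats, ref_perf, labels):
+    """inst_feats: (n_rows, d) features of each (LLM, instance) pair's instance;
+    ref_perf:   (n_rows, r) the corresponding LLM's 0/1 results on r reference
+                instances, repeated on every row for that LLM;
+    labels:     0/1 success of that LLM on that instance."""
+    X = np.hstack([inst_feats, ref_perf])
+    return LogisticRegression(max_iter=1000).fit(X, labels)
+
+def predict_new_llm(assessor, new_inst_feats, new_llm_ref_perf):
+    # Only the new LLM's results on the small reference set are needed.
+    ref = np.tile(new_llm_ref_perf, (len(new_inst_feats), 1))
+    return assessor.predict_proba(np.hstack([new_inst_feats, ref]))[:, 1]
+```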
+
+ comment: Presented at the 2024 KDD workshop on Evaluation and Trustworthiness + of Generative AI Models +
+
+
+
+
+ + ☆ MaskVal: Simple but Effective Uncertainty Quantification for 6D Pose + Estimation + + +
+ For the use of 6D pose estimation in robotic applications, reliable poses are +of utmost importance to ensure a safe, reliable and predictable operational +performance. Despite these requirements, state-of-the-art 6D pose estimators +often do not provide any uncertainty quantification for their pose estimates at +all, or if they do, it has been shown that the uncertainty provided is only +weakly correlated with the actual true error. To address this issue, we +investigate a simple but effective uncertainty quantification, that we call +MaskVal, which compares the pose estimates with their corresponding instance +segmentations by rendering and does not require any modification of the pose +estimator itself. Despite its simplicity, MaskVal significantly outperforms a +state-of-the-art ensemble method on both a dataset and a robotic setup. We show +that by using MaskVal, the performance of a state-of-the-art 6D pose estimator +is significantly improved towards a safe and reliable operation. In addition, +we propose a new and specific approach to compare and evaluate uncertainty +quantification methods for 6D pose estimation in the context of robotic +manipulation. + +
+
+
+
+
+ + ☆ Unified Framework for Neural Network Compression via Decomposition and + Optimal Rank Selection + + +
+ Despite their high accuracy, complex neural networks demand significant +computational resources, posing challenges for deployment on +resource-constrained devices such as mobile phones and embedded systems. +Compression algorithms have been developed to address these challenges by +reducing model size and computational demands while maintaining accuracy. Among +these approaches, factorization methods based on tensor decomposition are +theoretically sound and effective. However, they face difficulties in selecting +the appropriate rank for decomposition. This paper tackles this issue by +presenting a unified framework that simultaneously applies decomposition and +optimal rank selection, employing a composite compression loss within defined +rank constraints. Our approach includes an automatic rank search in a +continuous space, efficiently identifying optimal rank configurations without +the use of training data, making it computationally efficient. Combined with a +subsequent fine-tuning step, our approach maintains the performance of highly +compressed models on par with their original counterparts. Using various +benchmark datasets, we demonstrate the efficacy of our method through a +comprehensive analysis. + +
+
+
+
+
+ + ☆ DKDM: Data-Free Knowledge Distillation for Diffusion Models with Any + Architecture + + +
+ Diffusion models (DMs) have demonstrated exceptional generative capabilities
+across various areas, but they are hindered by slow inference speeds and high
+computational demands during deployment. The most common way to accelerate DMs
+involves reducing the number of denoising steps during generation, achieved
+through faster sampling solvers or knowledge distillation (KD). In contrast to
+prior approaches, we propose a novel method that transfers the capability of
+large pretrained DMs to faster architectures. Specifically, we employ KD in a
+distinct manner to compress DMs by distilling their generative ability into
+more rapid variants. Furthermore, considering that the source data is either
+inaccessible or too large to store for current generative models, we introduce
+a new paradigm for their distillation without source data, termed Data-Free
+Knowledge Distillation for Diffusion Models (DKDM). Generally, our established
+DKDM framework comprises two main components: 1) a DKDM objective that uses
+synthetic denoising data produced by pretrained DMs to optimize faster DMs
+without source data, and 2) a dynamic iterative distillation method that
+flexibly organizes the synthesis of denoising data, preventing the slow
+generation process from slowing down optimization. To our knowledge, this is
+the first attempt at using KD to distill DMs into any architecture in a
+data-free manner. Importantly, our DKDM is orthogonal to most existing
+acceleration methods, such as denoising step reduction, quantization and
+pruning. Experiments show that our DKDM is capable of deriving 2x faster DMs
+with performance remaining on par with the baseline. Notably, our DKDM enables
+pretrained DMs to function as "datasets" for training new DMs.
+
+
+
+
+
+
+ + ☆ The Power of Second Chance: Personalized Submodular Maximization with + Two Candidates + + +
+ Most existing studies on submodular maximization focus on selecting a subset
+of items that maximizes a \emph{single} submodular function. However, in many
+real-world scenarios, we might have multiple user-specific functions, each of
+which models the utility of a particular type of user. In these settings, our
+goal would be to choose a set of items that performs well across all the
+user-specific functions. One way to tackle this problem is to select a single
+subset that maximizes the sum of all of the user-specific functions. Although
+this aggregate approach is efficient in the sense that it avoids computation of
+sets for individual functions, it misses the power of personalization, since it
+does not allow choosing different sets for different functions. In this paper,
+we introduce the problem of personalized submodular maximization with two
+candidate solutions. For any two candidate solutions, the utility of each
+user-specific function is defined as the better of these two candidates. Our
+objective is, therefore, to select the best set of two candidates that
+maximizes the sum of utilities of all the user-specific functions. We have
+designed effective algorithms for this problem. We also discuss how our
+approach generalizes to multiple candidate solutions, increasing flexibility
+and personalization in our solution.
+
+
+
+
+
+
+ + ☆ Risk-based Calibration for Probabilistic Classifiers + + +
+ We introduce a general iterative procedure called risk-based calibration (RC) +designed to minimize the empirical risk under the 0-1 loss (empirical error) +for probabilistic classifiers. These classifiers are based on modeling +probability distributions, including those constructed from the joint +distribution (generative) and those based on the class conditional distribution +(conditional). RC can be particularized to any probabilistic classifier +provided a specific learning algorithm that computes the classifier's +parameters in closed form using data statistics. RC reinforces the statistics +aligned with the true class while penalizing those associated with other +classes, guided by the 0-1 loss. The proposed method has been empirically +tested on 30 datasets using na\"ive Bayes, quadratic discriminant analysis, and +logistic regression classifiers. RC improves the empirical error of the +original closed-form learning algorithms and, more notably, consistently +outperforms the gradient descent approach with the three classifiers. + +
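+ To make the statistic-adjustment idea concrete, the sketch below applies an
+RC-style update to a discrete naive Bayes model: counts are reinforced for the
+true class and penalized for the wrongly predicted class whenever a 0-1-loss
+error occurs. The learning rate and smoothing details are assumptions; the
+paper's exact update rule may differ.
+```python
+import numpy as np
+
+def risk_based_calibration(X, y, n_classes, n_values, iters=20, lr=1.0):
+    """X: (n, d) integer feature values in [0, n_values); y: (n,) class labels.
+    Illustrative RC-style calibration of naive Bayes sufficient statistics."""
+    n, d = X.shape
+    class_counts = np.ones(n_classes)                 # Laplace-smoothed counts
+    feat_counts = np.ones((n_classes, d, n_values))
+    for i in range(n):                                # closed-form initial fit
+        class_counts[y[i]] += 1
+        feat_counts[y[i], np.arange(d), X[i]] += 1
+
+    def predict(x):
+        cond = feat_counts[:, np.arange(d), x] / feat_counts.sum(axis=-1)
+        return int(np.argmax(np.log(class_counts) + np.log(cond).sum(axis=-1)))
+
+    for _ in range(iters):
+        for i in range(n):
+            pred = predict(X[i])
+            if pred != y[i]:                          # guided by the 0-1 loss
+                class_counts[y[i]] += lr
+                class_counts[pred] = max(class_counts[pred] - lr, 1e-3)
+                feat_counts[y[i], np.arange(d), X[i]] += lr
+                feat_counts[pred, np.arange(d), X[i]] = np.maximum(
+                    feat_counts[pred, np.arange(d), X[i]] - lr, 1e-3)
+    return class_counts, feat_counts
+```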
+
+
+
+
+ + ☆ Prediction Accuracy & Reliability: Classification and Object + Localization under Distribution Shift + + +
+ Natural distribution shift causes a deterioration in the perception
+performance of convolutional neural networks (CNNs). This comprehensive
+analysis for real-world traffic data addresses: 1) investigating the effect of
+natural distribution shift and weather augmentations on both detection quality
+and confidence estimation, 2) evaluating model performance for both
+classification and object localization, and 3) benchmarking two common
+uncertainty quantification methods - Ensembles and different variants of
+Monte-Carlo (MC) Dropout - under natural and close-to-natural distribution
+shift. For this purpose, a novel dataset has been curated from publicly
+available autonomous driving datasets. The in-distribution (ID) data is based
+on cutouts of a single object, for which both class and bounding box
+annotations are available. The six distribution-shift datasets cover adverse
+weather scenarios, simulated rain and fog, corner cases, and
+out-of-distribution data. A granular analysis of CNNs under distribution shift
+makes it possible to quantify the impact of different types of shift on both
+task performance and confidence estimation: ConvNeXt-Tiny is more robust than
+EfficientNet-B0; heavy rain degrades classification more strongly than
+localization, whereas heavy fog shows the opposite effect; integrating
+MC-Dropout into selected layers only has the potential to enhance task
+performance and confidence estimation, where the identification of these
+layers depends on the type of distribution shift and the considered task.
+
+
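+
+ For readers unfamiliar with Monte-Carlo Dropout, a minimal PyTorch sketch of
+the test-time procedure (assumed, not the authors' code): keep dropout layers
+stochastic during inference, average softmax outputs over several forward
+passes, and use the predictive entropy of the mean as a confidence estimate.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def enable_mc_dropout(model):
+    """Put the model in eval mode but keep Dropout layers stochastic."""
+    model.eval()
+    for m in model.modules():
+        if isinstance(m, (torch.nn.Dropout, torch.nn.Dropout2d)):
+            m.train()
+
+@torch.no_grad()
+def mc_dropout_predict(model, x, T=30):
+    """Average softmax probabilities over T stochastic passes and return
+    the mean prediction together with its predictive entropy."""
+    enable_mc_dropout(model)
+    probs = torch.stack([F.softmax(model(x), dim=-1) for _ in range(T)])
+    mean = probs.mean(dim=0)                              # (batch, classes)
+    entropy = -(mean * mean.clamp_min(1e-12).log()).sum(dim=-1)
+    return mean, entropy
+```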
+
+ comment: This preprint has not undergone any post-submission improvements or + corrections +
+
+
+
+
+ + ☆ A Physics-Informed Machine Learning Approach for Solving Distributed + Order Fractional Differential Equations + + +
+ This paper introduces a novel methodology for solving distributed-order +fractional differential equations using a physics-informed machine learning +framework. The core of this approach involves extending the support vector +regression (SVR) algorithm to approximate the unknown solutions of the +governing equations during the training phase. By embedding the +distributed-order functional equation into the SVR framework, we incorporate +physical laws directly into the learning process. To further enhance +computational efficiency, Gegenbauer orthogonal polynomials are employed as the +kernel function, capitalizing on their fractional differentiation properties to +streamline the problem formulation. Finally, the resulting optimization problem +of SVR is addressed either as a quadratic programming problem or as a positive +definite system in its dual form. The effectiveness of the proposed approach is +validated through a series of numerical experiments on Caputo-based +distributed-order fractional differential equations, encompassing both ordinary +and partial derivatives. + +
+
+
+
+
+ + ☆ Survey of Data-driven Newsvendor: Unified Analysis and Spectrum of + Achievable Regrets + + +
+ In the Newsvendor problem, the goal is to guess the number that will be drawn
+from some distribution, with asymmetric consequences for guessing too high vs.
+too low. In the data-driven version, the distribution is unknown, and one must
+work with samples from the distribution. Data-driven Newsvendor has been
+studied under many variants: additive vs. multiplicative regret, high
+probability vs. expectation bounds, and different distribution classes. This
+paper studies all combinations of these variants, filling in many gaps in the
+literature and simplifying many proofs. In particular, we provide a unified
+analysis based on the notion of clustered distributions, which, in conjunction
+with our new lower bounds, shows that the entire spectrum of regrets between
+$1/\sqrt{n}$ and $1/n$ is achievable.
+
+
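+
+ As background for the data-driven setting, a small sketch (not from the
+survey) of the standard sample-average-approximation baseline: with underage
+cost c_u and overage cost c_o, the order quantity is the empirical quantile of
+the demand samples at the critical ratio c_u / (c_u + c_o). The cost values and
+demand distribution below are made up for illustration.
+
+```python
+import numpy as np
+
+def saa_newsvendor(samples, c_underage, c_overage):
+    """Sample-average-approximation order quantity: the empirical quantile
+    of the demand samples at the critical ratio c_u / (c_u + c_o)."""
+    q = c_underage / (c_underage + c_overage)
+    return np.quantile(np.asarray(samples), q)
+
+rng = np.random.default_rng(0)
+demand = rng.normal(loc=100, scale=20, size=500)
+print(saa_newsvendor(demand, c_underage=4.0, c_overage=1.0))  # ~80th percentile
+```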
+
+
+
+
+ + ☆ Maximum likelihood inference for high-dimensional problems with + multiaffine variable relations + + +
+ Maximum Likelihood Estimation of continuous variable models can be very +challenging in high dimensions, due to potentially complex probability +distributions. The existence of multiple interdependencies among variables can +make it very difficult to establish convergence guarantees. This leads to a +wide use of brute-force methods, such as grid searching and Monte-Carlo +sampling and, when applicable, complex and problem-specific algorithms. In this +paper, we consider inference problems where the variables are related by +multiaffine expressions. We propose a novel Alternating and +Iteratively-Reweighted Least Squares (AIRLS) algorithm, and prove its +convergence for problems with Generalized Normal Distributions. We also provide +an efficient method to compute the variance of the estimates obtained using +AIRLS. Finally, we show how the method can be applied to graphical statistical +models. We perform numerical experiments on several inference problems, showing +significantly better performance than state-of-the-art approaches in terms of +scalability, robustness to noise, and convergence speed due to an empirically +observed super-linear convergence rate. + +
+
+
+
+
+ + ☆ Distributionally Robust Optimisation with Bayesian Ambiguity Sets + + +
+ Decision making under uncertainty is challenging since the data-generating +process (DGP) is often unknown. Bayesian inference proceeds by estimating the +DGP through posterior beliefs about the model's parameters. However, minimising +the expected risk under these posterior beliefs can lead to sub-optimal +decisions due to model uncertainty or limited, noisy observations. To address +this, we introduce Distributionally Robust Optimisation with Bayesian Ambiguity +Sets (DRO-BAS) which hedges against uncertainty in the model by optimising the +worst-case risk over a posterior-informed ambiguity set. We show that our +method admits a closed-form dual representation for many exponential family +members and showcase its improved out-of-sample robustness against existing +Bayesian DRO methodology in the Newsvendor problem. + +
+
+ comment: 13 pages, 3 figures. Under review +
+
+
+
+
+ + ☆ Sparsifying Parametric Models with L0 Regularization + + +
+ This document contains an educational introduction to the problem of +sparsifying parametric models with L0 regularization. We utilize this approach +together with dictionary learning to learn sparse polynomial policies for deep +reinforcement learning to control parametric partial differential equations. +The code and a tutorial are provided here: +https://github.com/nicob15/Sparsifying-Parametric-Models-with-L0. + +
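+
+ The linked repository contains the authors' code; as a generic illustration
+only, below is a PyTorch sketch of the hard-concrete stochastic gate commonly
+used for differentiable L0 regularization. Whether the tutorial uses exactly
+this relaxation is an assumption; class names and hyperparameters are
+illustrative defaults.
+
+```python
+import torch
+import torch.nn as nn
+
+class HardConcreteGate(nn.Module):
+    """Stochastic gate for L0 regularization (hard-concrete relaxation)."""
+    def __init__(self, n, beta=2 / 3, gamma=-0.1, zeta=1.1):
+        super().__init__()
+        self.log_alpha = nn.Parameter(torch.zeros(n))
+        self.beta, self.gamma, self.zeta = beta, gamma, zeta
+
+    def forward(self):
+        if self.training:
+            u = torch.rand_like(self.log_alpha).clamp(1e-6, 1 - 1e-6)
+            s = torch.sigmoid((u.log() - (1 - u).log() + self.log_alpha) / self.beta)
+        else:
+            s = torch.sigmoid(self.log_alpha)
+        return (s * (self.zeta - self.gamma) + self.gamma).clamp(0.0, 1.0)
+
+    def expected_l0(self):
+        # Probability that each gate is non-zero: the differentiable L0 penalty.
+        return torch.sigmoid(
+            self.log_alpha - self.beta * torch.log(torch.tensor(-self.gamma / self.zeta))
+        ).sum()
+
+# Usage: multiply weights (or dictionary coefficients) by gate() and add
+# lambda_l0 * gate.expected_l0() to the training loss.
+```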
+
+
+
+
+ + ☆ LLM-based event abstraction and integration for IoT-sourced logs + + +
+ The continuous flow of data collected by Internet of Things (IoT) devices
+has revolutionised our ability to understand and interact with the world across
+various applications. However, this data must be prepared and transformed into
+event data before analysis can begin. In this paper, we shed light on the
+potential of leveraging Large Language Models (LLMs) in event abstraction and
+integration. Our approach aims to create event records from raw sensor readings
+and merge the logs from multiple IoT sources into a single event log suitable
+for further Process Mining applications. We demonstrate the capabilities of
+LLMs in event abstraction through a case study of an IoT application in
+elderly care and longitudinal health monitoring. The results show an average
+accuracy of 90% in detecting high-level activities. These results highlight
+LLMs' promising potential in addressing event abstraction and integration
+challenges, effectively bridging the existing gap.
+
+
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Improving Uncertainty-Error Correspondence in Deep Bayesian Medical + Image Segmentation + + +
+ Increased usage of automated tools like deep learning in medical image
+segmentation has alleviated the bottleneck of manual contouring. This has
+shifted manual labour to quality assessment (QA) of automated contours which
+involves detecting errors and correcting them. A potential solution to
+semi-automated QA is to use deep Bayesian uncertainty to recommend potentially
+erroneous regions, thus reducing time spent on error detection. Previous work
+has investigated the correspondence between uncertainty and error; however, no
+work has been done on improving the "utility" of Bayesian uncertainty maps so
+that uncertainty is present only in inaccurate regions and not in accurate
+ones. Our work trains the FlipOut model with the Accuracy-vs-Uncertainty (AvU)
+loss which promotes uncertainty to be present only in inaccurate regions. We
+apply this method to datasets from two radiotherapy body sites, namely
+head-and-neck CT and prostate MR scans. Uncertainty heatmaps (i.e. predictive
+entropy) are evaluated against voxel inaccuracies using Receiver Operating
+Characteristic (ROC) and Precision-Recall (PR) curves. Numerical results show
+that when compared to the Bayesian baseline the proposed method successfully
+suppresses uncertainty for accurate voxels, with similar presence of
+uncertainty for inaccurate voxels. Code to reproduce experiments is available
+at https://github.com/prerakmody/bayesuncertainty-error-correspondence
+
+
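+
+ To make the accuracy-vs-uncertainty idea concrete, a small NumPy sketch
+(assumed, not the paper's differentiable loss) of the AvU score: voxels are
+binned into accurate/inaccurate and certain/uncertain using an uncertainty
+threshold, and the score rewards the accurate-certain and inaccurate-uncertain
+cells.
+
+```python
+import numpy as np
+
+def avu(accurate, uncertainty, threshold):
+    """AvU score from boolean per-voxel accuracy flags and an uncertainty map
+    (e.g. predictive entropy). Illustrative metric, not the training loss."""
+    accurate = np.asarray(accurate, dtype=bool)
+    certain = np.asarray(uncertainty) <= threshold
+    n_ac = np.sum(accurate & certain)      # accurate and certain
+    n_au = np.sum(accurate & ~certain)     # accurate and uncertain
+    n_ic = np.sum(~accurate & certain)     # inaccurate and certain
+    n_iu = np.sum(~accurate & ~certain)    # inaccurate and uncertain
+    return (n_ac + n_iu) / max(n_ac + n_au + n_ic + n_iu, 1)
+```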
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://melba-journal.org/2024:018 +
+
+
+
+
+ + ☆ Panopticon: a novel deep learning model to detect single transit events + with no prior data filtering in PLATO light curves + + +
+ To prepare for the analyses of the future PLATO light curves, we develop a +deep learning model, Panopticon, to detect transits in high precision +photometric light curves. Since PLATO's main objective is the detection of +temperate Earth-size planets around solar-type stars, the code is designed to +detect individual transit events. The filtering step, required by conventional +detection methods, can affect the transit, which could be an issue for long and +shallow transits. To protect transit shape and depth, the code is also designed +to work on unfiltered light curves. We trained the model on a set of simulated +PLATO light curves in which we injected, at pixel level, either planetary, +eclipsing binary, or background eclipsing binary signals. We also include a +variety of noises in our data, such as granulation, stellar spots or cosmic +rays. The approach is able to recover 90% of our test population, including +more than 25% of the Earth-analogs, even in the unfiltered light curves. The +model also recovers the transits irrespective of the orbital period, and is +able to retrieve transits on a unique event basis. These figures are obtained +when accepting a false alarm rate of 1%. When keeping the false alarm rate low +(<0.01%), it is still able to recover more than 85% of the transit signals. Any +transit deeper than 180ppm is essentially guaranteed to be recovered. This +method is able to recover transits on a unique event basis, and does so with a +low false alarm rate. Thanks to light curves being one-dimensional, model +training is fast, on the order of a few hours per model. This speed in training +and inference, coupled to the recovery effectiveness and precision of the model +make it an ideal tool to complement, or be used ahead of, classical approaches. + +
+
+ comment: Submitted to A&A +
+
+
+
+
+ + ☆ Characterizing Massive Activations of Attention Mechanism in Graph + Neural Networks + + +
+ Graph Neural Networks (GNNs) have become increasingly popular for effectively
+modeling data with graph structures. Recently, attention mechanisms have been
+integrated into GNNs to improve their ability to capture complex patterns. This
+paper presents the first comprehensive study revealing a critical, unexplored
+consequence of this integration: the emergence of Massive Activations (MAs)
+within attention layers. We introduce a novel method for detecting and
+analyzing MAs, focusing on edge features in different graph transformer
+architectures. Our study assesses various GNN models using benchmark datasets,
+including ZINC, TOX21, and PROTEINS. Key contributions include (1) establishing
+the direct link between attention mechanisms and MA generation in GNNs, (2)
+developing a robust definition and detection method for MAs based on activation
+ratio distributions, (3) introducing the Explicit Bias Term (EBT) as a
+potential countermeasure and exploring it as an adversarial framework to assess
+model robustness based on the presence or absence of MAs. Our findings
+highlight the prevalence and impact of attention-induced MAs across different
+architectures, such as GraphTransformer, GraphiT, and SAN. The study reveals
+the complex interplay between attention mechanisms, model architecture, dataset
+characteristics, and MA emergence, providing crucial insights for developing
+more robust and reliable graph models.
+
+
+
+
+
+
+ + ☆ Raw Speech Enhancement with Deep State Space Modeling + + +
+ We present aTENNuate, a simple deep state-space autoencoder configured for +efficient online raw speech enhancement in an end-to-end fashion. The network's +performance is primarily evaluated on raw speech denoising, with additional +assessments on tasks such as super-resolution and de-quantization. We benchmark +aTENNuate on the VoiceBank + DEMAND and the Microsoft DNS1 synthetic test sets. +The network outperforms previous real-time denoising models in terms of PESQ +score, parameter count, MACs, and latency. Even as a raw waveform processing +model, the model maintains high fidelity to the clean signal with minimal +audible artifacts. In addition, the model remains performant even when the +noisy input is compressed down to 4000Hz and 4 bits, suggesting general speech +enhancement capabilities in low-resource environments. + +
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ☆ Leveraging Large Language Models through Natural Language Processing to + provide interpretable Machine Learning predictions of mental deterioration in + real time + + +
+ Based on official estimates, 50 million people worldwide are affected by
+dementia, and this number increases by 10 million new patients every year.
+Without a cure, clinical prognostication and early intervention represent the
+most effective ways to delay its progression. To this end, Artificial
+Intelligence and computational linguistics can be exploited for natural
+language analysis, personalized assessment, monitoring, and treatment. However,
+traditional approaches lack sufficient semantic knowledge management and
+explainability capabilities. Moreover, work on using Large Language Models
+(LLMs) for cognitive decline diagnosis is still scarce, even though these
+models represent the most advanced way for clinician-patient communication
+using intelligent systems. Consequently, we leverage an LLM using the latest
+Natural Language Processing (NLP) techniques in a chatbot solution to provide
+interpretable Machine Learning prediction of cognitive decline in real time.
+Linguistic-conceptual features are exploited for appropriate natural language
+analysis. Through explainability, we aim to fight potential biases of the
+models and improve their potential to help clinical workers in their diagnosis
+decisions. In more detail, the proposed pipeline is composed of (i) data
+extraction employing NLP-based prompt engineering; (ii) stream-based data
+processing including feature engineering, analysis, and selection; (iii)
+real-time classification; and (iv) an explainability dashboard that provides
+visual and natural language descriptions of the prediction outcome.
+Classification results exceed 80% on all evaluation metrics, with a recall of
+about 85% for the mental deterioration class. In summary, this work
+contributes an affordable, flexible, non-invasive, personalized diagnostic
+system.
+
+
+
+
+
+
+ + ☆ Efficient Multi-Task Large Model Training via Data Heterogeneity-aware + Model Management + + +
+ Recent foundation models are capable of handling multiple machine learning +(ML) tasks and multiple data modalities with the unified base model structure +and several specialized model components. However, the development of such +multi-task (MT) multi-modal (MM) models poses significant model management +challenges to existing training systems. Due to the sophisticated model +architecture and the heterogeneous workloads of different ML tasks and data +modalities, training these models usually requires massive GPU resources and +suffers from sub-optimal system efficiency. + In this paper, we investigate how to achieve high-performance training of +large-scale MT MM models through data heterogeneity-aware model management +optimization. The key idea is to decompose the model execution into stages and +address the joint optimization problem sequentially, including both +heterogeneity-aware workload parallelization and dependency-driven execution +scheduling. Based on this, we build a prototype system and evaluate it on +various large MT MM models. Experiments demonstrate the superior performance +and efficiency of our system, with speedup ratio up to 71% compared to +state-of-the-art training systems. + +
+
+
+
+
+ + ☆ MouseSIS: A Frames-and-Events Dataset for Space-Time Instance + Segmentation of Mice ECCV + + +
+ Enabled by large annotated datasets, tracking and segmentation of objects in
+videos has made remarkable progress in recent years. Despite these
+advancements, algorithms still struggle under degraded conditions and during
+fast movements. Event cameras are novel sensors with high temporal resolution
+and high dynamic range that offer promising advantages to address these
+challenges. However, annotated data for developing learning-based mask-level
+tracking algorithms with events is not available. To this end, we introduce:
+(i) a new task termed space-time instance segmentation, similar to video
+instance segmentation, whose goal is to segment instances throughout the
+entire duration of the sensor input (here, the input are quasi-continuous
+events and optionally aligned frames); and (ii) MouseSIS, a dataset for the
+new task, containing aligned grayscale frames and events. It includes
+annotated ground-truth labels (pixel-level instance segmentation masks) of a
+group of up to seven freely moving and interacting mice. We also provide two
+reference methods, which show that leveraging event data can consistently
+improve tracking performance, especially when used in combination with
+conventional cameras. The results highlight the potential of event-aided
+tracking in difficult scenarios. We hope our dataset opens the field of
+event-based video instance segmentation and enables the development of robust
+tracking algorithms for challenging conditions.
+https://github.com/tub-rip/MouseSIS
+
+
+
+ comment: 18 pages, 5 figures, ECCV Workshops +
+
+
+
+
+ + ☆ Semi-Supervised Sparse Gaussian Classification: Provable Benefits of + Unlabeled Data + + +
+ The premise of semi-supervised learning (SSL) is that combining labeled and
+unlabeled data yields significantly more accurate models. Despite empirical
+successes, the theoretical understanding of SSL is still far from complete. In
+this work, we study SSL for high dimensional sparse Gaussian classification. To
+construct an accurate classifier a key task is feature selection, detecting the
+few variables that separate the two classes. For this SSL setting, we analyze
+information theoretic lower bounds for accurate feature selection as well as
+computational lower bounds, assuming the low-degree likelihood hardness
+conjecture. Our key contribution is the identification of a regime in the
+problem parameters (dimension, sparsity, number of labeled and unlabeled
+samples) where SSL is guaranteed to be advantageous for classification.
+Specifically, there is a regime where it is possible to construct in polynomial
+time an accurate SSL classifier. However, any computationally efficient
+supervised or unsupervised learning scheme that separately uses only the
+labeled or unlabeled data would fail. Our work highlights the provable benefits
+of combining labeled and unlabeled data for classification and feature
+selection in high dimensions. We present simulations that complement our
+theoretical analysis.
+
+
+
+
+
+
+ + ☆ Towards training digitally-tied analog blocks via hybrid gradient + computation + + +
+ Power efficiency is plateauing in the standard digital electronics realm such
+that novel hardware, models, and algorithms are needed to reduce the costs of
+AI training. The combination of energy-based analog circuits and the
+Equilibrium Propagation (EP) algorithm constitutes one compelling alternative
+compute paradigm for gradient-based optimization of neural nets. Existing
+analog hardware accelerators, however, typically incorporate digital circuitry
+to sustain auxiliary non-weight-stationary operations, mitigate analog device
+imperfections, and leverage existing digital accelerators. This heterogeneous
+hardware approach calls for a new theoretical model building block. In this
+work, we introduce Feedforward-tied Energy-based Models (ff-EBMs), a hybrid
+model comprising feedforward and energy-based blocks accounting for digital and
+analog circuits. We derive a novel algorithm to compute gradients end-to-end in
+ff-EBMs by backpropagating and "eq-propagating" through feedforward and
+energy-based parts respectively, enabling EP to be applied to much more
+flexible and realistic architectures. We experimentally demonstrate the
+effectiveness of the proposed approach on ff-EBMs where Deep Hopfield Networks
+(DHNs) are used as energy-based blocks. We first show that a standard DHN can
+be arbitrarily split into blocks of any uniform size while maintaining
+performance. We then train ff-EBMs on ImageNet32 where we establish new SOTA
+performance in the EP literature (46% top-1 accuracy). Our approach offers a
+principled, scalable, and incremental roadmap to gradually integrate
+self-trainable analog computational primitives into existing digital
+accelerators.
+
+
+
+
+
+
+ + ☆ Improving Robustness to Multiple Spurious Correlations by + Multi-Objective Optimization + + +
+ We study the problem of training an unbiased and accurate model given a
+dataset with multiple biases. This problem is challenging since the multiple
+biases cause multiple undesirable shortcuts during training, and even worse,
+mitigating one may exacerbate the other. We propose a novel training method to
+tackle this challenge. Our method first groups training data so that different
+groups induce different shortcuts, and then optimizes a linear combination of
+group-wise losses while adjusting their weights dynamically to alleviate
+conflicts between the groups in performance; this approach, rooted in
+multi-objective optimization theory, encourages convergence to the minimax
+Pareto solution. We also present a new benchmark with multiple biases, dubbed
+MultiCelebA, for evaluating debiased training methods under realistic and
+challenging scenarios. Our method achieved the best performance on three
+datasets with multiple biases, and also showed superior performance on
+conventional single-bias datasets.
+
+
+
+ comment: International Conference on Machine Learning 2024 +
+
+
+
+
+ + ☆ Fourier Neural Operators for Learning Dynamics in Quantum Spin Systems + + +
+ Fourier Neural Operators (FNOs) excel on tasks using functional data, such as +those originating from partial differential equations. Such characteristics +render them an effective approach for simulating the time evolution of quantum +wavefunctions, which is a computationally challenging, yet coveted task for +understanding quantum systems. In this manuscript, we use FNOs to model the +evolution of random quantum spin systems, so chosen due to their representative +quantum dynamics and minimal symmetry. We explore two distinct FNO +architectures and examine their performance for learning and predicting time +evolution using both random and low-energy input states. Additionally, we apply +FNOs to a compact set of Hamiltonian observables ($\sim\text{poly}(n)$) instead +of the entire $2^n$ quantum wavefunction, which greatly reduces the size of our +inputs and outputs and, consequently, the requisite dimensions of the resulting +FNOs. Moreover, this Hamiltonian observable-based method demonstrates that FNOs +can effectively distill information from high-dimensional spaces into +lower-dimensional spaces. The extrapolation of Hamiltonian observables to times +later than those used in training is of particular interest, as this stands to +fundamentally increase the simulatability of quantum systems past both the +coherence times of contemporary quantum architectures and the circuit-depths of +tractable tensor networks. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ ELO-Rated Sequence Rewards: Advancing Reinforcement Learning Models + + +
+ Reinforcement Learning (RL) is highly dependent on the meticulous design of +the reward function. However, accurately assigning rewards to each state-action +pair in Long-Term RL (LTRL) challenges is formidable. Consequently, RL agents +are predominantly trained with expert guidance. Drawing on the principles of +ordinal utility theory from economics, we propose a novel reward estimation +algorithm: ELO-Rating based RL (ERRL). This approach is distinguished by two +main features. Firstly, it leverages expert preferences over trajectories +instead of cardinal rewards (utilities) to compute the ELO rating of each +trajectory as its reward. Secondly, a new reward redistribution algorithm is +introduced to mitigate training volatility in the absence of a fixed anchor +reward. Our method demonstrates superior performance over several leading +baselines in long-term scenarios (extending up to 5000 steps), where +conventional RL algorithms falter. Furthermore, we conduct a thorough analysis +of how expert preferences affect the outcomes. + +
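+
+ A rough sketch (assumed, not the authors' exact algorithm) of the core idea:
+standard Elo updates are applied to trajectories whenever the expert prefers
+one trajectory over another, and the resulting ratings serve as
+trajectory-level rewards. The K-factor and base rating are illustrative
+defaults.
+
+```python
+def update_elo(ratings, winner, loser, k=32.0):
+    """Standard Elo update after the expert prefers `winner` over `loser`."""
+    expected_win = 1.0 / (1.0 + 10 ** ((ratings[loser] - ratings[winner]) / 400.0))
+    ratings[winner] += k * (1.0 - expected_win)
+    ratings[loser] -= k * (1.0 - expected_win)
+
+# Trajectories are identified by keys and all start from a base rating.
+ratings = {"traj_a": 1000.0, "traj_b": 1000.0, "traj_c": 1000.0}
+preferences = [("traj_a", "traj_b"), ("traj_c", "traj_b"), ("traj_a", "traj_c")]
+for winner, loser in preferences:
+    update_elo(ratings, winner, loser)
+print(ratings)  # higher rating -> larger trajectory-level reward
+```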
+
+
+
+
+ + ☆ Bringing the RT-1-X Foundation Model to a SCARA robot + + +
+ Traditional robotic systems require specific training data for each task, +environment, and robot form. While recent advancements in machine learning have +enabled models to generalize across new tasks and environments, the challenge +of adapting these models to entirely new settings remains largely unexplored. +This study addresses this by investigating the generalization capabilities of +the RT-1-X robotic foundation model to a type of robot unseen during its +training: a SCARA robot from UMI-RTX. + Initial experiments reveal that RT-1-X does not generalize zero-shot to the +unseen type of robot. However, fine-tuning of the RT-1-X model by demonstration +allows the robot to learn a pickup task which was part of the foundation model +(but learned for another type of robot). When the robot is presented with an +object that is included in the foundation model but not in the fine-tuning +dataset, it demonstrates that only the skill, but not the object-specific +knowledge, has been transferred. + +
+
+ comment: 14 pages, submitted to the joint Artificial Intelligence & Machine + Learning conference for Belgium, Netherlands & Luxembourg (BNAIC/BeNeLearn) +
+
+
+
+
+ + ☆ LLM Detectors Still Fall Short of Real World: Case of LLM-Generated + Short News-Like Posts EMNLP + + +
+ With the emergence of widely available powerful Large Language Models (LLMs),
+disinformation generated by LLMs has become a major concern. Historically, LLM
+detectors have been touted as a solution, but their effectiveness in the real
+world is still to be proven. In this paper, we focus on an important setting in
+information operations -- short news-like posts generated by moderately
+sophisticated attackers.
+ We demonstrate that existing LLM detectors, whether zero-shot or
+purpose-trained, are not ready for real-world use in that setting. All tested
+zero-shot detectors perform inconsistently with prior benchmarks and are highly
+vulnerable to sampling temperature increase, a trivial attack absent from
+recent benchmarks. A purpose-trained detector generalizing across LLMs and
+unseen attacks can be developed, but it fails to generalize to new
+human-written texts.
+ We argue that the former indicates domain-specific benchmarking is needed,
+while the latter suggests a trade-off between adversarial evasion resilience
+and overfitting to the reference human text, both of which need to be
+evaluated in benchmarks but currently are not. We believe these findings call
+for a reconsideration of current LLM detector benchmarking approaches, and we
+provide a dynamically extensible benchmark to support this
+(https://github.com/Reliable-Information-Lab-HEVS/dynamic_llm_detector_benchmark).
+
+
+
+ comment: 20 pages, 7 tables, 13 figures, under consideration for EMNLP +
+
+
+
+
+ + ☆ Interpretable mixture of experts for time series prediction under + recurrent and non-recurrent conditions + + +
+ Non-recurrent conditions caused by incidents are different from recurrent +conditions that follow periodic patterns. Existing traffic speed prediction +studies are incident-agnostic and use one single model to learn all possible +patterns from these drastically diverse conditions. This study proposes a novel +Mixture of Experts (MoE) model to improve traffic speed prediction under two +separate conditions, recurrent and non-recurrent (i.e., with and without +incidents). The MoE leverages separate recurrent and non-recurrent expert +models (Temporal Fusion Transformers) to capture the distinct patterns of each +traffic condition. Additionally, we propose a training pipeline for +non-recurrent models to remedy the limited data issues. To train our model, +multi-source datasets, including traffic speed, incident reports, and weather +data, are integrated and processed to be informative features. Evaluations on a +real road network demonstrate that the MoE achieves lower errors compared to +other benchmark algorithms. The model predictions are interpreted in terms of +temporal dependencies and variable importance in each condition separately to +shed light on the differences between recurrent and non-recurrent conditions. + +
+
+
+
+
+ + ☆ Tensor network square root Kalman filter for online Gaussian process + regression + + +
+ The state-of-the-art tensor network Kalman filter lifts the curse of +dimensionality for high-dimensional recursive estimation problems. However, the +required rounding operation can cause filter divergence due to the loss of +positive definiteness of covariance matrices. We solve this issue by +developing, for the first time, a tensor network square root Kalman filter, and +apply it to high-dimensional online Gaussian process regression. In our +experiments, we demonstrate that our method is equivalent to the conventional +Kalman filter when choosing a full-rank tensor network. Furthermore, we apply +our method to a real-life system identification problem where we estimate +$4^{14}$ parameters on a standard laptop. The estimated model outperforms the +state-of-the-art tensor network Kalman filter in terms of prediction accuracy +and uncertainty quantification. + +
+
+
+
+
+ + ☆ In Search of Trees: Decision-Tree Policy Synthesis for Black-Box Systems + via Search + + +
+ Decision trees, owing to their interpretability, are attractive as control +policies for (dynamical) systems. Unfortunately, constructing, or synthesising, +such policies is a challenging task. Previous approaches do so by imitating a +neural-network policy, approximating a tabular policy obtained via formal +synthesis, employing reinforcement learning, or modelling the problem as a +mixed-integer linear program. However, these works may require access to a +hard-to-obtain accurate policy or a formal model of the environment (within +reach of formal synthesis), and may not provide guarantees on the quality or +size of the final tree policy. In contrast, we present an approach to +synthesise optimal decision-tree policies given a black-box environment and +specification, and a discretisation of the tree predicates, where optimality is +defined with respect to the number of steps to achieve the goal. Our approach +is a specialised search algorithm which systematically explores the +(exponentially large) space of decision trees under the given discretisation. +The key component is a novel pruning mechanism that significantly reduces the +search space. Our approach represents a conceptually novel way of synthesising +small decision-tree policies with optimality guarantees even for black-box +environments with black-box specifications. + +
+
+ comment: 8 pages main text incl. references, 1 page appendix +
+
+
+
+
+ + ☆ SpinMultiNet: Neural Network Potential Incorporating Spin Degrees of + Freedom with Multi-Task Learning + + +
+ Neural Network Potentials (NNPs) have attracted significant attention as a +method for accelerating density functional theory (DFT) calculations. However, +conventional NNP models typically do not incorporate spin degrees of freedom, +limiting their applicability to systems where spin states critically influence +material properties, such as transition metal oxides. This study introduces +SpinMultiNet, a novel NNP model that integrates spin degrees of freedom through +multi-task learning. SpinMultiNet achieves accurate predictions without relying +on correct spin values obtained from DFT calculations. Instead, it utilizes +initial spin estimates as input and leverages multi-task learning to optimize +the spin latent representation while maintaining both $E(3)$ and time-reversal +equivariance. Validation on a dataset of transition metal oxides demonstrates +the high predictive accuracy of SpinMultiNet. The model successfully reproduces +the energy ordering of stable spin configurations originating from +superexchange interactions and accurately captures the rhombohedral distortion +of the rocksalt structure. These results pave the way for new possibilities in +materials simulations that consider spin degrees of freedom, promising future +applications in large-scale simulations of various material systems, including +magnetic materials. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Dual-TSST: A Dual-Branch Temporal-Spectral-Spatial Transformer Model for + EEG Decoding + + +
+ The decoding of electroencephalography (EEG) signals provides convenient
+access to user intentions, which plays an important role in the field of
+human-machine interaction. To effectively extract sufficient characteristics of
+the multichannel EEG, a novel decoding architecture network with a dual-branch
+temporal-spectral-spatial transformer (Dual-TSST) is proposed in this study.
+Specifically, by utilizing convolutional neural networks (CNNs) on different
+branches, the proposed processing network first extracts the temporal-spatial
+features of the original EEG and the temporal-spectral-spatial features of
+time-frequency domain data converted by wavelet transformation, respectively.
+These perceived features are then integrated by a feature fusion block, serving
+as the input of the transformer to capture the global long-range dependencies
+entailed in the non-stationary EEG, and are classified via global average
+pooling and multi-layer perceptron blocks. To evaluate the efficacy of the
+proposed approach, experiments are conducted on three publicly available
+datasets, BCI IV 2a, BCI IV 2b, and SEED, with head-to-head comparisons against
+more than ten state-of-the-art methods. As a result, the proposed Dual-TSST
+performs superiorly across tasks, achieving promising average classification
+accuracies of 80.67% on BCI IV 2a, 88.64% on BCI IV 2b, and 96.65% on SEED.
+Extensive ablation experiments between Dual-TSST and comparative baseline
+models also show that each module of the proposed method contributes to the
+enhanced decoding performance. This study provides a new approach to
+high-performance EEG decoding and has great potential for future
+CNN-Transformer based applications.
+
+
+
+
+
+
+ + ☆ DiffGrad for Physics-Informed Neural Networks + + +
+ Physics-Informed Neural Networks (PINNs) are regarded as state-of-the-art
+tools for addressing highly nonlinear problems based on partial differential
+equations. Despite their broad range of applications, PINNs encounter several
+performance challenges, including issues related to efficiency, minimization of
+computational cost, and enhancement of accuracy. Burgers' equation, a
+fundamental equation in fluid dynamics that is extensively used in PINNs,
+yields variable results with the Adam optimizer, which does not account for
+past gradients. This paper introduces a novel strategy for solving Burgers'
+equation by incorporating DiffGrad with PINNs, a method that leverages the
+difference between current and immediately preceding gradients to enhance
+performance. A comprehensive computational analysis is conducted using
+optimizers such as Adam, Adamax, RMSprop, and DiffGrad to evaluate and compare
+their effectiveness. Our approach includes visualizing the solutions over space
+at various time intervals to demonstrate the accuracy of the network. The
+results show that DiffGrad not only improves the accuracy of the solution but
+also reduces training time compared to the other optimizers.
+
+
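+
+ A simplified NumPy sketch of the DiffGrad idea (not the paper's code): the
+update follows Adam-style moment estimates, but the step is scaled by a
+friction coefficient computed from the change between the previous and current
+gradients. The `grad_of_pinn_loss` call in the usage comment is a hypothetical
+placeholder.
+
+```python
+import numpy as np
+
+def diffgrad_step(theta, grad, state, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
+    """One simplified DiffGrad update on parameter vector `theta`."""
+    t = state["t"] + 1
+    m = betas[0] * state["m"] + (1 - betas[0]) * grad
+    v = betas[1] * state["v"] + (1 - betas[1]) * grad ** 2
+    m_hat = m / (1 - betas[0] ** t)
+    v_hat = v / (1 - betas[1] ** t)
+    # Friction coefficient: near 1 for changing gradients, 0.5 for flat ones.
+    xi = 1.0 / (1.0 + np.exp(-np.abs(state["prev_grad"] - grad)))
+    theta = theta - lr * xi * m_hat / (np.sqrt(v_hat) + eps)
+    state.update(m=m, v=v, prev_grad=grad, t=t)
+    return theta
+
+state = {"m": 0.0, "v": 0.0, "prev_grad": 0.0, "t": 0}
+theta = np.array([1.0, -2.0])
+# theta = diffgrad_step(theta, grad_of_pinn_loss(theta), state)  # hypothetical loss
+```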
+
+ comment: 20 pages, 14 figures +
+
+
+
+
+ + ☆ Preserving Empirical Probabilities in BERT for Small-sample Clinical + Entity Recognition + + +
+ Named Entity Recognition (NER) encounters the challenge of unbalanced labels,
+where certain entity types are overrepresented while others are
+underrepresented in real-world datasets. This imbalance can lead to biased
+models that perform poorly on minority entity classes, impeding accurate and
+equitable entity recognition. This paper explores the effects of unbalanced
+entity labels on BERT-based pre-trained models. We analyze the different
+mechanisms of loss calculation and loss propagation for the task of token
+classification on randomized datasets. Then we propose ways to improve token
+classification for the highly imbalanced task of clinical entity recognition.
+
+
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Robust Q-Learning under Corrupted Rewards + + +
+ Recently, there has been a surge of interest in analyzing the non-asymptotic +behavior of model-free reinforcement learning algorithms. However, the +performance of such algorithms in non-ideal environments, such as in the +presence of corrupted rewards, is poorly understood. Motivated by this gap, we +investigate the robustness of the celebrated Q-learning algorithm to a +strong-contamination attack model, where an adversary can arbitrarily perturb a +small fraction of the observed rewards. We start by proving that such an attack +can cause the vanilla Q-learning algorithm to incur arbitrarily large errors. +We then develop a novel robust synchronous Q-learning algorithm that uses +historical reward data to construct robust empirical Bellman operators at each +time step. Finally, we prove a finite-time convergence rate for our algorithm +that matches known state-of-the-art bounds (in the absence of attacks) up to a +small inevitable $O(\varepsilon)$ error term that scales with the adversarial +corruption fraction $\varepsilon$. Notably, our results continue to hold even +when the true reward distributions have infinite support, provided they admit +bounded second moments. + +
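+
+ As a loose illustration only: one simple way to build a robust reward
+estimate from historical rewards is a trimmed mean, which is then plugged into
+the usual Q-learning target. This is an assumption for exposition, not the
+paper's exact robust empirical Bellman operator; `reward_history` is a
+hypothetical dict mapping (state, action) to the list of observed rewards.
+
+```python
+import numpy as np
+
+def trimmed_mean(values, eps=0.1):
+    """Drop the eps-fraction of smallest and largest rewards, then average."""
+    v = np.sort(np.asarray(values, dtype=float))
+    k = int(np.floor(eps * len(v)))
+    v = v[k:len(v) - k] if len(v) > 2 * k else v
+    return v.mean()
+
+def robust_q_update(Q, s, a, s_next, reward_history, alpha=0.1, gamma=0.99):
+    """Q-learning step using a robust estimate of the reward for (s, a)."""
+    r_hat = trimmed_mean(reward_history[(s, a)])
+    target = r_hat + gamma * np.max(Q[s_next])
+    Q[s, a] += alpha * (target - Q[s, a])
+```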
+
+ comment: Accepted to the Decision and Control Conference (CDC) 2024 +
+
+
+
+
+ + ☆ State-space models are accurate and efficient neural operators for + dynamical systems + + +
+ Physics-informed machine learning (PIML) has emerged as a promising +alternative to classical methods for predicting dynamical systems, offering +faster and more generalizable solutions. However, existing models, including +recurrent neural networks (RNNs), transformers, and neural operators, face +challenges such as long-time integration, long-range dependencies, chaotic +dynamics, and extrapolation, to name a few. To this end, this paper introduces +state-space models implemented in Mamba for accurate and efficient dynamical +system operator learning. Mamba addresses the limitations of existing +architectures by dynamically capturing long-range dependencies and enhancing +computational efficiency through reparameterization techniques. To extensively +test Mamba and compare against another 11 baselines, we introduce several +strict extrapolation testbeds that go beyond the standard interpolation +benchmarks. We demonstrate Mamba's superior performance in both interpolation +and challenging extrapolation tasks. Mamba consistently ranks among the top +models while maintaining the lowest computational cost and exceptional +extrapolation capabilities. Moreover, we demonstrate the good performance of +Mamba for a real-world application in quantitative systems pharmacology for +assessing the efficacy of drugs in tumor growth under limited data scenarios. +Taken together, our findings highlight Mamba's potential as a powerful tool for +advancing scientific machine learning in dynamical systems modeling. (The code +will be available at +https://github.com/zheyuanhu01/State_Space_Model_Neural_Operator upon +acceptance.) + +
+
+ comment: 34 pages +
+
+
+
+
+ + ☆ FairQuant: Certifying and Quantifying Fairness of Deep Neural Networks ICSE 2025 + + +
+ We propose a method for formally certifying and quantifying individual +fairness of deep neural networks (DNN). Individual fairness guarantees that any +two individuals who are identical except for a legally protected attribute +(e.g., gender or race) receive the same treatment. While there are existing +techniques that provide such a guarantee, they tend to suffer from lack of +scalability or accuracy as the size and input dimension of the DNN increase. +Our method overcomes this limitation by applying abstraction to a symbolic +interval based analysis of the DNN followed by iterative refinement guided by +the fairness property. Furthermore, our method lifts the symbolic interval +based analysis from conventional qualitative certification to quantitative +certification, by computing the percentage of individuals whose classification +outputs are provably fair, instead of merely deciding if the DNN is fair. We +have implemented our method and evaluated it on deep neural networks trained on +four popular fairness research datasets. The experimental results show that our +method is not only more accurate than state-of-the-art techniques but also +several orders-of-magnitude faster. + +
+
+ comment: To Appear In Proceedings of the 47th IEEE/ACM International + Conference on Software Engineering (ICSE 2025) +
+
+
+
+
+ + ☆ Content Moderation by LLM: From Accuracy to Legitimacy + + +
+ One trending application of LLM (large language model) is to use it for +content moderation in online platforms. Most current studies on this +application have focused on the metric of accuracy - the extent to which LLM +makes correct decisions about content. This article argues that accuracy is +insufficient and misleading, because it fails to grasp the distinction between +easy cases and hard cases as well as the inevitable trade-offs in achieving +higher accuracy. Closer examination reveals that content moderation is a +constitutive part of platform governance, the key of which is to gain and +enhance legitimacy. Instead of making moderation decisions correct, the chief +goal of LLM is to make them legitimate. In this regard, this article proposes a +paradigm shift from the single benchmark of accuracy towards a legitimacy-based +framework of evaluating the performance of LLM moderators. The framework +suggests that for easy cases, the key is to ensure accuracy, speed and +transparency, while for hard cases, what matters is reasoned justification and +user participation. Examined under this framework, LLM's real potential in +moderation is not accuracy improvement. Rather, LLM can better contribute in +four other aspects: to conduct screening of hard cases from easy cases, to +provide quality explanations for moderation decisions, to assist human +reviewers in getting more contextual information, and to facilitate user +participation in a more interactive way. Using normative theories from law and +social sciences to critically assess the new technological application, this +article seeks to redefine LLM's role in content moderation and redirect +relevant research in this field. + +
+
+
+
+
+ + ☆ Application Research On Real-Time Perception Of Device Performance + Status + + +
+ To accurately identify the performance status of mobile devices and finely
+adjust the user experience, we study a real-time performance perception and
+evaluation method based on TOPSIS (Technique for Order Preference by Similarity
+to Ideal Solution) combined with the entropy weighting method and time-series
+modelling. After collecting the performance characteristics of various mobile
+devices, a device performance profile is fitted using PCA (principal component
+analysis) dimensionality reduction and feature-engineering methods such as
+descriptive time-series analysis. The ability of the performance features and
+profiles to describe the real-time performance status of devices is then
+studied by applying the TOPSIS method with multi-level weighting. A time-series
+model is constructed for the objectively weighted feature set, providing
+performance-status perception at multiple sensitivities (real-time, short-term,
+long-term) and yielding both real-time performance evaluation data and
+long-term, stable performance predictions. Finally, the usability of the method
+is verified by configuring dynamic A/B experiments and overlaying fine-grained
+power-reduction strategies, and the accuracy of device performance-status
+identification and prediction is compared across variants of the profile
+features, including dimensionality-reduced time-series modelling, the TOPSIS
+method with entropy weighting, subjective weighting, and the HMA method. The
+results show that accurate real-time performance perception can greatly enhance
+business value, demonstrating the practical effectiveness and forward-looking
+significance of this research.
+
+
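+
+ Since the paper's pipeline is not public, below is only a compact generic
+sketch of entropy-weighted TOPSIS scoring: rows are device snapshots (time
+points), columns are performance features, and all criteria are assumed to be
+positive benefit criteria. The example matrix is made up.
+
+```python
+import numpy as np
+
+def entropy_weights(X):
+    """Objective criterion weights from the Shannon entropy of each column."""
+    P = np.clip(X / X.sum(axis=0, keepdims=True), 1e-12, None)
+    e = -(P * np.log(P)).sum(axis=0) / np.log(len(X))
+    d = 1.0 - e
+    return d / d.sum()
+
+def topsis_scores(X, weights):
+    """Closeness of each row to the ideal solution (higher = better status)."""
+    R = X / np.sqrt((X ** 2).sum(axis=0, keepdims=True))   # vector normalisation
+    V = R * weights
+    best, worst = V.max(axis=0), V.min(axis=0)
+    d_best = np.sqrt(((V - best) ** 2).sum(axis=1))
+    d_worst = np.sqrt(((V - worst) ** 2).sum(axis=1))
+    return d_worst / (d_best + d_worst)
+
+X = np.array([[0.9, 0.7, 0.8],      # rows: time points, columns: features
+              [0.4, 0.9, 0.5],
+              [0.6, 0.3, 0.7]])
+print(topsis_scores(X, entropy_weights(X)))
+```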
+
+
+
+
+ + ☆ xLAM: A Family of Large Action Models to Empower AI Agent Systems + + +
+ Autonomous agents powered by large language models (LLMs) have attracted +significant research interest. However, the open-source community faces many +challenges in developing specialized models for agent tasks, driven by the +scarcity of high-quality agent datasets and the absence of standard protocols +in this area. We introduce and publicly release xLAM, a series of large action +models designed for AI agent tasks. The xLAM series includes five models with +both dense and mixture-of-expert architectures, ranging from 1B to 8x22B +parameters, trained using a scalable, flexible pipeline that unifies, augments, +and synthesizes diverse datasets to enhance AI agents' generalizability and +performance across varied environments. Our experimental results demonstrate +that xLAM consistently delivers exceptional performance across multiple agent +ability benchmarks, notably securing the 1st position on the Berkeley +Function-Calling Leaderboard, outperforming GPT-4, Claude-3, and many other +models in terms of tool use. By releasing the xLAM series, we aim to advance +the performance of open-source LLMs for autonomous AI agents, potentially +accelerating progress and democratizing access to high-performance models for +agent tasks. Models are available at +https://huggingface.co/collections/Salesforce/xlam-models-65f00e2a0a63bbcd1c2dade4 + +
+
+ comment: Technical report for the Salesforce xLAM model series +
+
+
+
+
+ + ☆ Bi-capacity Choquet Integral for Sensor Fusion with Label Uncertainty + + +
+ Sensor fusion combines data from multiple sensor sources to improve +reliability, robustness, and accuracy of data interpretation. The Fuzzy +Integral (FI), in particular, the Choquet integral (ChI), is often used as a +powerful nonlinear aggregator for fusion across multiple sensors. However, +existing supervised ChI learning algorithms typically require precise training +labels for each input data point, which can be difficult or impossible to +obtain. Additionally, prior work on ChI fusion is often based only on the +normalized fuzzy measures, which bounds the fuzzy measure values between [0, +1]. This can be limiting in cases where the underlying scales of input data +sources are bipolar (i.e., between [-1, 1]). To address these challenges, this +paper proposes a novel Choquet integral-based fusion framework, named Bi-MIChI +(pronounced "bi-mi-kee"), which uses bi-capacities to represent the +interactions between pairs of subsets of the input sensor sources on a bi-polar +scale. This allows for extended non-linear interactions between the sensor +sources and can lead to interesting fusion results. Bi-MIChI also addresses +label uncertainty through Multiple Instance Learning, where training labels are +applied to "bags" (sets) of data instead of per-instance. Our proposed Bi-MIChI +framework shows effective classification and detection performance on both +synthetic and real-world experiments for sensor fusion with label uncertainty. +We also provide detailed analyses on the behavior of the fuzzy measures to +demonstrate our fusion process. + +
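+
+ For context, a small sketch of the ordinary discrete Choquet integral with
+respect to a fuzzy measure (the paper's bi-capacity extension to the [-1, 1]
+scale is not reproduced here). The sensor names and measure values are made up.
+
+```python
+def choquet_integral(h, mu):
+    """Discrete Choquet integral of sensor outputs h (dict source -> value)
+    with respect to a fuzzy measure mu (dict frozenset -> value, mu(all) = 1)."""
+    sources = sorted(h, key=h.get)                 # ascending by sensor output
+    total, prev = 0.0, 0.0
+    for i, s in enumerate(sources):
+        upper_set = frozenset(sources[i:])         # sources with output >= h[s]
+        total += (h[s] - prev) * mu[upper_set]
+        prev = h[s]
+    return total
+
+h = {"s1": 0.2, "s2": 0.7, "s3": 0.5}
+mu = {frozenset({"s1", "s2", "s3"}): 1.0,
+      frozenset({"s2", "s3"}): 0.8,
+      frozenset({"s2"}): 0.5}                      # only the needed subsets
+print(choquet_integral(h, mu))                     # 0.2*1.0 + 0.3*0.8 + 0.2*0.5
+```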
+
+ comment: 10 pages, 7 figures, 7 tables; Accepted to 2024 FUZZ-IEEE and + presented at 2024 IEEE WCCI; Code available at + https://github.com/hvak/Bi-MIChI +
+
+
+
+
+ + ☆ Pricing American Options using Machine Learning Algorithms + + +
+ This study investigates the application of machine learning algorithms,
+particularly in the context of pricing American options using Monte Carlo
+simulations. Traditional models, such as the Black-Scholes-Merton framework,
+often fail to adequately address the complexities of American options, which
+include the ability for early exercise and non-linear payoff structures. By
+leveraging Monte Carlo methods in conjunction with the Least Squares Method
+(LSM), machine learning is applied with the aim of improving the accuracy and
+efficiency of option pricing. The study evaluates several machine learning
+models, including neural networks and decision trees, highlighting their
+potential to outperform traditional approaches. The results from applying
+machine learning algorithms within LSM indicate that integrating machine
+learning with Monte Carlo simulations can enhance pricing accuracy and provide
+more robust predictions, offering significant insights into quantitative
+finance by merging classical financial theories with modern computational
+techniques. The dataset was split into features and the target variable
+representing bid prices, with an 80-20 train-validation split. LSTM and GRU
+models were constructed using TensorFlow's Keras API, each with four hidden
+layers of 200 neurons and an output layer for bid price prediction, optimized
+with the Adam optimizer and MSE loss function. The GRU model outperformed the
+LSTM model across all evaluated metrics, demonstrating lower mean absolute
+error, mean squared error, and root mean squared error, along with greater
+stability and efficiency in training.
+
+
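+
+ A Keras sketch loosely reconstructing the GRU architecture described above
+(four hidden layers of 200 units, Adam, MSE). The input window length, feature
+count, and the choice of stacked GRU layers for the "hidden layers" are
+assumptions, since the abstract does not specify them.
+
+```python
+from tensorflow.keras import layers, models
+
+def build_gru_model(timesteps, n_features):
+    """GRU regressor with four 200-unit hidden layers and one bid-price output."""
+    model = models.Sequential([
+        layers.Input(shape=(timesteps, n_features)),
+        layers.GRU(200, return_sequences=True),
+        layers.GRU(200, return_sequences=True),
+        layers.GRU(200, return_sequences=True),
+        layers.GRU(200),                 # last recurrent layer returns a vector
+        layers.Dense(1),                 # predicted bid price
+    ])
+    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
+    return model
+
+model = build_gru_model(timesteps=30, n_features=8)
+# model.fit(X_train, y_train, validation_split=0.2, epochs=50)  # 80-20 split
+```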
+
+
+
+
+ + ☆ How noise affects memory in linear recurrent networks + + +
+ The effects of noise on memory in a linear recurrent network are
+theoretically investigated. Memory is characterized by the network's ability to
+store previous inputs in its instantaneous state while the network receives
+correlated or uncorrelated noise. Two major properties are revealed: first, the
+memory reduction caused by noise is uniquely determined by the noise's power
+spectral density (PSD); second, the memory does not decrease, regardless of
+noise intensity, if the PSD belongs to a certain class of distributions
+(including power laws). The results are verified using human brain signals,
+showing good agreement.
+
+
+
+
+
+
+ + ☆ Machine learning-based algorithms for at-home respiratory disease + monitoring and respiratory assessment + + +
+ Respiratory diseases impose a significant burden on global health, with +current diagnostic and management practices primarily reliant on specialist +clinical testing. This work aims to develop machine learning-based algorithms +to facilitate at-home respiratory disease monitoring and assessment for +patients undergoing continuous positive airway pressure (CPAP) therapy. Data +were collected from 30 healthy adults, encompassing respiratory pressure, flow, +and dynamic thoraco-abdominal circumferential measurements under three +breathing conditions: normal, panting, and deep breathing. Various machine +learning models, including the random forest classifier, logistic regression, +and support vector machine (SVM), were trained to predict breathing types. The +random forest classifier demonstrated the highest accuracy, particularly when +incorporating breathing rate as a feature. These findings support the potential +of AI-driven respiratory monitoring systems to transition respiratory +assessments from clinical settings to home environments, enhancing +accessibility and patient autonomy. Future work involves validating these +models with larger, more diverse populations and exploring additional machine +learning techniques. + +
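+
+ A minimal scikit-learn sketch of the classification setup described above
+(a random forest predicting breathing type from respiratory features such as
+breathing rate). The synthetic features, labels, and hyperparameters below are
+placeholders, not the study's data.
+
+```python
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import cross_val_score
+
+# Placeholder feature matrix: e.g. pressure/flow statistics plus breathing rate.
+rng = np.random.default_rng(42)
+X = rng.normal(size=(90, 5))                          # 90 windows, 5 features
+y = rng.choice(["normal", "panting", "deep"], 90)     # breathing-type labels
+
+clf = RandomForestClassifier(n_estimators=200, random_state=0)
+print(cross_val_score(clf, X, y, cv=5).mean())        # accuracy (synthetic data)
+```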
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ InfraLib: Enabling Reinforcement Learning and Decision Making for Large + Scale Infrastructure Management + + +
+ Efficient management of infrastructure systems is crucial for economic +stability, sustainability, and public safety. However, infrastructure +management is challenging due to the vast scale of systems, stochastic +deterioration of components, partial observability, and resource constraints. +While data-driven approaches like reinforcement learning (RL) offer a promising +avenue for optimizing management policies, their application to infrastructure +has been limited by the lack of suitable simulation environments. We introduce +InfraLib, a comprehensive framework for modeling and analyzing infrastructure +management problems. InfraLib employs a hierarchical, stochastic approach to +realistically model infrastructure systems and their deterioration. It supports +practical functionality such as modeling component unavailability, cyclical +budgets, and catastrophic failures. To facilitate research, InfraLib provides +tools for expert data collection, simulation-driven analysis, and +visualization. We demonstrate InfraLib's capabilities through case studies on a +real-world road network and a synthetic benchmark with 100,000 components. + +
+
+
+
+
+ + ☆ A Scalable Matrix Visualization for Understanding Tree Ensemble + Classifiers + + +
+ The high performance of tree ensemble classifiers benefits from a large set +of rules, which, in turn, makes the models hard to understand. To improve +interpretability, existing methods extract a subset of rules for approximation +using model reduction techniques. However, by focusing on the reduced rule set, +these methods often lose fidelity and ignore anomalous rules that, despite +their infrequency, play crucial roles in real-world applications. This paper +introduces a scalable visual analysis method to explain tree ensemble +classifiers that contain tens of thousands of rules. The key idea is to address +the issue of losing fidelity by adaptively organizing the rules as a hierarchy +rather than reducing them. To ensure the inclusion of anomalous rules, we +develop an anomaly-biased model reduction method to prioritize these rules at +each hierarchical level. Synergized with this hierarchical organization of +rules, we develop a matrix-based hierarchical visualization to support +exploration at different levels of detail. Our quantitative experiments and +case studies demonstrate how our method fosters a deeper understanding of both +common and anomalous rules, thereby enhancing interpretability without +sacrificing comprehensiveness. + +
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ☆ Standing on the shoulders of giants + + +
+ Although fundamental to the advancement of Machine Learning, the classic
+evaluation metrics extracted from the confusion matrix, such as precision and
+F1, are limited. Such metrics only offer a quantitative view of the models'
+performance, without considering the complexity of the data or the quality of
+each hit. To overcome these limitations, recent research has introduced the use
+of psychometric metrics such as Item Response Theory (IRT), which allows an
+assessment at the level of latent characteristics of instances. This work
+investigates how IRT concepts can enrich a confusion matrix in order to
+identify which model is the most appropriate among options with similar
+performance. In the study carried out, IRT does not replace, but complements
+classical metrics by offering a new layer of evaluation and observation of the
+fine-grained behavior of models on specific instances. It was also observed,
+with 97% confidence, that the IRT score provides information distinct from
+that of 66% of the classical metrics analyzed.
+
+
+
+ comment: 15 pages, 8 figures, 3 tables, submitted for the BRACIS'24 conference +
+
+
+
+
+ + ☆ Non-stationary and Sparsely-correlated Multi-output Gaussian Process + with Spike-and-Slab Prior + + +
+ Multi-output Gaussian process (MGP) is commonly used as a transfer learning +method to leverage information among multiple outputs. A key advantage of MGP +is providing uncertainty quantification for prediction, which is highly +important for subsequent decision-making tasks. However, traditional MGP may +not be sufficiently flexible to handle multivariate data with dynamic +characteristics, particularly when dealing with complex temporal correlations. +Additionally, since some outputs may lack correlation, transferring information +among them may lead to negative transfer. To address these issues, this study +proposes a non-stationary MGP model that can capture both the dynamic and +sparse correlation among outputs. Specifically, the covariance functions of MGP +are constructed using convolutions of time-varying kernel functions. Then a +dynamic spike-and-slab prior is placed on correlation parameters to +automatically decide which sources are informative to the target output in the +training process. An expectation-maximization (EM) algorithm is proposed for +efficient model fitting. Both numerical studies and a real case demonstrate its +efficacy in capturing dynamic and sparse correlation structure and mitigating +negative transfer for high-dimensional time-series data. Finally, a +mountain-car reinforcement learning case highlights its potential application +in decision making problems. + +
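+ For orientation, the basic static spike-and-slab prior has the mixture form
+below; the paper places a dynamic variant of this prior on the correlation
+parameters, so this is only the standard starting point.
+```latex
+% Spike-and-slab prior on a correlation parameter w_j: with probability
+% \pi_j the parameter is drawn from a Gaussian "slab"; otherwise it is
+% pinned to zero by the "spike", switching the corresponding source off.
+p(w_j) = \pi_j \, \mathcal{N}(w_j \mid 0, \sigma^2) + (1 - \pi_j) \, \delta_0(w_j)
+```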
+
+
+
+
+ + ☆ Discovering Cyclists' Street Visual Preferences Through Multi-Source Big + Data Using Deep Inverse Reinforcement Learning + + +
+ Cycling has gained global popularity for its health benefits and positive +urban impacts. To effectively promote cycling, early studies have extensively +investigated the relationship between cycling behaviors and environmental +factors, especially cyclists' preferences when making route decisions. However, +these studies often struggle to comprehensively describe detailed cycling +procedures at a large scale due to data limitations, and they tend to overlook +the complex nature of cyclists' preferences. To address these issues, we +propose a novel framework aimed to quantify and interpret cyclists' complicated +street visual preferences from cycling records by leveraging maximum entropy +deep inverse reinforcement learning (MEDIRL) and explainable artificial +intelligence (XAI). Implemented in Bantian Sub-district, Shenzhen, we adapt +MEDIRL model for efficient estimation of cycling reward function by integrating +dockless-bike-sharing (DBS) trajectory and street view images (SVIs), which +serves as a representation of cyclists' preferences for street visual +environments during routing. In addition, we demonstrate the feasibility and +reliability of MEDIRL in discovering cyclists' street visual preferences. +Further analysis reveals the nonlinear and interactive effects of street visual +elements on cyclists' preferences, offering a holistic perspective on +streetscape design. Our proposed framework advances the understanding of +individual cycling behaviors and provides actionable insights for urban +planners to design bicycle-friendly streetscapes that prioritize cyclists' +preferences. + +
+
+ comment: 38 pages, 16 figures +
+
+
+
+
+ + ☆ Addressing the Gaps in Early Dementia Detection: A Path Towards Enhanced + Diagnostic Models through Machine Learning + + +
+ The rapid global aging trend has led to an increase in dementia cases, +including Alzheimer's disease, underscoring the urgent need for early and +accurate diagnostic methods. Traditional diagnostic techniques, such as +cognitive tests, neuroimaging, and biomarker analysis, face significant +limitations in sensitivity, accessibility, and cost, particularly in the early +stages. This study explores the potential of machine learning (ML) as a +transformative approach to enhance early dementia detection by leveraging ML +models to analyze and integrate complex multimodal datasets, including +cognitive assessments, neuroimaging, and genetic information. A comprehensive +review of existing literature was conducted to evaluate various ML models, +including supervised learning, deep learning, and advanced techniques such as +ensemble learning and transformer models, assessing their accuracy, +interpretability, and potential for clinical integration. The findings indicate +that while ML models show significant promise in improving diagnostic precision +and enabling earlier interventions, challenges remain in their +generalizability, interpretability, and ethical deployment. This research +concludes by outlining future directions aimed at enhancing the clinical +utility of ML models in dementia detection, emphasizing interdisciplinary +collaboration and ethically sound frameworks to improve early detection and +intervention strategies for Alzheimer's disease and other forms of dementia. + +
+
+
+
+
+ + ☆ Causal Temporal Representation Learning with Nonstationary Sparse + Transition + + +
+ Causal Temporal Representation Learning (Ctrl) methods aim to identify the +temporal causal dynamics of complex nonstationary temporal sequences. Despite +the success of existing Ctrl methods, they require either directly observing +the domain variables or assuming a Markov prior on them. Such requirements +limit the application of these methods in real-world scenarios when we do not +have such prior knowledge of the domain variables. To address this problem, +this work adopts a sparse transition assumption, aligned with intuitive human +understanding, and presents identifiability results from a theoretical +perspective. In particular, we explore under what conditions on the +significance of the variability of the transitions we can build a model to +identify the distribution shifts. Based on the theoretical result, we introduce +a novel framework, Causal Temporal Representation Learning with Nonstationary +Sparse Transition (CtrlNS), designed to leverage the constraints on transition +sparsity and conditional independence to reliably identify both distribution +shifts and latent factors. Our experimental evaluations on synthetic and +real-world datasets demonstrate significant improvements over existing +baselines, highlighting the effectiveness of our approach. + +
+
+
+
+
+ + ☆ Towards Autonomous Cybersecurity: An Intelligent AutoML Framework for + Autonomous Intrusion Detection CCS 2024 + + +
+ The rapid evolution of mobile networks from 5G to 6G has necessitated the +development of autonomous network management systems, such as Zero-Touch +Networks (ZTNs). However, the increased complexity and automation of these +networks have also escalated cybersecurity risks. Existing Intrusion Detection +Systems (IDSs) leveraging traditional Machine Learning (ML) techniques have +shown effectiveness in mitigating these risks, but they often require extensive +manual effort and expert knowledge. To address these challenges, this paper +proposes an Automated Machine Learning (AutoML)-based autonomous IDS framework +towards achieving autonomous cybersecurity for next-generation networks. To +achieve autonomous intrusion detection, the proposed AutoML framework automates +all critical procedures of the data analytics pipeline, including data +pre-processing, feature engineering, model selection, hyperparameter tuning, +and model ensemble. Specifically, it utilizes a Tabular Variational +Auto-Encoder (TVAE) method for automated data balancing, tree-based ML models +for automated feature selection and base model learning, Bayesian Optimization +(BO) for hyperparameter optimization, and a novel Optimized Confidence-based +Stacking Ensemble (OCSE) method for automated model ensemble. The proposed +AutoML-based IDS was evaluated on two public benchmark network security +datasets, CICIDS2017 and 5G-NIDD, and demonstrated improved performance +compared to state-of-the-art cybersecurity methods. This research marks a +significant step towards fully autonomous cybersecurity in next-generation +networks, potentially revolutionizing network security applications. + +
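+ A minimal sketch of such a pipeline, assuming scikit-learn: the paper's TVAE
+balancing, Bayesian optimization, and OCSE ensemble are replaced here by
+simpler stand-ins (class weighting, randomized search, plain stacking), so
+this only conveys the overall structure, not the authors' implementation.
+```python
+from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
+                              StackingClassifier)
+from sklearn.feature_selection import SelectFromModel
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.pipeline import Pipeline
+
+# Tree-based feature selection feeding a stacked ensemble; randomized search
+# stands in for Bayesian hyperparameter optimization.
+pipeline = Pipeline([
+    ("select", SelectFromModel(RandomForestClassifier(n_estimators=100))),
+    ("stack", StackingClassifier(
+        estimators=[("rf", RandomForestClassifier(class_weight="balanced")),
+                    ("et", ExtraTreesClassifier(class_weight="balanced"))],
+        final_estimator=LogisticRegression(max_iter=1000))),
+])
+search = RandomizedSearchCV(
+    pipeline,
+    param_distributions={"stack__rf__max_depth": [8, 16, None],
+                         "stack__et__n_estimators": [100, 300]},
+    n_iter=4, cv=3)
+# search.fit(X_train, y_train)  # X_train, y_train: labeled network-flow features
+```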
+
+ comment: Accepted to the Workshop on Autonomous Cybersecurity, ACM CCS 2024; + Code is available at Github link: + https://github.com/Western-OC2-Lab/AutonomousCyber-AutoML-based-Autonomous-Intrusion-Detection-System +
+
+
+
+
+ + ☆ GraphEx: A Graph-based Extraction Method for Advertiser Keyphrase + Recommendation + + +
+ Online sellers and advertisers are recommended keyphrases for their listed +products, which they bid on to enhance their sales. One popular paradigm that +generates such recommendations is Extreme Multi-Label Classification (XMC), +which involves tagging/mapping keyphrases to items. We outline the limitations +of using traditional item-query based tagging or mapping techniques for +keyphrase recommendations on E-Commerce platforms. We introduce GraphEx, an +innovative graph-based approach that recommends keyphrases to sellers using +extraction of token permutations from item titles. Additionally, we demonstrate +that relying on traditional metrics such as precision/recall can be misleading +in practical applications, thereby necessitating a combination of metrics to +evaluate performance in real-world scenarios. These metrics are designed to +assess the relevance of keyphrases to items and the potential for buyer +outreach. GraphEx outperforms production models at eBay, achieving the +objectives mentioned above. It supports near real-time inferencing in +resource-constrained production environments and scales effectively for +billions of items. + +
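+ The "token permutations from item titles" idea can be illustrated with a toy
+candidate generator; the real GraphEx method builds a graph over tokens and
+runs at production scale, so this sketch is conceptual only.
+```python
+from itertools import permutations
+
+def candidate_keyphrases(title: str, max_len: int = 3) -> set[str]:
+    """Enumerate short keyphrase candidates as ordered token subsets."""
+    tokens = [t.lower() for t in title.split()]
+    phrases = set()
+    for n in range(1, max_len + 1):
+        for combo in permutations(tokens, n):
+            phrases.add(" ".join(combo))
+    return phrases
+
+print(sorted(candidate_keyphrases("Nike Running Shoes"))[:5])
+```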
+
+
+
+
+ + ☆ The AdEMAMix Optimizer: Better, Faster, Older + + +
+ Momentum-based optimizers are central to a wide range of machine learning +applications. These typically rely on an Exponential Moving Average (EMA) of +gradients, which exponentially decays the contribution of older +gradients. This accounts for gradients being local linear approximations which +lose their relevance as the iterate moves along the loss landscape. This work +questions the use of a single EMA to accumulate past gradients and empirically +demonstrates how this choice can be sub-optimal: a single EMA cannot +simultaneously give a high weight to the immediate past and a non-negligible +weight to older gradients. Building on this observation, we propose AdEMAMix, a +simple modification of the Adam optimizer with a mixture of two EMAs to better +take advantage of past gradients. Our experiments on language modeling and +image classification show -- quite surprisingly -- that gradients can stay +relevant for tens of thousands of steps. They help to converge faster, and +often to lower minima: e.g., a $1.3$B parameter AdEMAMix LLM trained on $101$B +tokens performs comparably to an AdamW model trained on $197$B tokens +($+95\%$). Moreover, our method significantly slows down model forgetting +during training. Our work motivates further exploration of different types of +functions to leverage past gradients, beyond EMAs. + 
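+ A schematic single-parameter update in the spirit of the abstract's
+description is sketched below, assuming numpy; the constants (beta1, beta2,
+beta3, alpha) and the omission of bias correction are illustrative choices,
+not the paper's exact algorithm.
+```python
+import numpy as np
+
+def ademamix_like_step(p, g, state, lr=1e-3, beta1=0.9, beta2=0.999,
+                       beta3=0.9999, alpha=5.0, eps=1e-8):
+    # Fast EMA reacts to the immediate past; slow EMA retains old gradients.
+    state["m_fast"] = beta1 * state["m_fast"] + (1 - beta1) * g
+    state["m_slow"] = beta3 * state["m_slow"] + (1 - beta3) * g
+    state["v"] = beta2 * state["v"] + (1 - beta2) * g * g
+    update = (state["m_fast"] + alpha * state["m_slow"]) / (np.sqrt(state["v"]) + eps)
+    return p - lr * update
+
+state = {"m_fast": 0.0, "m_slow": 0.0, "v": 0.0}
+p = 1.0
+for _ in range(3):
+    p = ademamix_like_step(p, g=2 * p, state=state)  # gradient of p**2
+```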
+
+ comment: 38 pages, 27 figures +
+
+
+
+
+ + ☆ Generating High Dimensional User-Specific Wireless Channels using + Diffusion Models + + +
+ Deep neural network (DNN)-based algorithms are emerging as an important tool +for many physical and MAC layer functions in future wireless communication +systems, including for large multi-antenna channels. However, training such +models typically requires a large dataset of high-dimensional channel +measurements, which are very difficult and expensive to obtain. This paper +introduces a novel method for generating synthetic wireless channel data using +diffusion-based models to produce user-specific channels that accurately +reflect real-world wireless environments. Our approach employs a conditional +denoising diffusion implicit model (cDDIM) framework, effectively capturing +the relationship between user location and multi-antenna channel +characteristics. We generate synthetic high-fidelity channel samples using user +positions as conditional inputs, creating larger augmented datasets to overcome +measurement scarcity. The utility of this method is demonstrated through its +efficacy in training various downstream tasks such as channel compression and +beam alignment. Our approach significantly improves over prior methods, such as +adding noise or using generative adversarial networks (GANs), especially in +scenarios with limited initial measurements. + 
+
+
+
+
+ + ☆ A Survey on Signed Graph Embedding: Methods and Applications + + +
+ A signed graph (SG) is a graph whose edges carry sign information. An edge's +sign can be positive, negative, or neutral. Signed networks are ubiquitous in +real-world settings such as social networks, citation networks, and various +technical networks. Many network embedding models have been proposed and +developed for signed networks, for both homogeneous and heterogeneous types. SG +embedding learns low-dimensional vector representations for the nodes of a +network, which supports many network analysis tasks such as link prediction, +node classification, and community detection. In this survey, we perform a +comprehensive study of SG embedding methods and applications. We introduce the +basic theories and methods of SGs and survey the current state of the art of +signed graph embedding methods. In addition, we explore the applications of +different types of SG embedding methods in real-world scenarios. As an +application, we explore a citation network to analyze authorship networks. We +also provide source code and datasets to guide future work. Lastly, we discuss +the challenges of SG embedding and forecast various future research directions +in this field. + 
+
+
+
+
+ + ☆ Asynchronous Stochastic Approximation and Average-Reward Reinforcement + Learning + + +
+ This paper studies asynchronous stochastic approximation (SA) algorithms and +their application to reinforcement learning in semi-Markov decision processes +(SMDPs) with an average-reward criterion. We first extend Borkar and Meyn's +stability proof method to accommodate more general noise conditions, leading to +broader convergence guarantees for asynchronous SA algorithms. Leveraging these +results, we establish the convergence of an asynchronous SA analogue of +Schweitzer's classical relative value iteration algorithm, RVI Q-learning, for +finite-space, weakly communicating SMDPs. Furthermore, to fully utilize the SA +results in this application, we introduce new monotonicity conditions for +estimating the optimal reward rate in RVI Q-learning. These conditions +substantially expand the previously considered algorithmic framework, and we +address them with novel proof arguments in the stability and convergence +analysis of RVI Q-learning. + +
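+ For intuition, a tabular RVI Q-learning update for an average-reward MDP is
+sketched below; the paper treats the more general SMDP setting with
+asynchronous updates and new conditions for estimating the reward rate, so
+this only conveys the basic recursion. Taking f(Q) as the value of a fixed
+reference state-action pair is one common choice, assumed here.
+```python
+import numpy as np
+
+def rvi_q_update(Q, s, a, r, s_next, alpha=0.1, ref=(0, 0)):
+    f_Q = Q[ref]                              # estimate of the optimal reward rate
+    td = r - f_Q + Q[s_next].max() - Q[s, a]  # relative temporal difference
+    Q[s, a] += alpha * td
+    return Q
+
+Q = np.zeros((5, 2))                          # 5 states, 2 actions
+Q = rvi_q_update(Q, s=1, a=0, r=1.0, s_next=2)
+```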
+
+ comment: The materials in this paper extend the authors' results from 2023, + reported in arXiv:2408.16262 and arXiv:2312.15091. This paper incorporates + and subsumes the results of arXiv:2312.15091 and serves as Part II of + arXiv:2408.16262 +
+
+
+
+
+ + ☆ WaterMAS: Sharpness-Aware Maximization for Neural Network Watermarking + + +
+ Nowadays, deep neural networks are used for solving complex tasks in several +critical applications, and protecting both their integrity and intellectual +property rights (IPR) has become of utmost importance. To this end, we advance +WaterMAS, a substitutive, white-box neural network watermarking method that +improves the trade-off among robustness, imperceptibility, and computational +complexity, while making provisions for increased data payload and security. +WaterMAS insertion keeps the watermarked weights unchanged while sharpening +their underlying gradient space. The robustness is thus ensured by limiting the +attack's strength: even small alterations of the watermarked weights would +impact the model's performance. The imperceptibility is ensured by inserting +the watermark during the training process. The relationship among the WaterMAS +data payload, imperceptibility, and robustness properties is discussed. The +secret key is represented by the positions of the weights conveying the +watermark, randomly chosen through multiple layers of the model. The security +is evaluated by investigating the case in which an attacker would intercept the +key. The experimental validations consider 5 models and 2 tasks (VGG16, +ResNet18, MobileNetV3, SwinT for CIFAR10 image classification, and DeepLabV3 +for Cityscapes image segmentation) as well as 4 types of attacks (Gaussian +noise addition, pruning, fine-tuning, and quantization). The code will be +released open-source upon acceptance of the article. + 
+
+
+
+
+ + ☆ On the Convergence Rates of Federated Q-Learning across Heterogeneous + Environments + + +
+ Large-scale multi-agent systems are often deployed across wide geographic +areas, where agents interact with heterogeneous environments. There is an +emerging interest in understanding the role of heterogeneity in the performance +of the federated versions of classic reinforcement learning algorithms. In this +paper, we study synchronous federated Q-learning, which aims to learn an +optimal Q-function by having $K$ agents average their local Q-estimates per $E$ +iterations. We observe an interesting phenomenon in the convergence speeds in +terms of $K$ and $E$. Similar to the homogeneous environment settings, there is +a linear speed-up concerning $K$ in reducing the errors that arise from +sampling randomness. Yet, in sharp contrast to the homogeneous settings, $E>1$ +leads to significant performance degradation. Specifically, we provide a +fine-grained characterization of the error evolution in the presence of +environmental heterogeneity, showing that the errors decay to zero as the number +of iterations $T$ increases. The slow convergence for $E>1$ turns out to be +fundamental rather than an artifact of our analysis. We prove that, for a wide range of +stepsizes, the $\ell_{\infty}$ norm of the error cannot decay faster than +$\Theta (E/T)$. In addition, our experiments demonstrate that the convergence +exhibits an interesting two-phase phenomenon. For any given stepsize, there is +a sharp phase-transition of the convergence: the error decays rapidly in the +beginning yet later bounces up and stabilizes. Provided that the +phase-transition time can be estimated, choosing different stepsizes for the +two phases leads to faster overall convergence. + 
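+ The protocol being analyzed can be summarized in a few lines; the environment
+interface below (env.sample_transition) is a placeholder, and this skeleton
+makes no attempt to reproduce the paper's analysis.
+```python
+import numpy as np
+
+def federated_q_learning(envs, n_rounds, E, n_states, n_actions,
+                         alpha=0.1, gamma=0.9):
+    """K agents run E local Q-learning steps, then the server averages."""
+    Q = np.zeros((n_states, n_actions))
+    for _ in range(n_rounds):
+        local_Qs = []
+        for env in envs:                      # each env may be heterogeneous
+            Qk = Q.copy()
+            for _ in range(E):
+                s, a, r, s_next = env.sample_transition(Qk)  # assumed API
+                Qk[s, a] += alpha * (r + gamma * Qk[s_next].max() - Qk[s, a])
+            local_Qs.append(Qk)
+        Q = np.mean(local_Qs, axis=0)         # averaging every E iterations
+    return Q
+```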
+
+
+
+
+ + ☆ Understanding Fairness Metrics in Recommender Systems: A Healthcare + Perspective + + +
+ Fairness in AI-driven decision-making systems has become a critical concern, +especially when these systems directly affect human lives. This paper explores +the public's comprehension of fairness in healthcare recommendations. We +conducted a survey where participants selected from four fairness metrics -- +Demographic Parity, Equal Accuracy, Equalized Odds, and Positive Predictive +Value -- across different healthcare scenarios to assess their understanding of +these concepts. Our findings reveal that fairness is a complex and often +misunderstood concept, with a generally low level of public understanding +regarding fairness metrics in recommender systems. This study highlights the +need for enhanced information and education on algorithmic fairness to support +informed decision-making in using these systems. Furthermore, the results +suggest that a one-size-fits-all approach to fairness may be insufficient, +pointing to the importance of context-sensitive designs in developing equitable +AI systems. + +
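+ For readers who want the survey's four metrics in concrete terms, standard
+definitions for the binary-group, binary-prediction case are sketched below;
+these are textbook formulas, not the study's survey instrument.
+```python
+import numpy as np
+
+def demographic_parity_gap(yhat, g):
+    # difference in positive-prediction rates between the two groups
+    return abs(yhat[g == 0].mean() - yhat[g == 1].mean())
+
+def equal_accuracy_gap(y, yhat, g):
+    # difference in accuracy between the two groups
+    return abs((yhat[g == 0] == y[g == 0]).mean() - (yhat[g == 1] == y[g == 1]).mean())
+
+def equalized_odds_gap(y, yhat, g):
+    # largest gap in TPR (y == 1) or FPR (y == 0) across groups
+    gaps = []
+    for label in (0, 1):
+        r0 = yhat[(g == 0) & (y == label)].mean()
+        r1 = yhat[(g == 1) & (y == label)].mean()
+        gaps.append(abs(r0 - r1))
+    return max(gaps)
+
+def positive_predictive_value(y, yhat, g, group):
+    # fraction of predicted positives in the group that are true positives
+    mask = (g == group) & (yhat == 1)
+    return y[mask].mean()
+```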
+
+ comment: Accepted to the 18th ACM Conference on Recommender Systems +
+
+
+
+
+ + ☆ Active Sampling of Interpolation Points to Identify Dominant Subspaces + for Model Reduction + + +
+ Model reduction is an active research field to construct low-dimensional +surrogate models of high fidelity to accelerate engineering design cycles. In +this work, we investigate model reduction for linear structured systems using +dominant reachable and observable subspaces. When the training set $-$ +containing all possible interpolation points $-$ is large, these subspaces +can be determined by solving many large-scale linear systems. However, for +high-fidelity models, this easily becomes computationally intractable. To +circumvent this issue, in this work, we propose an active sampling strategy to +sample only a few points from the given training set, which can allow us to +estimate those subspaces accurately. To this end, we formulate the +identification of the subspaces as the solution of the generalized Sylvester +equations, guiding us to select the most relevant samples from the training set +to achieve our goals. Consequently, we construct solutions of the matrix +equations in low-rank forms, which encode subspace information. We extensively +discuss computational aspects and efficient usage of the low-rank factors in +the process of obtaining reduced-order models. We illustrate the proposed +active sampling scheme to obtain reduced-order models via dominant reachable +and observable subspaces and present its comparison with the method where all +the points from the training set are taken into account. It is shown that the +active sampling strategy can provide a $17$x speed-up without sacrificing any +noticeable accuracy. + 
+
+ comment: 20 pages, 9 figures +
+
+
+
+
+ + ☆ Overfitting Behaviour of Gaussian Kernel Ridgeless Regression: Varying + Bandwidth or Dimensionality + + +
+ We consider the overfitting behavior of minimum norm interpolating solutions +of Gaussian kernel ridge regression (i.e. kernel ridgeless regression), when +the bandwidth or input dimension varies with the sample size. For fixed +dimensions, we show that even with varying or tuned bandwidth, the ridgeless +solution is never consistent and, at least with large enough noise, always +worse than the null predictor. For increasing dimension, we give a generic +characterization of the overfitting behavior for any scaling of the dimension +with sample size. We use this to provide the first example of benign +overfitting using the Gaussian kernel with sub-polynomial scaling dimension. +All our results are under the Gaussian universality ansatz and the +(non-rigorous) risk predictions in terms of the kernel eigenstructure. + +
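+ For reference, the estimator under study is the minimum-norm (ridgeless)
+kernel interpolant with a Gaussian kernel, written below in standard notation;
+the bandwidth and input dimension are exactly the quantities the paper varies.
+```latex
+% Minimum-norm interpolant of training data (X, y) with a Gaussian kernel of
+% bandwidth \tau:
+\hat f(x) = k(x, X)\, K(X, X)^{-1} y,
+\qquad k(x, x') = \exp\!\left(-\frac{\lVert x - x' \rVert^2}{2\tau^2}\right)
+```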
+
+
+
+
+ + ☆ The Influence of Faulty Labels in Data Sets on Human Pose Estimation + + +
+ In this study we provide empirical evidence demonstrating that the quality of +training data impacts model performance in Human Pose Estimation (HPE). +Inaccurate labels in widely used data sets, ranging from minor errors to severe +mislabeling, can negatively influence learning and distort performance metrics. +We perform an in-depth analysis of popular HPE data sets to show the extent and +nature of label inaccuracies. Our findings suggest that accounting for the +impact of faulty labels will facilitate the development of more robust and +accurate HPE models for a variety of real-world applications. We show improved +performance with cleansed data. + +
+
+ comment: 15 pages, 7 figures, 5 tables +
+
+
+
+
+ + ☆ Cost-Control in Display Advertising: Theory vs Practice + + +
+ In display advertising, advertisers want to achieve a marketing objective +with constraints on budget and cost-per-outcome. This is usually formulated as +an optimization problem that maximizes the total utility under constraints. The +optimization is carried out in an online fashion in the dual space - for an +incoming ad auction, a bid is placed using an optimal bidding formula, assuming +optimal values for the dual variables; based on the outcome of the previous +auctions, the dual variables are updated in an online fashion. While this +approach is theoretically sound, in practice, the dual variables are not +optimal from the beginning, but rather converge over time. Specifically, for +the cost-constraint, the convergence is asymptotic. As a result, we find that +cost-control is ineffective. In this work, we analyse the shortcomings of the +optimal bidding formula and propose a modification that deviates from the +theoretical derivation. We simulate various practical scenarios and study the +cost-control behaviors of the two algorithms. Through a large-scale evaluation +on real-world data, we show that the proposed modification reduces the cost +violations by 50%, thereby achieving a better cost-control than the theoretical +bidding formula. + 
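+ A minimal sketch of the dual-space pattern being critiqued, assuming a toy
+second-price auction stream; the exact bidding formula, the cost-per-outcome
+constraint, and the paper's proposed modification are not reproduced here.
+```python
+def run_campaign(auctions, budget, eta=1e-4):
+    """auctions: list of (predicted_value, clearing_price) pairs."""
+    lam = 0.0                                 # dual variable for the budget
+    target_rate = budget / len(auctions)
+    spend = 0.0
+    for value, price in auctions:
+        bid = value / (1.0 + lam)             # bid shaded by the dual variable
+        cost = price if bid >= price else 0.0
+        spend += cost
+        lam = max(0.0, lam + eta * (cost - target_rate))  # online dual update
+    return spend
+```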
+
+
+
+
+ + ☆ Can We Theoretically Quantify the Impacts of Local Updates on the + Generalization Performance of Federated Learning? + + +
+ Federated Learning (FL) has gained significant popularity due to its +effectiveness in training machine learning models across diverse sites without +requiring direct data sharing. While various algorithms along with their +optimization analyses have shown that FL with local updates is a +communication-efficient distributed learning framework, the generalization +performance of FL with local updates has received comparatively less attention. +This lack of investigation can be attributed to the complex interplay between +data heterogeneity and infrequent communication due to the local updates within +the FL framework. This motivates us to investigate a fundamental question in +FL: Can we quantify the impact of data heterogeneity and local updates on the +generalization performance for FL as the learning process evolves? To this end, +we conduct a comprehensive theoretical study of FL's generalization performance +using a linear model as the first step, where the data heterogeneity is +considered for both the stationary and online/non-stationary cases. By +providing closed-form expressions of the model error, we rigorously quantify +the impact of the number of the local updates (denoted as $K$) under three +settings ($K=1$, $K<\infty$, and $K=\infty$) and show how the generalization +performance evolves with the number of rounds $t$. Our investigation also +provides a comprehensive understanding of how different configurations +(including the number of model parameters $p$ and the number of training +samples $n$) contribute to the overall generalization performance, thus +shedding new insights (such as benign overfitting) for implementing FL over +networks. + +
+
+ comment: Published in MobiHoc 2024 +
+
+
+
+
+ + ☆ Latent Space Energy-based Neural ODEs + + +
+ This paper introduces a novel family of deep dynamical models designed to +represent continuous-time sequence data. This family of models generates each +data point in the time series by a neural emission model, which is a non-linear +transformation of a latent state vector. The trajectory of the latent states is +implicitly described by a neural ordinary differential equation (ODE), with the +initial state following an informative prior distribution parameterized by an +energy-based model. Furthermore, we can extend this model to disentangle +dynamic states from underlying static factors of variation, represented as +time-invariant variables in the latent space. We train the model using maximum +likelihood estimation with Markov chain Monte Carlo (MCMC) in an end-to-end +manner, without requiring additional assisting components such as an inference +network. Our experiments on oscillating systems, videos and real-world state +sequences (MuJoCo) illustrate that ODEs with the learnable energy-based prior +outperform existing counterparts, and can generalize to new dynamic +parameterization, enabling long-horizon predictions. + +
+
+
+
+
+ + ☆ Neural Entropy + + +
+ We examine the connection between deep learning and information theory +through the paradigm of diffusion models. Using well-established principles +from non-equilibrium thermodynamics we can characterize the amount of +information required to reverse a diffusive process. Neural networks store this +information and operate in a manner reminiscent of Maxwell's demon during the +generative stage. We illustrate this cycle using a novel diffusion scheme we +call the entropy matching model, wherein the information conveyed to the +network during training exactly corresponds to the entropy that must be negated +during reversal. We demonstrate that this entropy can be used to analyze the +encoding efficiency and storage capacity of the network. This conceptual +picture blends elements of stochastic optimal control, thermodynamics, +information theory, and optimal transport, and raises the prospect of applying +diffusion models as a test bench to understand neural networks. + +
+
+ comment: 37 pages + references, 11 figures +
+
+
+
+
+ + ☆ How Do Your Code LLMs Perform? Empowering Code Instruction Tuning with + High-Quality Data + + +
+ Recently, there has been a growing interest in studying how to construct +better code instruction tuning data. However, we observe that code models trained +with these datasets exhibit high performance on HumanEval but perform worse on +other benchmarks such as LiveCodeBench. Upon further investigation, we find +that many datasets suffer from severe data leakage. After cleaning up most of +the leaked data, some well-known high-quality datasets perform poorly. This +discovery reveals a new challenge: identifying which datasets genuinely qualify +as high-quality code instruction data. To address this, we propose an efficient +code data pruning strategy for selecting good samples. Our approach is based on +three dimensions: instruction complexity, response quality, and instruction +diversity. Based on our selected data, we present XCoder, a family of models +finetuned from LLaMA3. Our experiments show that XCoder achieves new +state-of-the-art performance using less training data, which verifies the +effectiveness of our data strategy. Moreover, we perform a comprehensive +analysis on the data composition and find that existing code datasets have different +characteristics according to their construction methods, which provides new +insights for future code LLMs. Our models and dataset are released at +https://github.com/banksy23/XCoder + 
+
+ comment: Working in progress +
+
+
+
+
+ + ♻ ☆ A Graph-based Adversarial Imitation Learning Framework for Reliable & + Realtime Fleet Scheduling in Urban Air Mobility + + +
+ The advent of Urban Air Mobility (UAM) presents the scope for a +transformative shift in the domain of urban transportation. However, its +widespread adoption and economic viability depend in part on the ability to +optimally schedule the fleet of aircraft across vertiports in a UAM network, +under uncertainties attributed to airspace congestion, changing weather +conditions, and varying demands. This paper presents a comprehensive +optimization formulation of the fleet scheduling problem, while also +identifying the need for alternate solution approaches, since directly solving +the resulting integer nonlinear programming problem is computationally +prohibitive for daily fleet scheduling. Previous work has shown the +effectiveness of using (graph) reinforcement learning (RL) approaches to train +real-time executable policy models for fleet scheduling. However, such policies +can often be brittle on out-of-distribution scenarios or edge cases. Moreover, +training performance also deteriorates as the complexity (e.g., number of +constraints) of the problem increases. To address these issues, this paper +presents an imitation learning approach where the RL-based policy exploits +expert demonstrations yielded by solving the exact optimization using a Genetic +Algorithm. The policy model comprises Graph Neural Network (GNN) based encoders +that embed the space of vertiports and aircraft, Transformer networks to encode +demand, passenger fare, and transport cost profiles, and a Multi-head attention +(MHA) based decoder. Expert demonstrations are used through the Generative +Adversarial Imitation Learning (GAIL) algorithm. Interfaced with a UAM +simulation environment involving 8 vertiports and 40 aircraft, and measured in +terms of the daily profit earned as the reward, the new imitative approach achieves +better mean performance and a remarkable improvement on unseen worst-case +scenarios, compared to pure RL results. + 
+
+ comment: Presented at the AIAA Aviation Forum 2024 +
+
+
+
+
+ + ♻ ☆ Physics-Informed Machine Learning Towards A Real-Time Spacecraft Thermal + Simulator + + +
+ Modeling thermal states for complex space missions, such as the surface +exploration of airless bodies, requires high computation, whether used in +ground-based analysis for spacecraft design or during onboard reasoning for +autonomous operations. For example, a finite-element thermal model with +hundreds of elements can take significant time to simulate, which makes it +unsuitable for onboard reasoning during time-sensitive scenarios such as +descent and landing, proximity operations, or in-space assembly. Further, the +lack of fast and accurate thermal modeling drives thermal designs to be more +conservative and leads to spacecraft with larger mass and higher power budgets. +The emerging paradigm of physics-informed machine learning (PIML) presents a +class of hybrid modeling architectures that address this challenge by combining +simplified physics models with machine learning (ML) models resulting in models +which maintain both interpretability and robustness. Such techniques enable +designs with reduced mass and power through onboard thermal-state estimation +and control and may lead to improved onboard handling of off-nominal states, +including unplanned down-time. The PIML model or hybrid model presented here +consists of a neural network which predicts reduced nodalizations (distribution +and size of coarse mesh) given on-orbit thermal load conditions, and +subsequently a (relatively coarse) finite-difference model operates on this +mesh to predict thermal states. We compare the computational performance and +accuracy of the hybrid model to a data-driven neural net model, and a +high-fidelity finite-difference model of a prototype Earth-orbiting small +spacecraft. The PIML based active nodalization approach provides significantly +better generalization than the neural net model and coarse mesh model, while +reducing computing cost by up to 1.7x compared to the high-fidelity model. + +
+
+ comment: Presented at the AIAA Aviation 2024 Forum +
+
+
+
+
+ + ♻ ☆ Deep Neural Implicit Representation of Accessibility for Multi-Axis + Manufacturing SP + + +
+ One of the main concerns in design and process planning for multi-axis +additive and subtractive manufacturing is collision avoidance between moving +objects (e.g., tool assemblies) and stationary objects (e.g., a part unified +with fixtures). The collision measure for various pairs of relative rigid +translations and rotations between the two pointsets can be conceptualized by a +compactly supported scalar field over the 6D non-Euclidean configuration space. +Explicit representation and computation of this field is costly in both time +and space. If we fix $O(m)$ sparsely sampled rotations (e.g., tool +orientations), computation of the collision measure field as a convolution of +indicator functions of the 3D pointsets over a uniform grid (i.e., voxelized +geometry) of resolution $O(n^3)$ via fast Fourier transforms (FFTs) scales as +in $O(mn^3 \log n)$ in time and $O(mn^3)$ in space. In this paper, we develop +an implicit representation of the collision measure field via deep neural +networks (DNNs). We show that our approach is able to accurately interpolate +the collision measure from a sparse sampling of rotations, and can represent +the collision measure field with a small memory footprint. Moreover, we show +that this representation can be efficiently updated through fine-tuning to more +efficiently train the network on multi-resolution data, as well as accommodate +incremental changes to the geometry (such as might occur in iterative processes +such as topology optimization of the part subject to CNC tool accessibility +constraints). + +
+
+ comment: Special Issue on symposium on Solid and Physical Modeling (SPM 2023) +
+
+
+
+
+ + ♻ ☆ Data Mixture Inference: What do BPE Tokenizers Reveal about their + Training Data? + + +
+ The pretraining data of today's strongest language models is opaque; in +particular, little is known about the proportions of various domains or +languages represented. In this work, we tackle a task which we call data +mixture inference, which aims to uncover the distributional make-up of training +data. We introduce a novel attack based on a previously overlooked source of +information: byte-pair encoding (BPE) tokenizers, used by the vast majority of +modern language models. Our key insight is that the ordered list of merge rules +learned by a BPE tokenizer naturally reveals information about the token +frequencies in its training data. Given a tokenizer's merge list along with +example data for each category of interest, we formulate a linear program that +solves for the proportion of each category in the tokenizer's training set. In +controlled experiments, we show that our attack recovers mixture ratios with +high precision for tokenizers trained on known mixtures of natural languages, +programming languages, and data sources. We then apply our approach to +off-the-shelf tokenizers released with recent LMs. We confirm much publicly +disclosed information about these models, and also make several new inferences: +GPT-4o and Mistral NeMo's tokenizers are much more multilingual than their +predecessors, training on 39% and 47% non-English language data, respectively; +Llama 3 extends GPT-3.5's tokenizer primarily for multilingual (48%) use; +GPT-3.5's and Claude's tokenizers are trained on predominantly code (~60%). We +hope our work sheds light on current design practices for pretraining data, and +inspires continued research into data mixture inference for LMs. + +
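+ The inverse-problem framing can be illustrated with a toy recovery of mixture
+weights from token frequencies; the paper instead derives a linear program from
+the order of BPE merge rules, so this nonnegative least-squares surrogate is
+only a conceptual stand-in.
+```python
+import numpy as np
+from scipy.optimize import nnls
+
+A = np.array([[0.6, 0.1],      # token-frequency profiles of two categories
+              [0.3, 0.2],
+              [0.1, 0.7]])
+true_w = np.array([0.25, 0.75])
+b = A @ true_w                 # frequencies observed in the tokenizer corpus
+w, _ = nnls(A, b)              # nonnegative recovery of the mixture
+w /= w.sum()                   # normalize to proportions
+print(w)                       # ~ [0.25, 0.75]
+```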
+
+ comment: new robustness experiments; new baselines; include Mistral, + Mistral-Nemo and GPT-NeoX; link to code +
+
+
+
+
+ + ♻ ☆ Evaluations of Machine Learning Privacy Defenses are Misleading CCS 2024 + + +
+ Empirical defenses for machine learning privacy forgo the provable guarantees +of differential privacy in the hope of achieving higher utility while resisting +realistic adversaries. We identify severe pitfalls in existing empirical +privacy evaluations (based on membership inference attacks) that result in +misleading conclusions. In particular, we show that prior evaluations fail to +characterize the privacy leakage of the most vulnerable samples, use weak +attacks, and avoid comparisons with practical differential privacy baselines. +In 5 case studies of empirical privacy defenses, we find that prior evaluations +underestimate privacy leakage by an order of magnitude. Under our stronger +evaluation, none of the empirical defenses we study are competitive with a +properly tuned, high-utility DP-SGD baseline (with vacuous provable +guarantees). + +
+
+ comment: Accepted at ACM CCS 2024 +
+
+
+
+
+ + ♻ ☆ Towards Neural Network based Cognitive Models of Dynamic Decision-Making + by Humans + + +
+ Modeling human cognitive processes in dynamic decision-making tasks has been +an endeavor in AI for a long time because such models can help make AI systems +more intuitive, personalized, mitigate any human biases, and enhance training +in simulation. Some initial work has attempted to utilize neural networks (and +large language models) but often assumes one common model for all humans and +aims to emulate human behavior in aggregate. However, the behavior of each +human is distinct, heterogeneous, and relies on specific past experiences in +certain tasks. For instance, consider two individuals responding to a phishing +email: one who has previously encountered and identified similar threats may +recognize it quickly, while another without such experience might fall for the +scam. In this work, we build on Instance Based Learning (IBL) that posits that +human decisions are based on similar situations encountered in the past. +However, IBL relies on simple fixed form functions to capture the mapping from +past situations to current decisions. To that end, we propose two new +attention-based neural network models to have open form non-linear functions to +model distinct and heterogeneous human decision-making in dynamic settings. We +experiment with two distinct datasets gathered from human subject experiment +data, one focusing on detection of phishing email by humans and another where +humans act as attackers in a cybersecurity setting and decide on an attack +option. We conducted extensive experiments with our two neural network models, +IBL, and GPT3.5, and demonstrate that the neural network models outperform IBL +significantly in representing human decision-making, while providing similar +interpretability of human decisions as IBL. Overall, our work yields promising +results for further use of neural networks in cognitive modeling of human +decision making. + +
+
+ comment: Our code is available at https://github.com/shshnkreddy/NCM-HDM +
+
+
+
+
+ + ♻ ☆ Tracing Privacy Leakage of Language Models to Training Data via Adjusted + Influence Functions + + +
+ The responses generated by Large Language Models (LLMs) can include sensitive +information from individuals and organizations, leading to potential privacy +leakage. This work implements Influence Functions (IFs) to trace privacy +leakage back to the training data, thereby mitigating privacy concerns of +Language Models (LMs). However, we notice that current IFs struggle to +accurately estimate the influence of tokens with large gradient norms, +potentially overestimating their influence. When tracing the most influential +samples, this leads to frequently tracing back to samples with large gradient +norm tokens, overshadowing the actual most influential samples even if their +influences are well estimated. To address this issue, we propose Heuristically +Adjusted IF (HAIF), which reduces the weight of tokens with large gradient +norms, thereby significantly improving the accuracy of tracing the most +influential samples. To establish easily obtainable ground truth for tracing +privacy leakage, we construct two datasets, PII-E and PII-CR, representing two +distinct scenarios: one with identical text in the model outputs and +pre-training data, and the other where models leverage their reasoning +abilities to generate text divergent from pre-training data. HAIF significantly +improves tracing accuracy, enhancing it by 20.96% to 73.71% on the PII-E +dataset and 3.21% to 45.93% on the PII-CR dataset, compared to the best SOTA +IFs against various GPT-2 and QWen-1.5 models. HAIF also outperforms SOTA IFs +on the real-world pretraining corpus CLUECorpus2020, demonstrating strong robustness +regardless of prompt and response lengths. + 
+
+
+
+
+ + ♻ ☆ Robust Clustering on High-Dimensional Data with Stochastic Quantization + + +
+ This paper addresses the limitations of traditional vector quantization +(clustering) algorithms, particularly K-Means and its variant K-Means++, and +explores the Stochastic Quantization (SQ) algorithm as a scalable alternative +for high-dimensional unsupervised and semi-supervised learning problems. Some +traditional clustering algorithms suffer from inefficient memory utilization +during computation, necessitating the loading of all data samples into memory, +which becomes impractical for large-scale datasets. While variants such as +Mini-Batch K-Means partially mitigate this issue by reducing memory usage, they +lack robust theoretical convergence guarantees due to the non-convex nature of +clustering problems. In contrast, the Stochastic Quantization algorithm +provides strong theoretical convergence guarantees, making it a robust +alternative for clustering tasks. We demonstrate the computational efficiency +and rapid convergence of the algorithm on an image classification problem with +partially labeled data, comparing model accuracy across various ratios of +labeled to unlabeled data. To address the challenge of high dimensionality, we +trained Triplet Network to encode images into low-dimensional representations +in a latent space, which serve as a basis for comparing the efficiency of both +the Stochastic Quantization algorithm and traditional quantization algorithms. +Furthermore, we enhance the algorithm's convergence speed by introducing +modifications with an adaptive learning rate. + +
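+ The core memory argument can be seen from a streaming update that touches one
+sample at a time; the learning-rate schedule and initialization below are
+illustrative choices, not the paper's exact algorithm.
+```python
+import numpy as np
+
+def stochastic_quantization(sample_stream, k, dim, steps, lr0=0.5, seed=0):
+    """Move the nearest codebook vector toward each streamed sample."""
+    rng = np.random.default_rng(seed)
+    centers = rng.normal(size=(k, dim))
+    for t in range(1, steps + 1):
+        x = next(sample_stream)
+        j = np.argmin(((centers - x) ** 2).sum(axis=1))  # nearest center
+        centers[j] += (lr0 / np.sqrt(t)) * (x - centers[j])
+    return centers
+
+stream = iter(np.random.default_rng(1).normal(size=(1000, 8)))
+codebook = stochastic_quantization(stream, k=4, dim=8, steps=1000)
+```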
+
+ comment: 20 pages, 5 figures, to be published in the International Scientific + Technical Journal "Problems of Control and Informatics" +
+
+
+
+
+ + ♻ ☆ Rethinking Molecular Design: Integrating Latent Variable and + Auto-Regressive Models for Goal Directed Generation + + +
+ De novo molecule design has become a highly active research area, advanced +significantly through the use of state-of-the-art generative models. Despite +these advances, several fundamental questions remain unanswered as the field +increasingly focuses on more complex generative models and sophisticated +molecular representations as an answer to the challenges of drug design. In +this paper, we return to the simplest representation of molecules, and +investigate overlooked limitations of classical generative approaches, +particularly Variational Autoencoders (VAEs) and auto-regressive models. We +propose a hybrid model in the form of a novel regularizer that leverages the +strengths of both to improve validity, conditional generation, and style +transfer of molecular sequences. Additionally, we provide an in depth +discussion of overlooked assumptions of these models' behaviour. + +
+
+
+
+
+ + ♻ ☆ Hierarchical Generative Adversarial Imitation Learning with Mid-level + Input Generation for Autonomous Driving on Urban Environments + + +
+ Deriving robust control policies for realistic urban navigation scenarios is +not a trivial task. In an end-to-end approach, these policies must map +high-dimensional images from the vehicle's cameras to low-level actions such as +steering and throttle. While pure Reinforcement Learning (RL) approaches are +based exclusively on engineered rewards, Generative Adversarial Imitation +Learning (GAIL) agents learn from expert demonstrations while interacting with +the environment, which favors GAIL on tasks for which a reward signal is +difficult to derive, such as autonomous driving. However, training deep +networks directly from raw images on RL tasks is known to be unstable and +troublesome. To deal with that, this work proposes a hierarchical GAIL-based +architecture (hGAIL) which decouples representation learning from the driving +task to solve the autonomous navigation of a vehicle. The proposed architecture +consists of two modules: a GAN (Generative Adversarial Net) which generates an +abstract mid-level input representation, which is the Bird's-Eye View (BEV) +from the surroundings of the vehicle; and the GAIL which learns to control the +vehicle based on the BEV predictions from the GAN as input. hGAIL is able to +learn both the policy and the mid-level representation simultaneously as the +agent interacts with the environment. Our experiments made in the CARLA +simulation environment have shown that GAIL exclusively from cameras (without +BEV) fails to even learn the task, while hGAIL, after training exclusively on +one city, was able to autonomously navigate successfully in 98% of the +intersections of a new city not used in training phase. Videos and code +available at: https://sites.google.com/view/hgail + +
+
+
+
+
+ + ♻ ☆ MimicTouch: Leveraging Multi-modal Human Tactile Demonstrations for + Contact-rich Manipulation NeurIPS 2023 + + +
+ Tactile sensing is critical to fine-grained, contact-rich manipulation tasks, +such as insertion and assembly. Prior research has shown the possibility of +learning tactile-guided policy from teleoperated demonstration data. However, +to provide the demonstration, human users often rely on visual feedback to +control the robot. This creates a gap between the sensing modality used for +controlling the robot (visual) and the modality of interest (tactile). To +bridge this gap, we introduce "MimicTouch", a novel framework for learning +policies directly from demonstrations provided by human users with their hands. +The key innovations are i) a human tactile data collection system which +collects multi-modal tactile dataset for learning human's tactile-guided +control strategy, ii) an imitation learning-based framework for learning +human's tactile-guided control strategy through such data, and iii) an online +residual RL framework to bridge the embodiment gap between the human hand and +the robot gripper. Through comprehensive experiments, we highlight the efficacy +of utilizing human's tactile-guided control strategy to resolve contact-rich +manipulation tasks. The project website is at +https://sites.google.com/view/MimicTouch. + +
+
+ comment: Accepted by CoRL 2024, Best Paper Award at NeurIPS 2023 Touch + Processing Workshop +
+
+
+
+
+ + ♻ ☆ CW-CNN & CW-AN: Convolutional Networks and Attention Networks for + CW-Complexes + + +
+ We present a novel framework for learning on CW-complex structured data +points. Recent advances have discussed CW-complexes as ideal learning +representations for problems in cheminformatics. However, there is a lack of +available machine learning methods suitable for learning on CW-complexes. In +this paper we develop notions of convolution and attention that are well +defined for CW-complexes. These notions enable us to create the first Hodge +informed neural network that can receive a CW-complex as input. We illustrate +and interpret this framework in the context of supervised prediction. + +
+
+
+
+
+ + ♻ ☆ Implementation of The Future of Drug Discovery: QuantumBased Machine + Learning Simulation (QMLS) + + +
+ The Research & Development (R&D) phase of drug development is a lengthy and +costly process. To revolutionize this process, we introduce our new concept +QMLS to shorten the whole R&D phase to three to six months and decrease the +cost to merely fifty to eighty thousand USD. For Hit Generation, Machine +Learning Molecule Generation (MLMG) generates possible hits according to the +molecular structure of the target protein while the Quantum Simulation (QS) +filters molecules from the primary assay based on the reaction and binding +effectiveness with the target protein. Then, for Lead Optimization, the +resultant molecules generated and filtered from MLMG and QS are compared, and +molecules that appear as a result of both processes will be made into dozens of +molecular variations through Machine Learning Molecule Variation (MLMV), while +others will only be made into a few variations. Lastly, all optimized molecules +would undergo multiple rounds of QS filtering with a high standard for reaction +effectiveness and safety, creating a few dozen pre-clinical-trial-ready drugs. +This paper is based on our first paper, where we pitched the concept of machine +learning combined with quantum simulations. In this paper we go over the +detailed design and framework of QMLS, including MLMG, MLMV, and QS. + 
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Large-Batch, Iteration-Efficient Neural Bayesian Design Optimization + + +
+ Bayesian optimization (BO) provides a powerful framework for optimizing +black-box, expensive-to-evaluate functions. It is therefore an attractive tool +for engineering design problems, typically involving multiple objectives. +Thanks to the rapid advances in fabrication and measurement methods as well as +parallel computing infrastructure, querying many design problems can be heavily +parallelized. This class of problems challenges BO with an unprecedented setup +where it has to deal with very large batches, shifting its focus from sample +efficiency to iteration efficiency. We present a novel Bayesian optimization +framework specifically tailored to address these limitations. Our key +contribution is a highly scalable, sample-based acquisition function that +performs a non-dominated sorting of not only the objectives but also their +associated uncertainty. We show that our acquisition function in combination +with different Bayesian neural network surrogates is effective in +data-intensive environments with a minimal number of iterations. We demonstrate +the superiority of our method by comparing it with state-of-the-art +multi-objective optimizations. We perform our evaluation on two real-world +problems -- airfoil design and 3D printing -- showcasing the applicability and +efficiency of our approach. Our code is available at: +https://github.com/an-on-ym-ous/lbn_mobo + +
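+ The non-dominated sorting at the heart of the acquisition function can be
+sketched with a plain Pareto filter (minimization); the paper additionally
+sorts on the objectives' predictive uncertainty, which is omitted here.
+```python
+import numpy as np
+
+def non_dominated(points):
+    """Return the Pareto-optimal rows of an (n, m) objective matrix."""
+    points = np.asarray(points, dtype=float)
+    keep = np.ones(len(points), dtype=bool)
+    for i, p in enumerate(points):
+        if keep[i]:
+            dominated = np.all(points >= p, axis=1) & np.any(points > p, axis=1)
+            keep &= ~dominated
+    return points[keep]
+
+print(non_dominated([[1.0, 4.0], [2.0, 3.0], [3.0, 3.5], [0.5, 5.0]]))
+```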
+
+
+
+
+ + ♻ ☆ Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, + Applications and Opportunities + + +
+ Model merging is an efficient empowerment technique in the machine learning +community that does not require the collection of raw training data and does +not require expensive computation. As model merging becomes increasingly +prevalent across various fields, it is crucial to understand the available +model merging techniques comprehensively. However, there is a significant gap +in the literature regarding a systematic and thorough review of these +techniques. This survey provides a comprehensive overview of model merging +methods and theories, their applications in various domains and settings, and +future research directions. Specifically, we first propose a new taxonomic +approach that exhaustively discusses existing model merging methods. Secondly, +we discuss the application of model merging techniques in large language +models, multimodal large language models, and 10+ machine learning subfields, +including continual learning, multi-task learning, few-shot learning, etc. +Finally, we highlight the remaining challenges of model merging and discuss +future research directions. A comprehensive list of papers about model merging +is available at +\url{https://github.com/EnnengYang/Awesome-Model-Merging-Methods-Theories-Applications}. + +
+
+
+
+
+ + ♻ ☆ Reducing Spatial Discretization Error on Coarse CFD Simulations Using an + OpenFOAM-Embedded Deep Learning Framework + + +
+ We propose a method for reducing the spatial discretization error of coarse +computational fluid dynamics (CFD) problems by enhancing the quality of +low-resolution simulations using deep learning. We feed the model with +fine-grid data after projecting it to the coarse-grid discretization. We +substitute the default differencing scheme for the convection term by a +feed-forward neural network that interpolates velocities from cell centers to +face values to produce velocities that approximate the down-sampled fine-grid +data well. The deep learning framework incorporates the open-source CFD code +OpenFOAM, resulting in an end-to-end differentiable model. We automatically +differentiate the CFD physics using a discrete adjoint code version. We present +a fast communication method between TensorFlow (Python) and OpenFOAM (c++) that +accelerates the training process. We applied the model to the flow past a +square cylinder problem, reducing the error from 120% to 25% in the velocity +for simulations inside the training distribution compared to the traditional +solver using an x8 coarser mesh. For simulations outside the training +distribution, the error reduction in the velocities was about 50%. The training +is affordable in terms of time and data samples since the architecture exploits +the local features of the physics. + +
+
+
+
+
+ + ♻ ☆ AI-Driven Intrusion Detection Systems (IDS) on the ROAD Dataset: A + Comparative Analysis for Automotive Controller Area Network (CAN) + + +
+ The integration of digital devices in modern vehicles has revolutionized +automotive technology, enhancing safety and the overall driving experience. The +Controller Area Network (CAN) bus is a central system for managing in-vehicle +communication between the electronic control units (ECUs). However, the CAN +protocol poses security challenges due to inherent vulnerabilities, lacking +encryption and authentication, which, combined with an expanding attack +surface, necessitates robust security measures. In response to this challenge, +numerous Intrusion Detection Systems (IDS) have been developed and deployed. +Nonetheless, an open, comprehensive, and realistic dataset to test the +effectiveness of such IDSs remains absent in the existing literature. This +paper addresses this gap by considering the latest ROAD dataset, containing +stealthy and sophisticated injections. The methodology involves dataset +labelling and the implementation of both state-of-the-art deep learning models +and traditional machine learning models to show the discrepancy in performance +between the datasets most commonly used in the literature and the ROAD dataset, +a more realistic alternative. + +
+
+
+
+
+ + ♻ ☆ What Did I Do Wrong? Quantifying LLMs' Sensitivity and Consistency to + Prompt Engineering + + +
+ Large Language Models (LLMs) have changed the way we design and interact with
+software systems. Their ability to process and extract information from text
+has drastically improved productivity in a number of routine tasks. Developers
+who want to include these models in their software stack, however, face a
+daunting challenge: debugging LLMs' inconsistent behavior across minor
+variations of the prompt. We therefore introduce two metrics for classification
+tasks, namely sensitivity and consistency, which are complementary to task
+performance. Sensitivity measures changes of predictions across rephrasings of
+the prompt and does not require access to ground-truth labels. Consistency, in
+turn, measures how predictions vary across rephrasings for elements of the same
+class. We perform an empirical comparison of these metrics on text
+classification tasks, using them as a guideline for understanding failure modes
+of the LLM. Our hope is that sensitivity and consistency will be helpful in
+guiding prompt engineering and obtaining LLMs that balance robustness with
+performance.
+
+
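+
+ A hedged sketch of how such rephrasing-based metrics could be computed for a
+classifier is given below; the concrete definitions are simplified stand-ins
+and may differ from the paper's formal ones.
+
+```python
+from collections import Counter
+
+def sensitivity(preds_per_item):
+    """Fraction of inputs whose prediction changes across rephrasings
+    (no ground-truth labels needed).
+    preds_per_item: list of lists; each inner list holds the predictions the
+    LLM gave for all rephrasings of one input."""
+    changed = sum(len(set(p)) > 1 for p in preds_per_item)
+    return changed / len(preds_per_item)
+
+def consistency(preds_per_item, labels):
+    """Average agreement of predictions across rephrasings, grouped by class."""
+    per_class = {}
+    for preds, y in zip(preds_per_item, labels):
+        top_count = Counter(preds).most_common(1)[0][1]
+        per_class.setdefault(y, []).append(top_count / len(preds))
+    return {c: sum(v) / len(v) for c, v in per_class.items()}
+
+preds = [["pos", "pos", "neg"], ["neg", "neg", "neg"], ["pos", "pos", "pos"]]
+labels = ["pos", "neg", "pos"]
+print(sensitivity(preds), consistency(preds, labels))
+```
+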
+
+
+
+
+ + ♻ ☆ Unified Convergence Theory of Stochastic and Variance-Reduced Cubic + Newton Methods + + +
+ We study stochastic Cubic Newton methods for solving general, possibly
+non-convex minimization problems. We propose a new framework, which we call the
+helper framework, that provides a unified view of stochastic and
+variance-reduced second-order algorithms equipped with global complexity
+guarantees. It can also be applied to learning with auxiliary information. Our
+helper framework offers the algorithm designer high flexibility for
+constructing and analyzing stochastic Cubic Newton methods, allowing batches of
+arbitrary size and the use of noisy and possibly biased estimates of the
+gradients and Hessians, and incorporating both variance reduction and lazy
+Hessian updates. We recover the best-known complexities for stochastic and
+variance-reduced Cubic Newton methods under weak assumptions on the noise. A
+direct consequence of our theory is a new lazy stochastic second-order method,
+which significantly improves the arithmetic complexity for large-dimensional
+problems. We also establish complexity bounds for the classes of
+gradient-dominated objectives, which include convex and strongly convex
+problems. For auxiliary learning, we show that using a helper (auxiliary
+function) can outperform training alone if a given similarity measure is small.
+
+
+
+ comment: Published in Transactions on Machine Learning Research +
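+
+ For reference, the deterministic cubic-regularized Newton step that such
+methods stochasticize is shown below (the standard Nesterov-Polyak-style
+model); the notation is assumed here, and the paper's stochastic variants
+replace the exact gradient and Hessian with noisy, possibly biased estimates.
+
+```latex
+% cubic-regularized Newton step with regularization parameter M > 0
+x_{k+1} \in \arg\min_{y} \;
+  \langle \nabla f(x_k),\, y - x_k \rangle
+  + \tfrac{1}{2} \langle \nabla^2 f(x_k)(y - x_k),\, y - x_k \rangle
+  + \tfrac{M}{6} \lVert y - x_k \rVert^3
+```
+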
+
+
+
+
+ + ♻ ☆ Multifidelity Covariance Estimation via Regression on the Manifold of + Symmetric Positive Definite Matrices + + +
+ We introduce a multifidelity estimator of covariance matrices formulated as +the solution to a regression problem on the manifold of symmetric positive +definite matrices. The estimator is positive definite by construction, and the +Mahalanobis distance minimized to obtain it possesses properties enabling +practical computation. We show that our manifold regression multifidelity +(MRMF) covariance estimator is a maximum likelihood estimator under a certain +error model on manifold tangent space. More broadly, we show that our +Riemannian regression framework encompasses existing multifidelity covariance +estimators constructed from control variates. We demonstrate via numerical +examples that the MRMF estimator can provide significant decreases, up to one +order of magnitude, in squared estimation error relative to both +single-fidelity and other multifidelity covariance estimators. Furthermore, +preservation of positive definiteness ensures that our estimator is compatible +with downstream tasks, such as data assimilation and metric learning, in which +this property is essential. + +
+
+ comment: To appear in the SIAM Journal on Mathematics of Data Science (SIMODS) +
+
+
+
+
+ + ♻ ☆ Ontology-driven Reinforcement Learning for Personalized Student Support + + +
+ In the search for more effective education, there is a widespread effort to +develop better approaches to personalize student education. Unassisted, +educators often do not have time or resources to personally support every +student in a given classroom. Motivated by this issue, and by recent +advancements in artificial intelligence, this paper presents a general-purpose +framework for personalized student support, applicable to any virtual +educational system such as a serious game or an intelligent tutoring system. To +fit any educational situation, we apply ontologies for their semantic +organization, combining them with data collection considerations and +multi-agent reinforcement learning. The result is a modular system that can be +adapted to any virtual educational software to provide useful personalized +assistance to students. + +
+
+ comment: 6 pages, 3 figures, in press for IEEE Systems, Man, and Cybernetics + 2024 Conference +
+
+
+
+
+ + ♻ ☆ On the design space between molecular mechanics and machine learning + force fields + + +
+ A force field as accurate as quantum mechanics (QM) and as fast as molecular
+mechanics (MM), with which one can simulate a biomolecular system efficiently
+enough and meaningfully enough to get quantitative insights, is among the most
+ardent dreams of biophysicists -- a dream, nevertheless, not to be fulfilled
+any time soon. Machine learning force fields (MLFFs) represent a meaningful
+endeavor towards this direction, where differentiable neural functions are
+parametrized to fit ab initio energies and, through automatic differentiation,
+forces. We argue that, as of now, the utility of MLFF models is no longer
+bottlenecked by accuracy but primarily by their speed (as well as stability and
+generalizability), as many recent variants, on limited chemical spaces, have
+long surpassed the chemical accuracy of $1$ kcal/mol -- the empirical threshold
+beyond which realistic chemical predictions are possible -- though they remain
+orders of magnitude slower than MM. Hoping to kindle explorations and designs
+of faster, albeit perhaps slightly less accurate MLFFs, in this review, we
+focus our attention on the design space (the speed-accuracy tradeoff) between
+MM and ML force fields. After a brief review of the building blocks of force
+fields of either kind, we discuss the desired properties and challenges now
+faced by the force field development community, survey the efforts to make MM
+force fields more accurate and ML force fields faster, and envision what the
+next generation of MLFFs might look like.
+
+
+
+
+
+
+ + ♻ ☆ Developing A Multi-Agent and Self-Adaptive Framework with Deep + Reinforcement Learning for Dynamic Portfolio Risk Management + + +
+ In recent years, deep learning and reinforcement learning (RL) approaches
+have been adopted as reactive agents that quickly learn and respond with new
+investment strategies for portfolio management in highly turbulent financial
+market environments. In many cases, due to the complex correlations among
+various financial sectors and the fluctuating trends in different financial
+markets, such an agent can be biased towards maximising the total returns of
+the newly formulated investment portfolio while neglecting its potential risks
+under turbulent global or regional market conditions. Accordingly, we propose a
+multi-agent and self-adaptive framework, MASA, in which a sophisticated
+multi-agent reinforcement learning (RL) approach with two cooperating and
+reactive agents carefully and dynamically balances the trade-off between the
+overall portfolio returns and their potential risks. In addition, a flexible
+and proactive market-observer agent is integrated into the MASA framework to
+provide additional information on the estimated market trends as valuable
+feedback, allowing the multi-agent RL approach to adapt quickly to
+ever-changing market conditions. The empirical results clearly reveal the
+potential strengths of the proposed MASA framework against many well-known
+RL-based approaches on the challenging data sets of the CSI 300, Dow Jones
+Industrial Average and S&P 500 indexes over the past 10 years. More
+importantly, the proposed MASA framework sheds light on many possible
+directions for future investigation.
+
+
+
+ comment: In Proceedings of the 23rd International Conference on Autonomous + Agents and Multiagent Systems +
+
+
+
+
+ + ♻ ☆ Large Scale Training of Graph Neural Networks for Optimal Markov-Chain + Partitioning Using the Kemeny Constant + + +
+ Traditional clustering algorithms often struggle to capture the complex +relationships within graphs and generalise to arbitrary clustering criteria. +The emergence of graph neural networks (GNNs) as a powerful framework for +learning representations of graph data provides new approaches to solving the +problem. Previous work has shown GNNs to be capable of proposing partitionings +using a variety of criteria, however, these approaches have not yet been +extended to work on Markov chains or kinetic networks. These arise frequently +in the study of molecular systems and are of particular interest to the +biochemical modelling community. In this work, we propose several GNN-based +architectures to tackle the graph partitioning problem for Markov Chains +described as kinetic networks. This approach aims to minimize how much a +proposed partitioning changes the Kemeny constant. We propose using an +encoder-decoder architecture and show how simple GraphSAGE-based GNNs with +linear layers can outperform much larger and more expressive attention-based +models in this context. As a proof of concept, we first demonstrate the +method's ability to cluster randomly connected graphs. We also use a linear +chain architecture corresponding to a 1D free energy profile as our kinetic +network. Subsequently, we demonstrate the effectiveness of our method through +experiments on a data set derived from molecular dynamics. We compare the +performance of our method to other partitioning techniques such as PCCA+. We +explore the importance of feature and hyperparameter selection and propose a +general strategy for large-scale parallel training of GNNs for discovering +optimal graph partitionings. + +
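+
+ As background, a hedged sketch of the quantity such a partitioning tries to
+preserve, the Kemeny constant of a transition matrix, computed via its standard
+eigenvalue identity; this is textbook Markov-chain machinery, not the paper's
+GNN code.
+
+```python
+import numpy as np
+
+def kemeny_constant(P):
+    """Kemeny constant of an irreducible Markov chain with transition matrix P,
+    using the identity K = sum_{i >= 2} 1 / (1 - lambda_i) over the eigenvalues
+    of P other than the single eigenvalue equal to 1."""
+    eigvals = np.linalg.eigvals(P)
+    idx = np.argmin(np.abs(eigvals - 1.0))     # locate the unit eigenvalue
+    rest = np.delete(eigvals, idx)
+    return float(np.real(np.sum(1.0 / (1.0 - rest))))
+
+# two-state sanity check: the closed form is 1 / (a + b)
+a, b = 0.3, 0.2
+P = np.array([[1 - a, a], [b, 1 - b]])
+print(kemeny_constant(P), 1 / (a + b))
+```
+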
+
+
+
+
+ + ♻ ☆ On the Impact of Data Heterogeneity in Federated Learning Environments + with Application to Healthcare Networks + + +
+ Federated Learning (FL) allows multiple privacy-sensitive applications to +leverage their dataset for a global model construction without any disclosure +of the information. One of those domains is healthcare, where groups of silos +collaborate in order to generate a global predictor with improved accuracy and +generalization. However, the inherent challenge lies in the high heterogeneity +of medical data, necessitating sophisticated techniques for assessment and +compensation. This paper presents a comprehensive exploration of the +mathematical formalization and taxonomy of heterogeneity within FL +environments, focusing on the intricacies of medical data. In particular, we +address the evaluation and comparison of the most popular FL algorithms with +respect to their ability to cope with quantity-based, feature and label +distribution-based heterogeneity. The goal is to provide a quantitative +evaluation of the impact of data heterogeneity in FL systems for healthcare +networks as well as a guideline on FL algorithm selection. Our research extends +beyond existing studies by benchmarking seven of the most common FL algorithms +against the unique challenges posed by medical data use cases. The paper +targets the prediction of the risk of stroke recurrence through a set of +tabular clinical reports collected by different federated hospital silos: data +heterogeneity frequently encountered in this scenario and its impact on FL +performance are discussed. + +
+
+
+
+
+ + ♻ ☆ Finite-Time Error Analysis of Soft Q-Learning: Switching System Approach + + +
+ Soft Q-learning is a variation of Q-learning designed to solve entropy +regularized Markov decision problems where an agent aims to maximize the +entropy regularized value function. Despite its empirical success, there have +been limited theoretical studies of soft Q-learning to date. This paper aims to +offer a novel and unified finite-time, control-theoretic analysis of soft +Q-learning algorithms. We focus on two types of soft Q-learning algorithms: one +utilizing the log-sum-exp operator and the other employing the Boltzmann +operator. By using dynamical switching system models, we derive novel +finite-time error bounds for both soft Q-learning algorithms. We hope that our +analysis will deepen the current understanding of soft Q-learning by +establishing connections with switching system models and may even pave the way +for new frameworks in the finite-time analysis of other reinforcement learning +algorithms. + +
+
+ comment: 18 pages +
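+
+ For concreteness, a hedged sketch of a single tabular soft Q-learning update
+with the log-sum-exp operator is given below; the step size, discount and
+temperature are toy assumptions, and the paper analyses the finite-time
+behaviour of such updates rather than prescribing an implementation.
+
+```python
+import numpy as np
+
+def soft_q_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99, tau=0.5):
+    """One tabular soft Q-learning step using the log-sum-exp (soft-max) operator.
+    Q is an (n_states, n_actions) array, updated in place."""
+    soft_value = tau * np.log(np.sum(np.exp(Q[s_next] / tau)))  # entropy-regularized value
+    td_target = r + gamma * soft_value
+    Q[s, a] += alpha * (td_target - Q[s, a])
+    return Q
+
+Q = np.zeros((3, 2))
+Q = soft_q_update(Q, s=0, a=1, r=1.0, s_next=2)
+print(Q)
+```
+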
+
+
+
+
+ + ♻ ☆ CyclicFL: A Cyclic Model Pre-Training Approach to Efficient Federated + Learning + + +
+ Federated learning (FL) has been proposed to enable distributed learning on +Artificial Intelligence Internet of Things (AIoT) devices with guarantees of +high-level data privacy. Since random initial models in FL can easily result in +unregulated Stochastic Gradient Descent (SGD) processes, existing FL methods +greatly suffer from both slow convergence and poor accuracy, especially in +non-IID scenarios. To address this problem, we propose a novel method named +CyclicFL, which can quickly derive effective initial models to guide the SGD +processes, thus improving the overall FL training performance. We formally +analyze the significance of data consistency between the pre-training and +training stages of CyclicFL, showing the limited Lipschitzness of loss for the +pre-trained models by CyclicFL. Moreover, we systematically prove that our +method can achieve faster convergence speed under various convexity +assumptions. Unlike traditional centralized pre-training methods that require +public proxy data, CyclicFL pre-trains initial models on selected AIoT devices +cyclically without exposing their local data. Therefore, they can be easily +integrated into any security-critical FL methods. Comprehensive experimental +results show that CyclicFL can not only improve the maximum classification +accuracy by up to $14.11\%$ but also significantly accelerate the overall FL +training process. + +
+
+
+
+
+ + ♻ ☆ Commute-Time-Optimised Graphs for GNNs + + +
+ We explore graph rewiring methods that optimise commute time. Recent graph +rewiring approaches facilitate long-range interactions in sparse graphs, making +such rewirings commute-time-optimal on average. However, when an expert prior +exists on which node pairs should or should not interact, a superior rewiring +would favour short commute times between these privileged node pairs. We +construct two synthetic datasets with known priors reflecting realistic +settings, and use these to motivate two bespoke rewiring methods that +incorporate the known prior. We investigate the regimes where our rewiring +improves test performance on the synthetic datasets. Finally, we perform a case +study on a real-world citation graph to investigate the practical implications +of our work. + +
+
+
+
+
+ + ♻ ☆ Finite Sample Frequency Domain Identification + + +
+ We study non-parametric frequency-domain system identification from a +finite-sample perspective. We assume an open loop scenario where the excitation +input is periodic and consider the Empirical Transfer Function Estimate (ETFE), +where the goal is to estimate the frequency response at certain desired +(evenly-spaced) frequencies, given input-output samples. We show that under +sub-Gaussian colored noise (in time-domain) and stability assumptions, the ETFE +estimates are concentrated around the true values. The error rate is of the +order of +$\mathcal{O}((d_{\mathrm{u}}+\sqrt{d_{\mathrm{u}}d_{\mathrm{y}}})\sqrt{M/N_{\mathrm{tot}}})$, +where $N_{\mathrm{tot}}$ is the total number of samples, $M$ is the number of +desired frequencies, and $d_{\mathrm{u}},\,d_{\mathrm{y}}$ are the dimensions +of the input and output signals respectively. This rate remains valid for +general irrational transfer functions and does not require a finite order +state-space representation. By tuning $M$, we obtain a +$N_{\mathrm{tot}}^{-1/3}$ finite-sample rate for learning the frequency +response over all frequencies in the $ \mathcal{H}_{\infty}$ norm. Our result +draws upon an extension of the Hanson-Wright inequality to semi-infinite +matrices. We study the finite-sample behavior of ETFE in simulations. + +
+
+ comment: Version 2 changes: several typos were fixed and some proof steps were + expanded +
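+
+ A hedged sketch of the ETFE itself, the ratio of output to input DFTs over
+one period of the excitation, is shown below on a toy delayed-and-scaled
+signal; averaging over periods and the paper's finite-sample error analysis
+are omitted.
+
+```python
+import numpy as np
+
+def etfe(u, y):
+    """Empirical Transfer Function Estimate from one period of input/output data:
+    G_hat(e^{j w_k}) = Y(k) / U(k), with Y and U the DFTs of the signals."""
+    U = np.fft.rfft(u)
+    Y = np.fft.rfft(y)
+    freqs = np.fft.rfftfreq(len(u))            # normalized frequencies (cycles/sample)
+    return freqs, Y / U
+
+# toy example: y is a delayed, scaled copy of a two-tone periodic input plus noise
+rng = np.random.default_rng(0)
+n = 256
+t = np.arange(n)
+u = np.sin(2 * np.pi * 8 * t / n) + np.sin(2 * np.pi * 16 * t / n)
+y = 0.5 * np.roll(u, 3) + 0.01 * rng.standard_normal(n)
+freqs, G = etfe(u, y)
+print(np.abs(G[8]), np.abs(G[16]))             # both close to the true gain 0.5
+```
+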
+
+
+
+
+ + ♻ ☆ TSFool: Crafting Highly-Imperceptible Adversarial Time Series through + Multi-Objective Attack ECAI'24 + + +
+ Recent years have witnessed the success of recurrent neural network (RNN)
+models in time series classification (TSC). However, neural networks (NNs) are
+vulnerable to adversarial samples, which enable real-life adversarial attacks
+that undermine the robustness of AI models. To date, most existing attacks
+target feed-forward NNs and image recognition tasks, and they do not perform
+well on RNN-based TSC. This is due to the cyclical computation of RNNs, which
+prevents direct model differentiation. In addition, the high visual sensitivity
+of time series to perturbations also poses challenges to local objective
+optimization of adversarial samples. In this paper, we propose an efficient
+method called TSFool to craft highly imperceptible adversarial time series for
+RNN-based TSC. The core idea is a new global optimization objective known as
+the "Camouflage Coefficient" that captures the imperceptibility of adversarial
+samples from the class distribution. Based on this, we reduce the adversarial
+attack problem to a multi-objective optimization problem that enhances the
+perturbation quality. Furthermore, to speed up the optimization process, we
+propose to use a representation model for the RNN to capture deeply embedded
+vulnerable samples whose features deviate from the latent manifold. Experiments
+on 11 UCR and UEA datasets show that TSFool significantly outperforms six
+white-box and three black-box benchmark attacks in terms of effectiveness,
+efficiency and imperceptibility from various perspectives, including standard
+measures, a human study and real-world defenses.
+
+
+
+ comment: 27th European Conference on Artificial Intelligence (ECAI'24) +
+
+
+
+
+ + ♻ ☆ On Bits and Bandits: Quantifying the Regret-Information Trade-off + + +
+ In interactive decision-making tasks, information can be acquired by direct +interactions, through receiving indirect feedback, and from external +knowledgeable sources. We examine the trade-off between the information an +agent accumulates and the regret it suffers. We show that information from +external sources, measured in bits, can be traded off for regret, measured in +reward. We invoke information-theoretic methods for obtaining regret lower +bounds, that also allow us to easily re-derive several known lower bounds. We +then generalize a variety of interactive decision-making tasks with external +information to a new setting. Using this setting, we introduce the first +Bayesian regret lower bounds that depend on the information an agent +accumulates. These lower bounds also prove the near-optimality of Thompson +sampling for Bayesian problems. Finally, we demonstrate the utility of these +bounds in improving the performance of a question-answering task with large +language models, allowing us to obtain valuable insights. + +
+
+
+
+
+ + ♻ ☆ The Role of Transformer Models in Advancing Blockchain Technology: A + Systematic Survey + + +
+ As blockchain technology rapidly evolves, the demand for enhanced efficiency,
+security, and scalability grows. Transformer models, as powerful deep learning
+architectures, have shown unprecedented potential in addressing various
+blockchain challenges. However, a systematic review of Transformer applications
+in blockchain is lacking. This paper aims to fill this research gap by
+surveying over 200 relevant papers, comprehensively reviewing practical cases
+and research progress of Transformers in blockchain applications. Our survey
+covers key areas including anomaly detection, smart contract security analysis,
+cryptocurrency prediction and trend analysis, and code summary generation. To
+clearly articulate the advancements of Transformers across various blockchain
+domains, we adopt a domain-oriented classification system, organizing and
+introducing representative methods based on major challenges in current
+blockchain research. For each research domain, we first introduce its
+background and objectives, then review previous representative methods and
+analyze their limitations, and finally introduce the advancements brought by
+Transformer models. Furthermore, we explore the challenges of utilizing
+Transformers, such as data privacy, model complexity, and real-time processing
+requirements. Finally, this article proposes future research directions,
+emphasizing the importance of exploring the Transformer architecture in depth
+to adapt it to specific blockchain applications, and discusses its potential
+role in promoting the development of blockchain technology. This review aims to
+provide new perspectives and a research foundation for the integrated
+development of blockchain technology and machine learning, supporting further
+innovation and application expansion of blockchain technology.
+
+
+
+
+
+
+ + ♻ ☆ Cross-Validated Off-Policy Evaluation + + +
+ In this paper, we study the problem of estimator selection and +hyper-parameter tuning in off-policy evaluation. Although cross-validation is +the most popular method for model selection in supervised learning, off-policy +evaluation relies mostly on theory-based approaches, which provide only limited +guidance to practitioners. We show how to use cross-validation for off-policy +evaluation. This challenges a popular belief that cross-validation in +off-policy evaluation is not feasible. We evaluate our method empirically and +show that it addresses a variety of use cases. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ A causal viewpoint on prediction model performance under changes in + case-mix: discrimination and calibration respond differently for prognosis + and diagnosis predictions + + +
+ Prediction models inform important clinical decisions, aiding in diagnosis, +prognosis, and treatment planning. The predictive performance of these models +is typically assessed through discrimination and calibration. However, changes +in the distribution of the data impact model performance. In health-care, a +typical change is a shift in case-mix: for example, for cardiovascular risk +management, a general practitioner sees a different mix of patients than a +specialist in a tertiary hospital. + This work introduces a novel framework that differentiates the effects of +case-mix shifts on discrimination and calibration based on the causal direction +of the prediction task. When prediction is in the causal direction (often the +case for prognosis predictions), calibration remains stable under case-mix +shifts, while discrimination does not. Conversely, when predicting in the +anti-causal direction (often with diagnosis predictions), discrimination +remains stable, but calibration does not. + A simulation study and empirical validation using cardiovascular disease +prediction models demonstrate the implications of this framework. This +framework provides critical insights for evaluating and deploying prediction +models across different clinical settings, emphasizing the importance of +understanding the causal structure of the prediction task. + +
+
+
+
+
+ + ♻ ☆ Explainable Hierarchical Urban Representation Learning for Commuting + Flow Prediction + + +
+ Commuting flow prediction is an essential task for municipal operations in +the real world. Previous studies have revealed that it is feasible to estimate +the commuting origin-destination (OD) demand within a city using multiple +auxiliary data. However, most existing methods are not suitable to deal with a +similar task at a large scale, namely within a prefecture or the whole nation, +owing to the increased number of geographical units that need to be maintained. +In addition, region representation learning is a universal approach for gaining +urban knowledge for diverse metropolitan downstream tasks. Although many +researchers have developed comprehensive frameworks to describe urban units +from multi-source data, they have not clarified the relationship between the +selected geographical elements. Furthermore, metropolitan areas naturally +preserve ranked structures, like cities and their inclusive districts, which +makes elucidating relations between cross-level urban units necessary. +Therefore, we develop a heterogeneous graph-based model to generate meaningful +region embeddings at multiple spatial resolutions for predicting different +types of inter-level OD flows. To demonstrate the effectiveness of the proposed +method, extensive experiments were conducted using real-world aggregated mobile +phone datasets collected from Shizuoka Prefecture, Japan. The results indicate +that our proposed model outperforms existing models in terms of a uniform urban +structure. We extend the understanding of predicted results using reasonable +explanations to enhance the credibility of the model. + +
+
+
+
+
+ + ♻ ☆ Trustworthy Human-AI Collaboration: Reinforcement Learning with Human + Feedback and Physics Knowledge for Safe Autonomous Driving + + +
+ In the field of autonomous driving, developing safe and trustworthy
+autonomous driving policies remains a significant challenge. Recently,
+Reinforcement Learning with Human Feedback (RLHF) has attracted substantial
+attention due to its potential to enhance training safety and sampling
+efficiency. Nevertheless, existing RLHF-enabled methods often falter when faced
+with imperfect human demonstrations, potentially leading to training
+oscillations or even worse performance than rule-based approaches. Inspired by
+the human learning process, we propose Physics-enhanced Reinforcement Learning
+with Human Feedback (PE-RLHF). This novel framework synergistically integrates
+human feedback (e.g., human intervention and demonstration) and physics
+knowledge (e.g., traffic flow model) into the training loop of reinforcement
+learning. The key advantage of PE-RLHF is its guarantee that the learned policy
+will perform at least as well as the given physics-based policy, even when
+human feedback quality deteriorates, thus ensuring trustworthy safety
+improvements. PE-RLHF introduces a Physics-enhanced Human-AI (PE-HAI)
+collaborative paradigm for dynamic action selection between human and
+physics-based actions, employs a reward-free approach with a proxy value
+function to capture human preferences, and incorporates a minimal intervention
+mechanism to reduce the cognitive load on human mentors. Extensive experiments
+across diverse driving scenarios demonstrate that PE-RLHF significantly
+outperforms traditional methods, achieving state-of-the-art (SOTA) performance
+in safety, efficiency, and generalizability, even with varying quality of human
+feedback. The philosophy behind PE-RLHF not only advances autonomous driving
+technology but can also offer valuable insights for other safety-critical
+domains. Demo video and code are available at:
+https://zilin-huang.github.io/PE-RLHF-website/
+
+
+
+ comment: 33 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ FRAC-Q-Learning: A Reinforcement Learning with Boredom Avoidance + Processes for Social Robots + + +
+ Reinforcement learning algorithms have often been applied to social robots.
+However, most reinforcement learning algorithms were not optimized for use in
+social robots, and consequently they may bore users. We propose a new
+reinforcement learning method specialized for social robots, FRAC-Q-learning,
+that can avoid user boredom. The proposed algorithm consists of a forgetting
+process in addition to randomizing and categorizing processes. This study
+evaluated the interest and boredom-hardness scores of FRAC-Q-learning in
+comparison with traditional Q-learning. FRAC-Q-learning showed a significantly
+higher trend in interest scores, and users were significantly harder to bore
+with it than with traditional Q-learning. Therefore, FRAC-Q-learning can
+contribute to developing social robots that will not bore users. The proposed
+algorithm also has the potential to be applied to web-based communication and
+educational systems. This paper presents the entire process, the detailed
+implementation and a detailed evaluation method of FRAC-Q-learning for the
+first time.
+
+
+
+
+
+
+ + ♻ ☆ Painful intelligence: What AI can tell us about human suffering + + +
+ This book uses the modern theory of artificial intelligence (AI) to +understand human suffering or mental pain. Both humans and sophisticated AI +agents process information about the world in order to achieve goals and obtain +rewards, which is why AI can be used as a model of the human brain and mind. +This book intends to make the theory accessible to a relatively general +audience, requiring only some relevant scientific background. + The book starts with the assumption that suffering is mainly caused by +frustration. Frustration means the failure of an agent (whether AI or human) to +achieve a goal or a reward it wanted or expected. Frustration is inevitable +because of the overwhelming complexity of the world, limited computational +resources, and scarcity of good data. In particular, such limitations imply +that an agent acting in the real world must cope with uncontrollability, +unpredictability, and uncertainty, which all lead to frustration. + Fundamental in such modelling is the idea of learning, or adaptation to the +environment. While AI uses machine learning, humans and animals adapt by a +combination of evolutionary mechanisms and ordinary learning. Even frustration +is fundamentally an error signal that the system uses for learning. This book +explores various aspects and limitations of learning algorithms and their +implications regarding suffering. + At the end of the book, the computational theory is used to derive various +interventions or training methods that will reduce suffering in humans. The +amount of frustration is expressed by a simple equation which indicates how it +can be reduced. The ensuing interventions are very similar to those proposed by +Buddhist and Stoic philosophy, and include mindfulness meditation. Therefore, +this book can be interpreted as an exposition of a computational theory +justifying why such philosophies and meditation reduce human suffering. + +
+
+ comment: Second Edition of this book with 258 pages +
+
+
+
+
+ + ♻ ☆ Prediction of soil fertility parameters using USB-microscope imagery and + portable X-ray fluorescence spectrometry + + +
+ This study investigated the use of portable X-ray fluorescence (PXRF) +spectrometry and soil image analysis for rapid soil fertility assessment, with +a focus on key indicators such as available boron (B), organic carbon (OC), +available manganese (Mn), available sulfur (S), and the sulfur availability +index (SAI). A total of 1,133 soil samples from diverse agro-climatic zones in +Eastern India were analyzed. The research integrated color and texture features +from microscopic soil images, PXRF data, and auxiliary soil variables (AVs) +using a Random Forest model. Results showed that combining image features (IFs) +with AVs significantly improved prediction accuracy for available B (R2 = 0.80) +and OC (R2 = 0.88). A data fusion approach, incorporating IFs, AVs, and PXRF +data, further enhanced predictions for available Mn and SAI, with R2 values of +0.72 and 0.70, respectively. The study highlights the potential of integrating +these technologies to offer rapid, cost-effective soil testing methods, paving +the way for more advanced predictive models and a deeper understanding of soil +fertility. Future work should explore the application of deep learning models +on a larger dataset, incorporating soils from a wider range of agro-climatic +zones under field conditions. + +
+
+ comment: Published in 'Soil Advances' +
+
+
+
+
+ + ♻ ☆ AICAttack: Adversarial Image Captioning Attack with Attention-Based + Optimization + + +
+ Recent advances in deep learning research have shown remarkable achievements +across many tasks in computer vision (CV) and natural language processing +(NLP). At the intersection of CV and NLP is the problem of image captioning, +where the related models' robustness against adversarial attacks has not been +well studied. This paper presents a novel adversarial attack strategy, +AICAttack (Attention-based Image Captioning Attack), designed to attack image +captioning models through subtle perturbations on images. Operating within a +black-box attack scenario, our algorithm requires no access to the target +model's architecture, parameters, or gradient information. We introduce an +attention-based candidate selection mechanism that identifies the optimal +pixels to attack, followed by a customised differential evolution method to +optimise the perturbations of pixels' RGB values. We demonstrate AICAttack's +effectiveness through extensive experiments on benchmark datasets against +multiple victim models. The experimental results demonstrate that our method +outperforms current leading-edge techniques by achieving consistently higher +attack success rates. + +
+
+
+
+
+ + ♻ ☆ LinFusion: 1 GPU, 1 Minute, 16K Image + + +
+ Modern diffusion models, particularly those utilizing a Transformer-based
+UNet for denoising, rely heavily on self-attention operations to manage complex
+spatial relationships, thus achieving impressive generation performance.
+However, this existing paradigm faces significant challenges in generating
+high-resolution visual content due to its quadratic time and memory complexity
+with respect to the number of spatial tokens. To address this limitation, in
+this paper we propose a novel linear attention mechanism as an alternative.
+Specifically, we begin our exploration from recently introduced models with
+linear complexity, e.g., Mamba2, RWKV6 and Gated Linear Attention, and identify
+two key features, attention normalization and non-causal inference, that
+enhance high-resolution visual generation performance. Building on these
+insights, we introduce a generalized linear attention paradigm, which serves as
+a low-rank approximation of a wide spectrum of popular linear token mixers. To
+save the training cost and better leverage pre-trained models, we initialize
+our models with, and distill the knowledge from, pre-trained Stable Diffusion
+(SD). We find that the distilled model, termed LinFusion, achieves performance
+on par with or superior to the original SD after only modest training, while
+significantly reducing time and memory complexity. Extensive experiments on
+SD-v1.5, SD-v2.1, and SD-XL demonstrate that LinFusion delivers satisfactory
+zero-shot cross-resolution generation performance, generating high-resolution
+images at up to 16K resolution. Moreover, it is highly compatible with
+pre-trained SD components, such as ControlNet and IP-Adapter, requiring no
+adaptation efforts. Codes are available at
+https://github.com/Huage001/LinFusion.
+
+
+
+ comment: Work in Progress. Codes are available at + https://github.com/Huage001/LinFusion +
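+
+ As background, a hedged sketch of generic non-causal, normalized linear
+attention with a simple positive feature map, the family of token mixers the
+paper generalizes, is given below; the feature map and shapes are illustrative
+assumptions, not LinFusion's exact parameterization.
+
+```python
+import numpy as np
+
+def linear_attention(Q, K, V, eps=1e-6):
+    """Non-causal linear attention in O(N d^2) instead of O(N^2 d),
+    using the positive feature map phi(x) = elu(x) + 1."""
+    def phi(x):
+        return np.where(x > 0, x + 1.0, np.exp(x))   # elementwise elu(x) + 1
+    Qf, Kf = phi(Q), phi(K)
+    kv = Kf.T @ V                                    # (d, d_v) key/value summary
+    z = Qf @ Kf.sum(axis=0)                          # (N,) normalizer
+    return (Qf @ kv) / (z[:, None] + eps)
+
+N, d = 1024, 64
+rng = np.random.default_rng(0)
+Q, K, V = (rng.standard_normal((N, d)) for _ in range(3))
+print(linear_attention(Q, K, V).shape)               # (1024, 64)
+```
+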
+
+
+
+
+ + ♻ ☆ A Survey for Foundation Models in Autonomous Driving + + +
+ The advent of foundation models has revolutionized the fields of natural +language processing and computer vision, paving the way for their application +in autonomous driving (AD). This survey presents a comprehensive review of more +than 40 research papers, demonstrating the role of foundation models in +enhancing AD. Large language models contribute to planning and simulation in +AD, particularly through their proficiency in reasoning, code generation and +translation. In parallel, vision foundation models are increasingly adapted for +critical tasks such as 3D object detection and tracking, as well as creating +realistic driving scenarios for simulation and testing. Multi-modal foundation +models, integrating diverse inputs, exhibit exceptional visual understanding +and spatial reasoning, crucial for end-to-end AD. This survey not only provides +a structured taxonomy, categorizing foundation models based on their modalities +and functionalities within the AD domain but also delves into the methods +employed in current research. It identifies the gaps between existing +foundation models and cutting-edge AD approaches, thereby charting future +research directions and proposing a roadmap for bridging these gaps. + +
+
+
+
+
+
+ ♻ ☆ Transfer-based Adversarial Poisoning Attacks for Online (MIMO-)Deep
+  Receivers
+
+
+
+ Recently, the design of wireless receivers using deep neural networks (DNNs),
+known as deep receivers, has attracted extensive attention for ensuring
+reliable communication in complex channel environments. To adapt quickly to
+dynamic channels, online learning has been adopted to update the weights of
+deep receivers with over-the-air data (e.g., pilots). However, the fragility of
+neural models and the openness of wireless channels expose these systems to
+malicious attacks. To this end, understanding these attack methods is essential
+for robust receiver design. In this paper, we propose a transfer-based
+adversarial poisoning attack method for online receivers. Without knowledge of
+the attack target, adversarial perturbations are injected into the pilots,
+poisoning the online deep receiver and impairing its ability to adapt to
+dynamic channels and nonlinear effects. In particular, our attack method
+targets Deep Soft Interference Cancellation (DeepSIC) [1] using online
+meta-learning. As a classical model-driven deep receiver, DeepSIC incorporates
+wireless domain knowledge into its architecture. This integration allows it to
+adapt efficiently to time-varying channels with only a small number of pilots,
+achieving optimal performance in a multi-input and multi-output (MIMO)
+scenario. The deep receiver in this scenario has a number of applications in
+the field of wireless communication, which motivates our study of the attack
+methods targeting it. Specifically, we demonstrate the effectiveness of our
+attack in simulations on synthetic linear, synthetic nonlinear, static, and
+COST 2100 channels. Simulation results indicate that the proposed poisoning
+attack significantly reduces the performance of online receivers in rapidly
+changing scenarios.
+
+
+
+ comment: 15 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Last-Iterate Convergence of Payoff-Based Independent Learning in + Zero-Sum Stochastic Games NeurIPS 2023 + + +
+ In this paper, we consider two-player zero-sum matrix and stochastic games +and develop learning dynamics that are payoff-based, convergent, rational, and +symmetric between the two players. Specifically, the learning dynamics for +matrix games are based on the smoothed best-response dynamics, while the +learning dynamics for stochastic games build upon those for matrix games, with +additional incorporation of the minimax value iteration. To our knowledge, our +theoretical results present the first finite-sample analysis of such learning +dynamics with last-iterate guarantees. In the matrix game setting, the results +imply a sample complexity of $O(\epsilon^{-1})$ to find the Nash distribution +and a sample complexity of $O(\epsilon^{-8})$ to find a Nash equilibrium. In +the stochastic game setting, the results also imply a sample complexity of +$O(\epsilon^{-8})$ to find a Nash equilibrium. To establish these results, the +main challenge is to handle stochastic approximation algorithms with multiple +sets of coupled and stochastic iterates that evolve on (possibly) different +time scales. To overcome this challenge, we developed a coupled Lyapunov-based +approach, which may be of independent interest to the broader community +studying the convergence behavior of stochastic approximation algorithms. + +
+
+ comment: A preliminary version [arXiv:2303.03100] of this paper, with a subset + of the results that are presented here, was presented at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Prediction of COPD Using Machine Learning, Clinical Summary Notes, and + Vital Signs + + +
+ Chronic obstructive pulmonary disease (COPD) is a chronic inflammatory lung
+disease that causes obstructed airflow from the lungs. In the United States,
+more than 15.7 million Americans have been diagnosed with COPD, with 96% of
+individuals living with at least one other chronic health condition. It is the
+4th leading cause of death in the country. Over 2.2 million patients are
+admitted to hospitals annually due to COPD exacerbations. Monitoring patients
+and predicting exacerbations in a timely manner could save lives. This paper
+presents two predictive models for COPD exacerbation using AI and natural
+language processing (NLP) approaches. These models use respiration summary
+notes, symptoms, and vital signs. To train and test these models, data records
+containing physiologic signals and vital signs time series were used. These
+records were captured from patient monitors and comprehensive clinical data
+obtained from hospital medical information systems for tens of thousands of
+Intensive Care Unit (ICU) patients. We achieved an area under the receiver
+operating characteristic (ROC) curve of 0.82 in the detection and prediction of
+COPD exacerbations.
+
+
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Explanation Space: A New Perspective into Time Series Interpretability + + +
+ Human understandable explanation of deep learning models is necessary for +many critical and sensitive applications. Unlike image or tabular data where +the importance of each input feature (for the classifier's decision) can be +directly projected into the input, time series distinguishable features (e.g. +dominant frequency) are often hard to manifest in time domain for a user to +easily understand. Moreover, most explanation methods require a baseline value +as an indication of the absence of any feature. However, the notion of lack of +feature, which is often defined as black pixels for vision tasks or zero/mean +values for tabular data, is not well-defined in time series. Despite the +adoption of explainable AI methods (XAI) from tabular and vision domain into +time series domain, these differences limit the application of these XAI +methods in practice. In this paper, we propose a simple yet effective method +that allows a model originally trained on time domain to be interpreted in +other explanation spaces using existing methods. We suggest four explanation +spaces that each can potentially alleviate these issues in certain types of +time series. Our method can be readily adopted in existing platforms without +any change to trained models or XAI methods. The code is available at +https://github.com/shrezaei/TS-X-spaces. + +
+
+
+
+
+ + ♻ ☆ Simultaneous Masking, Not Prompting Optimization: A Paradigm Shift in + Fine-tuning LLMs for Simultaneous Translation + + +
+ Large language models (LLMs) have achieved state-of-the-art performance in +various language processing tasks, motivating their adoption in simultaneous +translation. Current fine-tuning methods to adapt LLMs for simultaneous +translation focus on prompting optimization strategies using either data +augmentation or prompt structure modifications. However, these methods suffer +from several issues, such as unnecessarily expanded training sets, +computational inefficiency from dumping the key and value cache, increased +prompt sizes, or restriction to a single decision policy. To eliminate these +issues, in this work, we propose SimulMask, a new paradigm for fine-tuning LLMs +for simultaneous translation. It utilizes a novel attention mask approach that +models simultaneous translation during fine-tuning by masking attention for a +desired decision policy. Applying the proposed SimulMask on a Falcon LLM for +the IWSLT 2017 dataset, we have observed a significant translation quality +improvement compared to state-of-the-art prompting optimization strategies on +five language pairs while reducing the computational cost. + +
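+
+ A hedged sketch of the general idea of policy-dependent attention masking,
+written here for a simple wait-k policy, is given below; SimulMask's actual
+mask construction and its integration into LLM fine-tuning may differ.
+
+```python
+import numpy as np
+
+def wait_k_attention_mask(src_len, tgt_len, k):
+    """Boolean mask (True = may attend) in which target position t only sees the
+    first min(k + t, src_len) source tokens, i.e. a wait-k decision policy."""
+    mask = np.zeros((tgt_len, src_len), dtype=bool)
+    for t in range(tgt_len):
+        mask[t, : min(k + t, src_len)] = True
+    return mask
+
+print(wait_k_attention_mask(src_len=6, tgt_len=4, k=2).astype(int))
+```
+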
+
+
+
+
+
+ ♻ ☆ Stacked ensemble-based mutagenicity prediction model using multiple
+  modalities with graph attention network
+
+
+
+ Mutagenicity is a concern due to its association with genetic mutations,
+which can result in a variety of negative consequences, including the
+development of cancer. Earlier identification of mutagenic compounds in the
+drug development process is therefore crucial for preventing the progression of
+unsafe candidates and reducing development costs. While computational
+techniques, especially machine learning models, have become increasingly
+prevalent for this endpoint, they rely on a single modality. In this work, we
+introduce a novel stacked-ensemble-based mutagenicity prediction model which
+incorporates multiple modalities, such as the simplified molecular input line
+entry system (SMILES) and the molecular graph. These modalities capture diverse
+information about molecules, such as substructural, physicochemical,
+geometrical and topological properties. To derive substructural, geometrical
+and physicochemical information, we use SMILES, while topological information
+is extracted through a graph attention network (GAT) via the molecular graph.
+Our model uses a stacked ensemble of machine learning classifiers to make
+predictions using these multiple features. We employ the explainable artificial
+intelligence (XAI) technique SHAP (Shapley Additive Explanations) to determine
+the significance of each classifier and the most relevant features in the
+prediction. We demonstrate that our method surpasses SOTA methods on two
+standard datasets across various metrics. Notably, we achieve an area under the
+curve of 95.21\% on the Hansen benchmark dataset, affirming the efficacy of our
+method in predicting mutagenicity. We believe that this research will captivate
+the interest of both clinicians and computational biologists engaged in
+translational research.
+
+
+
+ comment: Submitted to a journal +
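+
+ A hedged sketch of the stacking step with generic base learners and random
+stand-in features (in place of SMILES-derived descriptors and GAT graph
+embeddings) is given below, using scikit-learn; the chosen classifiers and
+meta-learner are illustrative assumptions, not the paper's exact configuration.
+
+```python
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier, StackingClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+
+rng = np.random.default_rng(0)
+n = 200
+desc_features = rng.standard_normal((n, 16))   # stand-in for SMILES-derived descriptors
+graph_features = rng.standard_normal((n, 8))   # stand-in for GAT graph embeddings
+X = np.hstack([desc_features, graph_features])
+y = rng.integers(0, 2, size=n)                 # toy mutagenic / non-mutagenic labels
+
+stack = StackingClassifier(
+    estimators=[
+        ("rf", RandomForestClassifier(n_estimators=100, random_state=0)),
+        ("svm", SVC(probability=True, random_state=0)),
+    ],
+    final_estimator=LogisticRegression(),
+    cv=5,
+)
+stack.fit(X, y)
+print(stack.predict(X[:5]))
+```
+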
+
+
+
+
+ + ♻ ☆ UserSumBench: A Benchmark Framework for Evaluating User Summarization + Approaches + + +
+ Large language models (LLMs) have shown remarkable capabilities in generating
+user summaries from a long list of raw user activity data. These summaries
+capture essential user information such as preferences and interests, and
+therefore are invaluable for LLM-based personalization applications, such as
+explainable recommender systems. However, the development of new summarization
+techniques is hindered by the lack of ground-truth labels, the inherent
+subjectivity of user summaries, and human evaluation, which is often costly and
+time-consuming. To address these challenges, we introduce UserSumBench, a
+benchmark framework designed to facilitate iterative development of LLM-based
+summarization approaches. This framework offers two key components: (1) A
+reference-free summary quality metric. We show that this metric is effective
+and aligned with human preferences across three diverse datasets (MovieLens,
+Yelp and Amazon Review). (2) A novel robust summarization method that leverages
+a time-hierarchical summarizer and a self-critique verifier to produce
+high-quality summaries while eliminating hallucination. This method serves as a
+strong baseline for further innovation in summarization techniques.
+
+
+
+
+
+
+ + ♻ ☆ inGRASS: Incremental Graph Spectral Sparsification via + Low-Resistance-Diameter Decomposition + + +
+ This work presents inGRASS, a novel algorithm designed for incremental +spectral sparsification of large undirected graphs. The proposed inGRASS +algorithm is highly scalable and parallel-friendly, having a nearly-linear time +complexity for the setup phase and the ability to update the spectral +sparsifier in $O(\log N)$ time for each incremental change made to the original +graph with $N$ nodes. A key component in the setup phase of inGRASS is a +multilevel resistance embedding framework introduced for efficiently +identifying spectrally-critical edges and effectively detecting redundant ones, +which is achieved by decomposing the initial sparsifier into many node clusters +with bounded effective-resistance diameters leveraging a +low-resistance-diameter decomposition (LRD) scheme. The update phase of inGRASS +exploits low-dimensional node embedding vectors for efficiently estimating the +importance and uniqueness of each newly added edge. As demonstrated through +extensive experiments, inGRASS achieves up to over $200 \times$ speedups while +retaining comparable solution quality in incremental spectral sparsification of +graphs obtained from various datasets, such as circuit simulations, finite +element analysis, and social networks. + +
+
+ comment: Accepted on DAC 2024 +
+
+
+
+
+ + ♻ ☆ Refusing Safe Prompts for Multi-modal Large Language Models + + +
+ Multimodal large language models (MLLMs) have become the cornerstone of +today's generative AI ecosystem, sparking intense competition among tech giants +and startups. In particular, an MLLM generates a text response given a prompt +consisting of an image and a question. While state-of-the-art MLLMs use safety +filters and alignment techniques to refuse unsafe prompts, in this work, we +introduce MLLM-Refusal, the first method that induces refusals for safe +prompts. In particular, our MLLM-Refusal optimizes a nearly-imperceptible +refusal perturbation and adds it to an image, causing target MLLMs to likely +refuse a safe prompt containing the perturbed image and a safe question. +Specifically, we formulate MLLM-Refusal as a constrained optimization problem +and propose an algorithm to solve it. Our method offers competitive advantages +for MLLM model providers by potentially disrupting user experiences of +competing MLLMs, since competing MLLM's users will receive unexpected refusals +when they unwittingly use these perturbed images in their prompts. We evaluate +MLLM-Refusal on four MLLMs across four datasets, demonstrating its +effectiveness in causing competing MLLMs to refuse safe prompts while not +affecting non-competing MLLMs. Furthermore, we explore three potential +countermeasures-adding Gaussian noise, DiffPure, and adversarial training. Our +results show that though they can mitigate MLLM-Refusal's effectiveness, they +also sacrifice the accuracy and/or efficiency of the competing MLLM. The code +is available at https://github.com/Sadcardation/MLLM-Refusal. + +
+
+
+
+
+ + ♻ ☆ QEDCartographer: Automating Formal Verification Using Reward-Free + Reinforcement Learning ICSE + + +
+ Formal verification is a promising method for producing reliable software, +but the difficulty of manually writing verification proofs severely limits its +utility in practice. Recent methods have automated some proof synthesis by +guiding a search through the proof space using a theorem prover. Unfortunately, +the theorem prover provides only the crudest estimate of progress, resulting in +effectively undirected search. To address this problem, we create +QEDCartographer, an automated proof-synthesis tool that combines supervised and +reinforcement learning to more effectively explore the proof space. +QEDCartographer incorporates the proofs' branching structure, enabling +reward-free search and overcoming the sparse reward problem inherent to formal +verification. We evaluate QEDCartographer using the CoqGym benchmark of 68.5K +theorems from 124 open-source Coq projects. QEDCartographer fully automatically +proves 21.4% of the test-set theorems. Previous search-based proof-synthesis +tools Tok, Tac, ASTactic, Passport, and Proverbot9001, which rely only on +supervised learning, prove 9.6%, 9.8%, 10.9%, 12.5%, and 19.8%, respectively. +Diva, which combines 62 tools, proves 19.2%. Comparing to the most effective +prior tool, Proverbot9001, QEDCartographer produces 34% shorter proofs 29% +faster, on average over the theorems both tools prove. Together, +QEDCartographer and non-learning-based CoqHammer prove 30.3% of the theorems, +while CoqHammer alone proves 26.6%. Our work demonstrates that reinforcement +learning is a fruitful research direction for improving proof-synthesis tools' +search mechanisms. + +
+
+ comment: Published in the International Conference on Software Engineering + (ICSE) 2025: Alex Sanchez-Stern, Abhishek Varghese, Zhanna Kaufman, Dylan + Zhang, Talia Ringer, and Yuriy Brun, QEDCartographer: Automating Formal + Verification Using Reward-Free Reinforcement Learning, in Proceedings of the + 47th International Conference on Software Engineering (ICSE), 2025 +
+
+
+
+
+ + ♻ ☆ Visual Prompting Upgrades Neural Network Sparsification: A Data-Model + Perspective + + +
+ The rapid development of large-scale deep learning models calls into question
+the affordability of hardware platforms, which necessitates pruning to reduce
+their computational and memory footprints. The resulting sparse neural networks
+have demonstrated numerous favorable properties, such as low complexity and
+undamaged generalization. Most of the prominent pruning strategies are invented
+from a model-centric perspective, focusing on searching and preserving crucial
+weights by analyzing network topologies. However, the role of data and its
+interplay with model-centric pruning has remained relatively unexplored. In
+this research, we introduce a novel data-model co-design perspective: to
+promote superior weight sparsity by learning important model topology and
+adequate input data in a synergetic manner. Specifically, customized Visual
+Prompts are mounted to upgrade neural Network sparsification in our proposed
+VPNs framework. As a pioneering effort, this paper conducts systematic
+investigations into the impact of different visual prompts on model pruning and
+suggests an effective joint optimization approach. Extensive experiments with 3
+network architectures and 8 datasets demonstrate substantial performance
+improvements from VPNs over existing state-of-the-art pruning algorithms.
+Furthermore, we find that subnetworks discovered by VPNs from pre-trained
+models enjoy better transferability across diverse downstream scenarios. These
+insights shed light on new promising possibilities of data-model co-designs for
+vision model sparsification.
+
+
+
+
+
+
+ + ♻ ☆ A Survey on Efficient Federated Learning Methods for Foundation Model + Training IJCAI 2024 + + +
+ Federated Learning (FL) has become an established technique to facilitate +privacy-preserving collaborative training across a multitude of clients. +However, new approaches to FL often discuss their contributions involving small +deep-learning models only and focus on training full models on clients. In the +wake of Foundation Models (FM), the reality is different for many deep learning +applications. Typically, FMs have already been pre-trained across a wide +variety of tasks and can be fine-tuned to specific downstream tasks over +significantly smaller datasets than required for full model training. However, +access to such datasets is often challenging. By its design, FL can help to +open data silos. With this survey, we introduce a novel taxonomy focused on +computational and communication efficiency, the vital elements to make use of +FMs in FL systems. We discuss the benefits and drawbacks of parameter-efficient +fine-tuning (PEFT) for FL applications, elaborate on the readiness of FL +frameworks to work with FMs, and provide future research opportunities on how +to evaluate generative models in FL as well as the interplay of privacy and +PEFT. + +
+
+ comment: Accepted for publication at IJCAI 2024. Please cite the published + paper via https://doi.org/10.24963/ijcai.2024/919 +
+
+
+
+
+ + ♻ ☆ One flow to correct them all: improving simulations in high-energy + physics with a single normalising flow and a switch + + +
+ Simulated events are key ingredients in almost all high-energy physics +analyses. However, imperfections in the simulation can lead to sizeable +differences between the observed data and simulated events. The effects of such +mismodelling on relevant observables must be corrected either effectively via +scale factors, with weights or by modifying the distributions of the +observables and their correlations. We introduce a correction method that +transforms one multidimensional distribution (simulation) into another one +(data) using a simple architecture based on a single normalising flow with a +boolean condition. We demonstrate the effectiveness of the method on a +physics-inspired toy dataset with non-trivial mismodelling of several +observables and their correlations. + +
+
+ comment: 19 pages, 12 figures, Dataset: + https://doi.org/10.5281/zenodo.13305706 +
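+ A minimal PyTorch sketch of the core building block, an affine coupling layer
+whose scale-and-shift network also sees a boolean switch (0 = simulation,
+1 = data); this illustrates the idea and is not the authors' implementation.
+
+import torch
+import torch.nn as nn
+
+class ConditionalAffineCoupling(nn.Module):
+    """One coupling block of a normalising flow conditioned on a boolean flag."""
+    def __init__(self, dim, hidden=64):
+        super().__init__()
+        self.d = dim // 2
+        self.net = nn.Sequential(nn.Linear(self.d + 1, hidden), nn.ReLU(),
+                                 nn.Linear(hidden, 2 * (dim - self.d)))
+
+    def forward(self, x, is_data):                 # x: (B, dim), is_data: (B, 1) in {0, 1}
+        x1, x2 = x[:, :self.d], x[:, self.d:]
+        log_s, t = self.net(torch.cat([x1, is_data], dim=1)).chunk(2, dim=1)
+        log_s = torch.tanh(log_s)                  # keep scales numerically well behaved
+        y2 = x2 * torch.exp(log_s) + t
+        return torch.cat([x1, y2], dim=1), log_s.sum(dim=1)   # output and log|det J|
+
+x, flag = torch.randn(4, 6), torch.ones(4, 1)
+y, logdet = ConditionalAffineCoupling(6)(x, flag)
+
+ Stacking such blocks (with feature permutations in between) and maximising the
+likelihood on both simulated and observed samples lets a single flow, switched
+by the flag, learn the simulation-to-data correction.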
+
+
+
+
+ + ♻ ☆ Less is More: Fewer Interpretable Region via Submodular Subset Selection ICLR 2024 + + +
+ Image attribution algorithms aim to identify important regions that are +highly relevant to model decisions. Although existing attribution solutions can +effectively assign importance to target elements, they still face the following +challenges: 1) existing attribution methods generate inaccurate small regions +thus misleading the direction of correct attribution, and 2) the model cannot +produce good attribution results for samples with wrong predictions. To address +the above challenges, this paper re-models the above image attribution problem +as a submodular subset selection problem, aiming to enhance model +interpretability using fewer regions. To address the lack of attention to local +regions, we construct a novel submodular function to discover more accurate +small interpretation regions. To enhance the attribution effect for all +samples, we also impose four different constraints on the selection of +sub-regions, i.e., confidence, effectiveness, consistency, and collaboration +scores, to assess the importance of various subsets. Moreover, our theoretical +analysis substantiates that the proposed function is in fact submodular. +Extensive experiments show that the proposed method outperforms SOTA methods on +two face datasets (Celeb-A and VGG-Face2) and one fine-grained dataset +(CUB-200-2011). For correctly predicted samples, the proposed method improves +the Deletion and Insertion scores with an average of 4.9% and 2.5% gain +relative to HSIC-Attribution. For incorrectly predicted samples, our method +achieves gains of 81.0% and 18.4% compared to the HSIC-Attribution algorithm in +the average highest confidence and Insertion score respectively. The code is +released at https://github.com/RuoyuChen10/SMDL-Attribution. + +
+
+ comment: Accepted to ICLR 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ OpenVLA: An Open-Source Vision-Language-Action Model + + +
+ Large policies pretrained on a combination of Internet-scale vision-language +data and diverse robot demonstrations have the potential to change how we teach +robots new skills: rather than training new behaviors from scratch, we can +fine-tune such vision-language-action (VLA) models to obtain robust, +generalizable policies for visuomotor control. Yet, widespread adoption of VLAs +for robotics has been challenging as 1) existing VLAs are largely closed and +inaccessible to the public, and 2) prior work fails to explore methods for +efficiently fine-tuning VLAs for new tasks, a key component for adoption. +Addressing these challenges, we introduce OpenVLA, a 7B-parameter open-source +VLA trained on a diverse collection of 970k real-world robot demonstrations. +OpenVLA builds on a Llama 2 language model combined with a visual encoder that +fuses pretrained features from DINOv2 and SigLIP. As a product of the added +data diversity and new model components, OpenVLA demonstrates strong results +for generalist manipulation, outperforming closed models such as RT-2-X (55B) +by 16.5% in absolute task success rate across 29 tasks and multiple robot +embodiments, with 7x fewer parameters. We further show that we can effectively +fine-tune OpenVLA for new settings, with especially strong generalization +results in multi-task environments involving multiple objects and strong +language grounding abilities, and outperform expressive from-scratch imitation +learning methods such as Diffusion Policy by 20.4%. We also explore compute +efficiency; as a separate contribution, we show that OpenVLA can be fine-tuned +on consumer GPUs via modern low-rank adaptation methods and served efficiently +via quantization without a hit to downstream success rate. Finally, we release +model checkpoints, fine-tuning notebooks, and our PyTorch codebase with +built-in support for training VLAs at scale on Open X-Embodiment datasets. + +
+
+ comment: Website: https://openvla.github.io/ +
+
+
+
+
+ + ♻ ☆ Unlearning Targeted Information via Single Layer Unlearning Gradient + + +
+ Unauthorized privacy-related and copyrighted content generation using
+generative AI is becoming a significant concern for human society, raising
+ethical, legal, and privacy issues that demand urgent attention. The EU's
+General Data Protection Regulation (GDPR) includes a "right to be forgotten,"
+which allows individuals to request the deletion of their personal data.
+However, this primarily applies to data stored in traditional databases, not
+AI models. Recently, machine unlearning techniques have arisen that attempt to
+eliminate the influence of sensitive content used during AI model training,
+but they often require extensive updates to the deployed systems and incur
+substantial computational costs. In this work, we propose a novel and
+efficient method called Single Layer Unlearning Gradient (SLUG), which can
+unlearn targeted information by updating targeted layers of a model using a
+one-time gradient computation. Our method is highly modular and enables the
+selective removal of multiple sensitive concepts, such as celebrity names and
+copyrighted content, from the generated outputs of widely used foundation
+models (e.g., CLIP) and generative models (e.g., Stable Diffusion). Broadly,
+our method ensures AI-generated content complies with privacy regulations and
+intellectual property laws, fostering responsible use of generative models,
+mitigating legal risks and promoting a trustworthy, socially responsible AI
+ecosystem.
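+ The single-update idea can be sketched in a few lines of PyTorch; the layer
+name and unlearning loss below are placeholders, and this is an illustration,
+not the authors' released code.
+
+import torch
+
+def single_layer_unlearn(model, layer_name, unlearn_loss_fn, batch, lr=1e-3):
+    """One-time gradient on an unlearning objective, applied to one targeted layer."""
+    target = dict(model.named_parameters())[layer_name]
+    loss = unlearn_loss_fn(model, batch)          # e.g. similarity to the concept to forget
+    grad, = torch.autograd.grad(loss, target)     # single gradient computation
+    with torch.no_grad():
+        target.add_(grad, alpha=-lr)              # single update; the rest of the model is untouched
+    return model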
+
+
+
+
+ + ♻ ☆ Multiply-Robust Causal Change Attribution + + +
+ Comparing two samples of data, we observe a change in the distribution of an +outcome variable. In the presence of multiple explanatory variables, how much +of the change can be explained by each possible cause? We develop a new +estimation strategy that, given a causal model, combines regression and +re-weighting methods to quantify the contribution of each causal mechanism. Our +proposed methodology is multiply robust, meaning that it still recovers the +target parameter under partial misspecification. We prove that our estimator is +consistent and asymptotically normal. Moreover, it can be incorporated into +existing frameworks for causal attribution, such as Shapley values, which will +inherit the consistency and large-sample distribution properties. Our method +demonstrates excellent performance in Monte Carlo simulations, and we show its +usefulness in an empirical application. Our method is implemented as part of +the Python library DoWhy (arXiv:2011.04216, arXiv:2206.06821). + +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ SegTalker: Segmentation-based Talking Face Generation with Mask-guided + Local Editing + + +
+ Audio-driven talking face generation aims to synthesize video with lip
+movements synchronized to input audio. However, current generative techniques
+face challenges in preserving intricate regional textures (skin, teeth). To
+address the aforementioned challenges, we propose a novel framework called
+SegTalker to decouple lip movements and image textures by introducing
+segmentation as an intermediate representation. Specifically, given the image
+mask produced by a parsing network, we first leverage the speech to drive the
+mask and generate talking segmentation. Then we disentangle semantic regions
+of the image into style codes using a mask-guided encoder. Ultimately, we
+inject the previously generated talking segmentation and style codes into a
+mask-guided StyleGAN to synthesize video frames. In this way, most of the
+textures are fully preserved. Moreover, our approach can inherently achieve
+background separation and facilitate mask-guided facial local editing. In
+particular, by editing the mask and swapping the region textures from a given
+reference image (e.g. hair, lip, eyebrows), our approach enables seamless
+facial editing when generating talking face videos. Experiments demonstrate
+that our proposed approach can effectively preserve texture details and
+generate temporally consistent video while remaining competitive in lip
+synchronization. Quantitative and qualitative results on the HDTF and MEAD
+datasets illustrate the superior performance of our method over existing
+methods.
+
+ comment: 10 pages, 7 figures, 3 tables +
+
+
+
+
+ + ☆ Make Graph-based Referring Expression Comprehension Great Again through + Expression-guided Dynamic Gating and Regression + + +
+ One common belief is that with complex models and pre-training on large-scale
+datasets, transformer-based methods for referring expression comprehension
+(REC) perform much better than existing graph-based methods. We observe that
+since most graph-based methods adopt an off-the-shelf detector to locate
+candidate objects (i.e., regions detected by the object detector), they face
+two challenges that result in subpar performance: (1) the presence of
+significant noise caused by numerous irrelevant objects during reasoning, and
+(2) inaccurate localization outcomes attributed to the provided detector. To
+address these issues, we introduce a plug-and-adapt module guided by
+sub-expressions, called dynamic gate constraint (DGC), which can adaptively
+disable irrelevant proposals and their connections in graphs during reasoning.
+We further introduce an expression-guided regression strategy (EGR) to refine
+location prediction. Extensive experimental results on the RefCOCO, RefCOCO+,
+RefCOCOg, Flickr30K, RefClef, and Ref-reasoning datasets demonstrate the
+effectiveness of the DGC module and the EGR strategy in consistently boosting
+the performance of various graph-based REC methods. Without any pretraining,
+the proposed graph-based method achieves better performance than the
+state-of-the-art (SOTA) transformer-based methods.
+
+ comment: 12 pages to appear in IEEE Transactions on Multimedia +
+
+
+
+
+ + ☆ Estimating Indoor Scene Depth Maps from Ultrasonic Echoes ICIP 2024 + +
+ Measuring 3D geometric structures of indoor scenes requires dedicated depth
+sensors, which are not always available. Echo-based depth estimation has
+recently been studied as a promising alternative solution. All previous
+studies have assumed the use of echoes in the audible range. However, one
+major problem is that audible echoes cannot be used in quiet spaces or other
+situations where producing audible sounds is prohibited. In this paper, we
+consider echo-based depth estimation using inaudible ultrasonic echoes. While
+ultrasonic waves provide high measurement accuracy in theory, the actual depth
+estimation accuracy when ultrasonic echoes are used has remained unclear, due
+to their sensitivity to noise and susceptibility to attenuation. We first
+investigate the depth estimation accuracy when the frequency of the sound
+source is restricted to the high-frequency band, and find that the accuracy
+decreases when the frequency is limited to ultrasonic ranges. Based on this
+observation, we propose a novel deep learning method to improve the accuracy
+of ultrasonic echo-based depth estimation by using audible echoes as auxiliary
+data only during training. Experimental results with a public dataset
+demonstrate that our method improves the estimation accuracy.
+
+ comment: ICIP 2024 +
+
+
+
+
+ + ☆ WaterMAS: Sharpness-Aware Maximization for Neural Network Watermarking + + +
+ Nowadays, deep neural networks are used for solving complex tasks in several
+critical applications, and protecting both their integrity and intellectual
+property rights (IPR) has become of utmost importance. To this end, we advance
+WaterMAS, a substitutive, white-box neural network watermarking method that
+improves the trade-off among robustness, imperceptibility, and computational
+complexity, while making provisions for increased data payload and security.
+WaterMAS insertion keeps the watermarked weights unchanged while sharpening
+their underlying gradient space. The robustness is thus ensured by limiting
+the attack's strength: even small alterations of the watermarked weights would
+impact the model's performance. The imperceptibility is ensured by inserting
+the watermark during the training process. The relationship among the WaterMAS
+data payload, imperceptibility, and robustness properties is discussed. The
+secret key is represented by the positions of the weights conveying the
+watermark, randomly chosen through multiple layers of the model. The security
+is evaluated by investigating the case in which an attacker would intercept
+the key. The experimental validations consider 5 models and 2 tasks (VGG16,
+ResNet18, MobileNetV3, SwinT for CIFAR10 image classification, and DeepLabV3
+for Cityscapes image segmentation) as well as 4 types of attacks (Gaussian
+noise addition, pruning, fine-tuning, and quantization). The code will be
+released open-source upon acceptance of the article.
+
+
+
+
+ + ☆ MetaBGM: Dynamic Soundtrack Transformation For Continuous Multi-Scene + Experiences With Ambient Awareness And Personalization + + +
+ This paper introduces MetaBGM, a groundbreaking framework for generating +background music that adapts to dynamic scenes and real-time user interactions. +We define multi-scene as variations in environmental contexts, such as +transitions in game settings or movie scenes. To tackle the challenge of +converting backend data into music description texts for audio generation +models, MetaBGM employs a novel two-stage generation approach that transforms +continuous scene and user state data into these texts, which are then fed into +an audio generation model for real-time soundtrack creation. Experimental +results demonstrate that MetaBGM effectively generates contextually relevant +and dynamic background music for interactive applications. + +
+
+
+
+
+ + ♻ ☆ Zero-Shot Character Identification and Speaker Prediction in Comics via + Iterative Multimodal Fusion + + +
+ Recognizing characters and predicting speakers of dialogue are critical for
+comic processing tasks, such as voice generation or translation. However,
+because characters vary by comic title, supervised learning approaches such as
+training character classifiers, which require specific annotations for each
+comic title, are infeasible. This motivates us to propose a novel zero-shot
+approach, allowing machines to identify characters and predict speaker names
+based solely on unannotated comic images. In spite of their importance in
+real-world applications, these tasks have largely remained unexplored due to
+challenges in story comprehension and multimodal integration. Recent large
+language models (LLMs) have shown great capability for text understanding and
+reasoning, while their application to multimodal content analysis is still an
+open problem. To address this problem, we propose an iterative multimodal
+framework, the first to employ multimodal information for both character
+identification and speaker prediction tasks. Our experiments demonstrate the
+effectiveness of the proposed framework, establishing a robust baseline for
+these tasks. Furthermore, since our method requires no training data or
+annotations, it can be used as-is on any comic series.
+
+ comment: Accepted to ACM Multimedia 2024. Project page: + https://liyingxuan1012.github.io/zeroshot-speaker-prediction ; Github repo: + https://github.com/liyingxuan1012/zeroshot-speaker-prediction +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 81 + +
+
+
+ + ☆ RoboTwin: Dual-Arm Robot Benchmark with Generative Digital Twins (early + version) + + +
+ Effective collaboration of dual-arm robots and their tool use capabilities
+are increasingly important areas in the advancement of robotics. These skills
+play a significant role in expanding robots' ability to operate in diverse
+real-world environments. However, progress is impeded by the scarcity of
+specialized training data. This paper introduces RoboTwin, a novel benchmark
+dataset combining real-world teleoperated data with synthetic data from
+digital twins, designed for dual-arm robotic scenarios. Using the COBOT Magic
+platform, we have collected diverse data on tool usage and human-robot
+interaction. We present an innovative approach to creating digital twins using
+AI-generated content, transforming 2D images into detailed 3D models.
+Furthermore, we utilize large language models to generate expert-level
+training data and task-specific pose sequences oriented toward functionality.
+Our key contributions are: 1) the RoboTwin benchmark dataset, 2) an efficient
+real-to-simulation pipeline, and 3) the use of language models for automatic
+expert-level data generation. These advancements are designed to address the
+shortage of robotic training data, potentially accelerating the development of
+more capable and versatile robotic systems for a wide range of real-world
+applications. The project page is available at
+https://robotwin-benchmark.github.io/early-version/
+
+ comment: Project page: https://robotwin-benchmark.github.io/early-version/ +
+
+
+
+
+ + ☆ Masked Diffusion Models are Secretly Time-Agnostic Masked Models and + Exploit Inaccurate Categorical Sampling + + +
+ Masked diffusion models (MDMs) have emerged as a popular research topic for +generative modeling of discrete data, thanks to their superior performance over +other discrete diffusion models, and are rivaling the auto-regressive models +(ARMs) for language modeling tasks. The recent effort in simplifying the masked +diffusion framework further leads to alignment with continuous-space diffusion +models and more principled training and sampling recipes. In this paper, +however, we reveal that both training and sampling of MDMs are theoretically +free from the time variable, arguably the key signature of diffusion models, +and are instead equivalent to masked models. The connection on the sampling +aspect is drawn by our proposed first-hitting sampler (FHS). Specifically, we +show that the FHS is theoretically equivalent to MDMs' original generation +process while significantly alleviating the time-consuming categorical sampling +and achieving a 20$\times$ speedup. In addition, our investigation challenges +previous claims that MDMs can surpass ARMs in generative perplexity. We +identify, for the first time, an underlying numerical issue, even with the +32-bit floating-point precision, which results in inaccurate categorical +sampling. We show that the numerical issue lowers the effective temperature +both theoretically and empirically, leading to unfair assessments of MDMs' +generation results in the previous literature. + +
+
+ comment: 40 pages +
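+ The precision issue concerns how categorical samples are drawn. A
+self-contained way to see what is at stake is to draw Gumbel-max samples in
+float32 and float64 and compare empirical frequencies against the target
+distribution; this toy sketch only illustrates the sampling mechanics, while
+the paper's analysis of the effective-temperature effect is more detailed.
+
+import torch
+
+torch.manual_seed(0)
+logits = torch.tensor([8.0, 0.0, -4.0])
+target = torch.softmax(logits, dim=0)
+
+def gumbel_max(logits, n, dtype):
+    l = logits.to(dtype)
+    u = torch.rand(n, l.numel(), dtype=dtype).clamp_min(torch.finfo(dtype).tiny)
+    g = -torch.log(-torch.log(u))                 # Gumbel(0, 1) noise
+    return torch.argmax(l + g, dim=1)
+
+for dtype in (torch.float32, torch.float64):
+    s = gumbel_max(logits, 200_000, dtype)
+    freq = torch.bincount(s, minlength=3).double() / s.numel()
+    print(dtype, freq.tolist(), target.tolist())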
+
+
+
+
+ + ☆ LongCite: Enabling LLMs to Generate Fine-grained Citations in + Long-context QA + + +
+ Though current long-context large language models (LLMs) have demonstrated +impressive capacities in answering user questions based on extensive text, the +lack of citations in their responses makes user verification difficult, leading +to concerns about their trustworthiness due to their potential hallucinations. +In this work, we aim to enable long-context LLMs to generate responses with +fine-grained sentence-level citations, improving their faithfulness and +verifiability. We first introduce LongBench-Cite, an automated benchmark for +assessing current LLMs' performance in Long-Context Question Answering with +Citations (LQAC), revealing considerable room for improvement. To this end, we +propose CoF (Coarse to Fine), a novel pipeline that utilizes off-the-shelf LLMs +to automatically generate long-context QA instances with precise sentence-level +citations, and leverage this pipeline to construct LongCite-45k, a large-scale +SFT dataset for LQAC. Finally, we train LongCite-8B and LongCite-9B using the +LongCite-45k dataset, successfully enabling their generation of accurate +responses and fine-grained sentence-level citations in a single output. The +evaluation results on LongBench-Cite show that our trained models achieve +state-of-the-art citation quality, surpassing advanced proprietary models +including GPT-4o. + +
+
+
+
+
+ + ☆ LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via + Hybrid Architecture + + +
+ Expanding the long-context capabilities of Multi-modal Large Language +Models~(MLLMs) is crucial for video understanding, high-resolution image +understanding, and multi-modal agents. This involves a series of systematic +optimizations, including model architecture, data construction and training +strategy, particularly addressing challenges such as \textit{degraded +performance with more images} and \textit{high computational costs}. In this +paper, we adapt the model architecture to a hybrid of Mamba and Transformer +blocks, approach data construction with both temporal and spatial dependencies +among multiple images and employ a progressive training strategy. The released +model \textbf{LongLLaVA}~(\textbf{Long}-Context \textbf{L}arge +\textbf{L}anguage \textbf{a}nd \textbf{V}ision \textbf{A}ssistant) is the first +hybrid MLLM, which achieved a better balance between efficiency and +effectiveness. LongLLaVA not only achieves competitive results across various +benchmarks, but also maintains high throughput and low memory consumption. +Especially, it could process nearly a thousand images on a single A100 80GB +GPU, showing promising application prospects for a wide range of tasks. + +
+
+ comment: 19 pages, 7 figures, 6 tables +
+
+
+
+
+ + ☆ Configurable Foundation Models: Building LLMs from a Modular Perspective + + +
+ Advancements in LLMs have recently unveiled challenges tied to computational +efficiency and continual scalability due to their requirements of huge +parameters, making the applications and evolution of these models on devices +with limited computation resources and scenarios requiring various abilities +increasingly cumbersome. Inspired by modularity within the human brain, there +is a growing tendency to decompose LLMs into numerous functional modules, +allowing for inference with part of modules and dynamic assembly of modules to +tackle complex tasks, such as mixture-of-experts. To highlight the inherent +efficiency and composability of the modular approach, we coin the term brick to +represent each functional module, designating the modularized structure as +configurable foundation models. In this paper, we offer a comprehensive +overview and investigation of the construction, utilization, and limitation of +configurable foundation models. We first formalize modules into emergent bricks +- functional neuron partitions that emerge during the pre-training phase, and +customized bricks - bricks constructed via additional post-training to improve +the capabilities and knowledge of LLMs. Based on diverse functional bricks, we +further present four brick-oriented operations: retrieval and routing, merging, +updating, and growing. These operations allow for dynamic configuration of LLMs +based on instructions to handle complex tasks. To verify our perspective, we +conduct an empirical analysis on widely-used LLMs. We find that the FFN layers +follow modular patterns with functional specialization of neurons and +functional neuron partitions. Finally, we highlight several open issues and +directions for future research. Overall, this paper aims to offer a fresh +modular perspective on existing LLM research and inspire the future creation of +more efficient and scalable foundational models. + +
+
+
+
+
+ + ☆ Historical German Text Normalization Using Type- and Token-Based + Language Modeling + + +
+ Historic variations in spelling pose a challenge for full-text search or
+natural language processing on historical digitized texts. To minimize the gap
+between the historic orthography and contemporary spelling, an automatic
+orthographic normalization of the historical source material is usually
+pursued. This report proposes a normalization system for German literary texts
+from c. 1700-1900, trained on a parallel corpus. The proposed system makes use
+of a machine learning approach using Transformer language models, combining an
+encoder-decoder model to normalize individual word types, and a pre-trained
+causal language model to adjust these normalizations within their context. An
+extensive evaluation shows that the proposed system provides state-of-the-art
+accuracy, comparable with that of a much larger, fully end-to-end
+sentence-based normalization system that fine-tunes a pre-trained Transformer
+large language model. However, the normalization of historical text remains a
+challenge due to the models' difficulty in generalizing and the lack of
+extensive high-quality parallel data.
+
+ comment: 27 pages, 3 figures +
+
+
+
+
+ + ☆ R2GQA: Retriever-Reader-Generator Question Answering System to Support + Students Understanding Legal Regulations in Higher Education + + +
+ In this article, we propose the R2GQA system, a Retriever-Reader-Generator +Question Answering system, consisting of three main components: Document +Retriever, Machine Reader, and Answer Generator. The Retriever module employs +advanced information retrieval techniques to extract the context of articles +from a dataset of legal regulation documents. The Machine Reader module +utilizes state-of-the-art natural language understanding algorithms to +comprehend the retrieved documents and extract answers. Finally, the Generator +module synthesizes the extracted answers into concise and informative responses +to questions of students regarding legal regulations. Furthermore, we built the +ViRHE4QA dataset in the domain of university training regulations, comprising +9,758 question-answer pairs with a rigorous construction process. This is the +first Vietnamese dataset in the higher regulations domain with various types of +answers, both extractive and abstractive. In addition, the R2GQA system is the +first system to offer abstractive answers in Vietnamese. This paper discusses +the design and implementation of each module within the R2GQA system on the +ViRHE4QA dataset, highlighting their functionalities and interactions. +Furthermore, we present experimental results demonstrating the effectiveness +and utility of the proposed system in supporting the comprehension of students +of legal regulations in higher education settings. In general, the R2GQA system +and the ViRHE4QA dataset promise to contribute significantly to related +research and help students navigate complex legal documents and regulations, +empowering them to make informed decisions and adhere to institutional policies +effectively. Our dataset is available for research purposes. + +
+
+
+
+
+ + ☆ Exploring Sentiment Dynamics and Predictive Behaviors in Cryptocurrency + Discussions by Few-Shot Learning with Large Language Models + + +
+ This study performs analysis of Predictive statements, Hope speech, and +Regret Detection behaviors within cryptocurrency-related discussions, +leveraging advanced natural language processing techniques. We introduce a +novel classification scheme named "Prediction statements," categorizing +comments into Predictive Incremental, Predictive Decremental, Predictive +Neutral, or Non-Predictive categories. Employing GPT-4o, a cutting-edge large +language model, we explore sentiment dynamics across five prominent +cryptocurrencies: Cardano, Binance, Matic, Fantom, and Ripple. Our analysis +reveals distinct patterns in predictive sentiments, with Matic demonstrating a +notably higher propensity for optimistic predictions. Additionally, we +investigate hope and regret sentiments, uncovering nuanced interplay between +these emotions and predictive behaviors. Despite encountering limitations +related to data volume and resource availability, our study reports valuable +discoveries concerning investor behavior and sentiment trends within the +cryptocurrency market, informing strategic decision-making and future research +endeavors. + +
+
+
+
+
+ + ☆ CMM-Math: A Chinese Multimodal Math Dataset To Evaluate and Enhance the + Mathematics Reasoning of Large Multimodal Models + + +
+ Large language models (LLMs) have obtained promising results in mathematical +reasoning, which is a foundational skill for human intelligence. Most previous +studies focus on improving and measuring the performance of LLMs based on +textual math reasoning datasets (e.g., MATH, GSM8K). Recently, a few +researchers have released English multimodal math datasets (e.g., MATHVISTA and +MATH-V) to evaluate the effectiveness of large multimodal models (LMMs). In +this paper, we release a Chinese multimodal math (CMM-Math) dataset, including +benchmark and training parts, to evaluate and enhance the mathematical +reasoning of LMMs. CMM-Math contains over 28,000 high-quality samples, +featuring a variety of problem types (e.g., multiple-choice, fill-in-the-blank, +and so on) with detailed solutions across 12 grade levels from elementary to +high school in China. Specifically, the visual context may be present in the +questions or opinions, which makes this dataset more challenging. Through +comprehensive analysis, we discover that state-of-the-art LMMs on the CMM-Math +dataset face challenges, emphasizing the necessity for further improvements in +LMM development. We also propose a Multimodal Mathematical LMM (Math-LMM) to +handle the problems with mixed input of multiple images and text segments. We +train our model using three stages, including foundational pre-training, +foundational fine-tuning, and mathematical fine-tuning. The extensive +experiments indicate that our model effectively improves math reasoning +performance by comparing it with the SOTA LMMs over three multimodal +mathematical datasets. + +
+
+
+
+
+ + ☆ MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding + Benchmark + + +
+ This paper introduces MMMU-Pro, a robust version of the Massive +Multi-discipline Multimodal Understanding and Reasoning (MMMU) benchmark. +MMMU-Pro rigorously assesses multimodal models' true understanding and +reasoning capabilities through a three-step process based on MMMU: (1) +filtering out questions answerable by text-only models, (2) augmenting +candidate options, and (3) introducing a vision-only input setting where +questions are embedded within images. This setting challenges AI to truly "see" +and "read" simultaneously, testing a fundamental human cognitive skill of +seamlessly integrating visual and textual information. Results show that model +performance is substantially lower on MMMU-Pro than on MMMU, ranging from 16.8% +to 26.9% across models. We explore the impact of OCR prompts and Chain of +Thought (CoT) reasoning, finding that OCR prompts have minimal effect while CoT +generally improves performance. MMMU-Pro provides a more rigorous evaluation +tool, closely mimicking real-world scenarios and offering valuable directions +for future research in multimodal AI. + +
+
+
+
+
+ + ☆ Towards a Unified View of Preference Learning for Large Language Models: + A Survey + + +
+ Large Language Models (LLMs) exhibit remarkably powerful capabilities. One of +the crucial factors to achieve success is aligning the LLM's output with human +preferences. This alignment process often requires only a small amount of data +to efficiently enhance the LLM's performance. While effective, research in this +area spans multiple domains, and the methods involved are relatively complex to +understand. The relationships between different methods have been +under-explored, limiting the development of the preference alignment. In light +of this, we break down the existing popular alignment strategies into different +components and provide a unified framework to study the current alignment +strategies, thereby establishing connections among them. In this survey, we +decompose all the strategies in preference learning into four components: +model, data, feedback, and algorithm. This unified view offers an in-depth +understanding of existing alignment algorithms and also opens up possibilities +to synergize the strengths of different strategies. Furthermore, we present +detailed working examples of prevalent existing algorithms to facilitate a +comprehensive understanding for the readers. Finally, based on our unified +perspective, we explore the challenges and future research directions for +aligning large language models with human preferences. + +
+
+ comment: Initial Commit, 21 pages +
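+ To make the "algorithm" component of this decomposition concrete, the sketch
+below writes out one widely used preference-learning objective (a DPO-style
+loss) from its published formula. It is an editor's illustration, not code
+from the surveyed works; the inputs are per-example summed log-probabilities
+of the chosen (w) and rejected (l) responses under the policy and a frozen
+reference model.
+
+import torch
+import torch.nn.functional as F
+
+def dpo_loss(policy_logp_w, policy_logp_l, ref_logp_w, ref_logp_l, beta=0.1):
+    """Prefer the chosen response over the rejected one, relative to the reference."""
+    margin = (policy_logp_w - ref_logp_w) - (policy_logp_l - ref_logp_l)
+    return -F.logsigmoid(beta * margin).mean()
+
+# toy tensors standing in for summed log-probabilities of two responses
+lw, ll = torch.tensor([-12.0]), torch.tensor([-15.0])
+rw, rl = torch.tensor([-13.0]), torch.tensor([-14.0])
+print(dpo_loss(lw, ll, rw, rl))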
+
+
+
+
+ + ☆ A Comparative Study of Pre-training and Self-training + + +
+ Pre-training and self-training are two approaches to semi-supervised
+learning. The comparison between pre-training and self-training has been
+explored before, but previous works led to confusing findings: self-training
+outperforms pre-training on some computer vision tasks, whereas pre-training
+outperforms self-training on some natural language processing tasks, under
+often incomparable experimental settings. We propose an ensemble method and
+empirically study all feasible training paradigms combining pre-training,
+self-training, and fine-tuning within consistent foundational settings
+comparable to data augmentation. We conduct experiments on six datasets, four
+data augmentation methods, and imbalanced data for sentiment analysis and
+natural language inference tasks. Our findings confirm that the pre-training
+and fine-tuning paradigm yields the best overall performance. Moreover,
+self-training offers no additional benefits when combined with
+semi-supervised pre-training.
+
+ comment: 19 pages, 2 figures, 9 tables +
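+ For readers unfamiliar with the second paradigm, a generic self-training loop
+looks roughly like the sketch below (a plain scikit-learn illustration with an
+assumed confidence threshold, not the paper's exact setup).
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def self_train(X_lab, y_lab, X_unlab, rounds=3, threshold=0.9):
+    """Iteratively pseudo-label confident unlabeled examples and retrain."""
+    X, y = X_lab.copy(), y_lab.copy()
+    clf = LogisticRegression(max_iter=1000).fit(X, y)
+    for _ in range(rounds):
+        if len(X_unlab) == 0:
+            break
+        proba = clf.predict_proba(X_unlab)
+        conf, pred = proba.max(axis=1), proba.argmax(axis=1)
+        keep = conf >= threshold
+        if not keep.any():
+            break
+        X = np.vstack([X, X_unlab[keep]])
+        y = np.concatenate([y, pred[keep]])
+        X_unlab = X_unlab[~keep]
+        clf = LogisticRegression(max_iter=1000).fit(X, y)   # retrain on the enlarged set
+    return clf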
+
+
+
+
+ + ☆ Pooling And Attention: What Are Effective Designs For LLM-Based + Embedding Models? + +
+ The significant advancements of Large Language Models (LLMs) in generative +tasks have led to a growing body of work exploring LLM-based embedding models. +While these models, employing different pooling and attention strategies, have +achieved state-of-the-art performance on public embedding benchmarks, questions +still arise about what constitutes an effective design for LLM-based embedding +models. However, these models are often trained on different datasets, using +different LLM base models or training settings. Moreover, evaluations on public +embedding benchmarks often fail to report statistical significance, making it +difficult to determine which designs truly contribute to final performance. +This complicates the process for practitioners seeking optimal training recipes +for LLM-based embedding models. In this study, we conduct a large-scale +experiment by training a series of LLM-based embedding models using the same +training data and base model but differing in their pooling and attention +strategies. The results show that there is no one-size-fits-all solution: while +bidirectional attention and an additional trainable pooling layer outperform in +text similarity and information retrieval tasks, they do not significantly +surpass simpler designs like EOS-last token pooling and default causal +attention in clustering and classification tasks. Furthermore, we propose a new +pooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs +of all hidden layers, rather than just the last layer, using a cross-attention +network. This method proves to be statistically superior in text similarity and +retrieval tasks compared to existing pooling methods. Overall, this paper sheds +light on effective training strategies for LLM-based embedding models. + +
+
+ comment: https://github.com/yixuantt/PoolingAndAttn +
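+ For reference, the two simplest pooling choices compared in such studies can
+be written in a few lines over the last-layer hidden states; this is a generic
+sketch, not the authors' code.
+
+import torch
+
+def mean_pool(hidden, mask):
+    """hidden: (B, T, H) last-layer states; mask: (B, T) with 1 for real tokens."""
+    m = mask.unsqueeze(-1).float()
+    return (hidden * m).sum(dim=1) / m.sum(dim=1).clamp(min=1e-6)
+
+def last_token_pool(hidden, mask):
+    """EOS/last-token pooling: take the state at each sequence's final real token."""
+    last = mask.sum(dim=1).long() - 1
+    return hidden[torch.arange(hidden.size(0)), last]
+
+hidden = torch.randn(2, 5, 8)
+mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
+print(mean_pool(hidden, mask).shape, last_token_pool(hidden, mask).shape)
+
+ The paper's Multi-Layers Trainable Pooling goes further by combining all
+hidden layers through a trainable cross-attention network rather than reading
+out a single layer.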
+
+
+
+
+ + ☆ Pre-training data selection for biomedical domain adaptation using + journal impact metrics + + +
+ Domain adaptation is a widely used method in natural language processing
+(NLP) to improve the performance of a language model within a specific domain.
+This method is particularly common in the biomedical domain, which sees
+regular publication of numerous scientific articles. PubMed, a significant
+corpus of text, is frequently used in the biomedical domain. The primary
+objective of this study is to explore whether refining a pre-training dataset
+using specific quality metrics for scientific papers can enhance the
+performance of the resulting model. To accomplish this, we employ two
+straightforward journal impact metrics and conduct experiments by continually
+pre-training BERT on various subsets of the complete PubMed training set, and
+then evaluate the resulting models on biomedical language understanding tasks
+from the BLURB benchmark. Our results show that pruning using journal impact
+metrics is not efficient. However, we also show that pre-training using fewer
+abstracts (but with the same number of training steps) does not necessarily
+decrease the resulting model's performance.
+
+
+
+
+ + ☆ Alignment-Aware Model Extraction Attacks on Large Language Models + + +
+ Model extraction attacks (MEAs) on large language models (LLMs) have received +increasing research attention lately. Existing attack methods on LLMs inherit +the extraction strategies from those designed for deep neural networks (DNNs) +yet neglect the inconsistency of training tasks between MEA and LLMs' +alignments. As such, they result in poor attack performances. To tackle this +issue, we present Locality Reinforced Distillation (LoRD), a novel model +extraction attack algorithm specifically for LLMs. In particular, we design a +policy-gradient-style training task, which utilizes victim models' responses as +a signal to guide the crafting of preference for the local model. Theoretical +analysis has shown that i) LoRD's convergence procedure in MEAs is consistent +with the alignments of LLMs, and ii) LoRD can reduce query complexity while +mitigating watermark protection through exploration-based stealing. Extensive +experiments on domain-specific extractions demonstrate the superiority of our +method by examining the extraction of various state-of-the-art commercial LLMs. + +
+
+ comment: Source code: https://github.com/liangzid/alignmentExtraction +
+
+
+
+
+ + ☆ A Data Selection Approach for Enhancing Low Resource Machine Translation + Using Cross-Lingual Sentence Representations + + +
+ Machine translation in low-resource language pairs faces significant +challenges due to the scarcity of parallel corpora and linguistic resources. +This study focuses on the case of English-Marathi language pairs, where +existing datasets are notably noisy, impeding the performance of machine +translation models. To mitigate the impact of data quality issues, we propose a +data filtering approach based on cross-lingual sentence representations. Our +methodology leverages a multilingual SBERT model to filter out problematic +translations in the training data. Specifically, we employ an IndicSBERT +similarity model to assess the semantic equivalence between original and +translated sentences, allowing us to retain linguistically correct translations +while discarding instances with substantial deviations. The results demonstrate +a significant improvement in translation quality over the baseline +post-filtering with IndicSBERT. This illustrates how cross-lingual sentence +representations can reduce errors in machine translation scenarios with limited +resources. By integrating multilingual sentence BERT models into the +translation pipeline, this research contributes to advancing machine +translation techniques in low-resource environments. The proposed method not +only addresses the challenges in English-Marathi language pairs but also +provides a valuable framework for enhancing translation quality in other +low-resource language translation tasks. + +
+
+ comment: Accepted at I2CT 2024 +
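+ A minimal version of the similarity-based filter can be written with the
+sentence-transformers library; the checkpoint name and the 0.7 threshold below
+are assumptions for illustration, not the paper's exact configuration.
+
+from sentence_transformers import SentenceTransformer, util
+
+# any multilingual / Indic SBERT checkpoint can stand in here (name assumed)
+model = SentenceTransformer("l3cube-pune/indic-sentence-similarity-sbert")
+
+def filter_pairs(src_sents, tgt_sents, threshold=0.7):
+    """Keep only sentence pairs whose cross-lingual embeddings are similar enough."""
+    emb_src = model.encode(src_sents, convert_to_tensor=True)
+    emb_tgt = model.encode(tgt_sents, convert_to_tensor=True)
+    sims = util.cos_sim(emb_src, emb_tgt).diagonal()
+    return [(s, t) for s, t, sim in zip(src_sents, tgt_sents, sims)
+            if float(sim) >= threshold]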
+
+
+
+
+ + ☆ Detecting Calls to Action in Multimodal Content: Analysis of the 2021 + German Federal Election Campaign on Instagram + + +
+ This study investigates the automated classification of Calls to Action +(CTAs) within the 2021 German Instagram election campaign to advance the +understanding of mobilization in social media contexts. We analyzed over 2,208 +Instagram stories and 712 posts using fine-tuned BERT models and OpenAI's GPT-4 +models. The fine-tuned BERT model incorporating synthetic training data +achieved a macro F1 score of 0.93, demonstrating a robust classification +performance. Our analysis revealed that 49.58% of Instagram posts and 10.64% of +stories contained CTAs, highlighting significant differences in mobilization +strategies between these content types. Additionally, we found that FDP and the +Greens had the highest prevalence of CTAs in posts, whereas CDU and CSU led in +story CTAs. + +
+
+ comment: Accepted Archival Paper for the CPSS Workshop at KONVENS 2024. Camera + Ready Submission +
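+ At inference time, applying such a fine-tuned classifier is a one-liner with
+the transformers pipeline API; the checkpoint name and labels below are
+hypothetical stand-ins for the fine-tuned model described above.
+
+from transformers import pipeline
+
+clf = pipeline("text-classification", model="my-org/bert-german-cta")  # hypothetical checkpoint
+
+posts = ["Go vote on Sunday!", "Thanks for your support this week."]
+for post, result in zip(posts, clf(posts)):
+    print(result["label"], round(result["score"], 2), post)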
+
+
+
+
+ + ☆ Deconfounded Causality-aware Parameter-Efficient Fine-Tuning for + Problem-Solving Improvement of LLMs + + +
+ Large Language Models (LLMs) have demonstrated remarkable efficiency in +tackling various tasks based on human instructions, but recent studies reveal +that these models often fail to achieve satisfactory results on questions +involving reasoning, such as mathematics or physics questions. This phenomenon +is usually attributed to the uncertainty regarding whether these models could +genuinely comprehend the knowledge embedded in the text or merely learn to +replicate the token distribution without a true understanding of the content. +In this paper, we delve into this problem and aim to enhance the reasoning +capabilities of LLMs. First, we investigate if the model has genuine reasoning +capabilities by visualizing the text generation process at the attention and +representation level. Then, we formulate the reasoning process of LLMs into a +causal framework, which provides a formal explanation of the problems we +observe in the visualization. Finally, building upon this causal framework, we +propose Deconfounded Causal Adaptation (DCA), a novel parameter-efficient +fine-tuning (PEFT) method to enhance the model's reasoning capabilities by +encouraging the model to extract the general problem-solving skills and apply +these skills to different questions. Experiments show that our method +outperforms the baseline consistently across multiple benchmarks, and with only +1.2M tunable parameters, we achieve better or comparable results to other +fine-tuning methods. This demonstrates the effectiveness and efficiency of our +method in improving the overall accuracy and reliability of LLMs. + +
+
+
+
+
+ + ☆ Creating Domain-Specific Translation Memories for Machine Translation + Fine-tuning: The TRENCARD Bilingual Cardiology Corpus + + +
+ This article investigates how translation memories (TM) can be created by
+translators or other language professionals in order to compile
+domain-specific parallel corpora, which can then be used in different
+scenarios, such as machine translation training and fine-tuning, TM
+leveraging, and/or large language model fine-tuning. The article introduces a
+semi-automatic TM preparation methodology that primarily leverages translation
+tools used by translators, in favor of data quality and control by the
+translators. This semi-automatic methodology is then used to build a
+cardiology-based Turkish -> English corpus from bilingual abstracts of Turkish
+cardiology journals. The resulting corpus, called the TRENCARD Corpus, has
+approximately 800,000 source words and 50,000 sentences. Using this
+methodology, translators can build their custom TMs in a reasonable time and
+use them in tasks requiring bilingual data.
+
+
+
+
+ + ☆ OpenFact at CheckThat! 2024: Combining Multiple Attack Methods for + Effective Adversarial Text Generation + + +
+ This paper presents the experiments and results for the CheckThat! Lab at
+CLEF 2024 Task 6: Robustness of Credibility Assessment with Adversarial
+Examples (InCrediblAE). The primary objective of this task was to generate
+adversarial examples in five problem domains in order to evaluate the
+robustness of widely used text classification methods (fine-tuned BERT,
+BiLSTM, and RoBERTa) when applied to credibility assessment issues.
+ This study explores the application of ensemble learning to enhance
+adversarial attacks on natural language processing (NLP) models. We
+systematically tested and refined several adversarial attack methods,
+including BERT-Attack, genetic algorithms, TextFooler, and CLARE, on five
+datasets across various misinformation tasks. By developing modified versions
+of BERT-Attack and hybrid methods, we achieved significant improvements in
+attack effectiveness. Our results demonstrate the potential of modifying and
+combining multiple methods to create more sophisticated and effective
+adversarial attack strategies, contributing to the development of more robust
+and secure systems.
+
+ comment: CLEF 2024 - Conference and Labs of the Evaluation Forum +
+
+
+
+
+ + ☆ A Survey on Emergent Language + + +
+ The field of emergent language represents a novel area of research within the +domain of artificial intelligence, particularly within the context of +multi-agent reinforcement learning. Although the concept of studying language +emergence is not new, early approaches were primarily concerned with explaining +human language formation, with little consideration given to its potential +utility for artificial agents. In contrast, studies based on reinforcement +learning aim to develop communicative capabilities in agents that are +comparable to or even superior to human language. Thus, they extend beyond the +learned statistical representations that are common in natural language +processing research. This gives rise to a number of fundamental questions, from +the prerequisites for language emergence to the criteria for measuring its +success. This paper addresses these questions by providing a comprehensive +review of 181 scientific publications on emergent language in artificial +intelligence. Its objective is to serve as a reference for researchers +interested in or proficient in the field. Consequently, the main contributions +are the definition and overview of the prevailing terminology, the analysis of +existing evaluation methods and metrics, and the description of the identified +research gaps. + +
+
+
+
+
+ + ☆ PUB: Plot Understanding Benchmark and Dataset for Evaluating Large + Language Models on Synthetic Visual Data Interpretation + + +
+ The ability of large language models (LLMs) to interpret visual +representations of data is crucial for advancing their application in data +analysis and decision-making processes. This paper presents a novel synthetic +dataset designed to evaluate the proficiency of LLMs in interpreting various +forms of data visualizations, including plots like time series, histograms, +violins, boxplots, and clusters. Our dataset is generated using controlled +parameters to ensure comprehensive coverage of potential real-world scenarios. +We employ multimodal text prompts with questions related to visual data in +images to benchmark several state-of-the-art models like ChatGPT or Gemini, +assessing their understanding and interpretative accuracy. + To ensure data integrity, our benchmark dataset is generated automatically, +making it entirely new and free from prior exposure to the models being tested. +This strategy allows us to evaluate the models' ability to truly interpret and +understand the data, eliminating possibility of pre-learned responses, and +allowing for an unbiased evaluation of the models' capabilities. We also +introduce quantitative metrics to assess the performance of the models, +providing a robust and comprehensive evaluation tool. + Benchmarking several state-of-the-art LLMs with this dataset reveals varying +degrees of success, highlighting specific strengths and weaknesses in +interpreting diverse types of visual data. The results provide valuable +insights into the current capabilities of LLMs and identify key areas for +improvement. This work establishes a foundational benchmark for future research +and development aimed at enhancing the visual interpretative abilities of +language models. In the future, improved LLMs with robust visual interpretation +skills can significantly aid in automated data analysis, scientific research, +educational tools, and business intelligence applications. + +
+
+
+
+
+ + ☆ An Analysis of Linear Complexity Attention Substitutes with BEST-RQ + + +
+ Self-Supervised Learning (SSL) has proven to be effective in various domains,
+including speech processing. However, SSL is computationally and memory
+expensive. This is in part due to the quadratic complexity of multi-head
+self-attention (MHSA). Alternatives for MHSA have been proposed and used in
+the speech domain, but have yet to be investigated properly in an SSL setting.
+In this work, we study the effects of replacing MHSA with recent
+state-of-the-art alternatives that have linear complexity, namely,
+HyperMixing, Fastformer, SummaryMixing, and Mamba. We evaluate these methods
+by looking at the speed, the amount of VRAM consumed, and the performance on
+the SSL MP3S benchmark. Results show that these linear alternatives maintain
+competitive performance compared to MHSA while, on average, decreasing VRAM
+consumption by around 20% to 60% and increasing speed by 7% to 65% for input
+sequences ranging from 20 to 80 seconds.
+
+ comment: Accepted in the IEEE Spoken Language Technology Workshop 2024
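+ To see why such alternatives scale linearly, the toy layer below combines
+each token with a single mean-pooled summary vector instead of attending to
+every other token; it is a simplified, SummaryMixing-style illustration, not
+any of the benchmarked implementations.
+
+import torch
+import torch.nn as nn
+
+class LinearTimeMixing(nn.Module):
+    """Each token sees its own transform plus one global summary, so the cost
+    grows as O(T) rather than the O(T^2) of multi-head self-attention."""
+    def __init__(self, d):
+        super().__init__()
+        self.local = nn.Linear(d, d)
+        self.summary = nn.Linear(d, d)
+        self.out = nn.Linear(2 * d, d)
+
+    def forward(self, x, mask):                    # x: (B, T, D), mask: (B, T)
+        m = mask.unsqueeze(-1).float()
+        s = (self.summary(x) * m).sum(dim=1) / m.sum(dim=1).clamp(min=1e-6)
+        s = s.unsqueeze(1).expand_as(x)            # broadcast the summary to every token
+        return self.out(torch.cat([self.local(x), s], dim=-1))
+
+x, mask = torch.randn(2, 100, 64), torch.ones(2, 100)
+print(LinearTimeMixing(64)(x, mask).shape)         # torch.Size([2, 100, 64])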
+
+
+
+
+ + ☆ More is More: Addition Bias in Large Language Models + + +
+ In this paper, we investigate the presence of additive bias in Large Language
+Models (LLMs), drawing a parallel to the cognitive bias observed in humans
+where individuals tend to favor additive over subtractive changes. Using a
+series of controlled experiments, we tested various LLMs, including GPT-3.5
+Turbo, Claude 3.5 Sonnet, Mistral, Math$\Sigma$tral, and Llama 3.1, on tasks
+designed to measure their propensity for additive versus subtractive
+modifications. Our findings demonstrate a significant preference for additive
+changes across all tested models. For example, in a palindrome creation task,
+Llama 3.1 favored adding letters 97.85% of the time over removing them.
+Similarly, in a Lego tower balancing task, GPT-3.5 Turbo chose to add a brick
+76.38% of the time rather than remove one. In a text summarization task,
+Mistral 7B produced longer summaries in 59.40% to 75.10% of cases when asked
+to improve its own or others' writing. These results indicate that, similar to
+humans, LLMs exhibit a marked additive bias, which might have implications
+when LLMs are used on a large scale. Additive bias might increase resource use
+and environmental impact, leading to higher economic costs due to
+overconsumption and waste. This bias should be considered in the development
+and application of LLMs to ensure balanced and efficient problem-solving
+approaches.
+
+ comment: 25 pages, 8 figures +
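+ A crude, length-based scorer of the kind such experiments rely on is sketched
+below; it is only an illustration, and the paper's tasks use their own
+task-specific judgments.
+
+def classify_change(original: str, revised: str) -> str:
+    """Label an edit as additive, subtractive, or unchanged by comparing lengths."""
+    if len(revised) > len(original):
+        return "additive"
+    if len(revised) < len(original):
+        return "subtractive"
+    return "unchanged"
+
+# e.g. extending "racecar" into the longer palindrome "aracecara" is an additive edit
+print(classify_change("racecar", "aracecara"))     # additive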
+
+
+
+
+ + ☆ Language is Scary when Over-Analyzed: Unpacking Implied Misogynistic + Reasoning with Argumentation Theory-Driven Prompts + + +
+ We propose misogyny detection as an Argumentative Reasoning task and we +investigate the capacity of large language models (LLMs) to understand the +implicit reasoning used to convey misogyny in both Italian and English. The +central aim is to generate the missing reasoning link between a message and the +implied meanings encoding the misogyny. Our study uses argumentation theory as +a foundation to form a collection of prompts in both zero-shot and few-shot +settings. These prompts integrate different techniques, including +chain-of-thought reasoning and augmented knowledge. Our findings show that LLMs +fall short on reasoning capabilities about misogynistic comments and that they +mostly rely on their implicit knowledge derived from internalized common +stereotypes about women to generate implied assumptions, rather than on +inductive reasoning. + +
+
+
+
+
+ + ☆ Word and Phrase Features in Graph Convolutional Network for Automatic + Question Classification + + +
+ Effective question classification is crucial for AI-driven educational tools, +enabling adaptive learning systems to categorize questions by skill area, +difficulty level, and competence. This classification not only supports +educational diagnostics and analytics but also enhances complex tasks like +information retrieval and question answering by associating questions with +relevant categories. Traditional methods, often based on word embeddings and +conventional classifiers, struggle to capture the nuanced relationships in +natural language, leading to suboptimal performance. To address this, we +propose a novel approach leveraging graph convolutional networks (GCNs), named +Phrase Question-Graph Convolutional Network (PQ-GCN) to better model the +inherent structure of questions. By representing questions as graphs -- where +nodes signify words or phrases and edges denote syntactic or semantic +relationships -- our method allows GCNs to learn from the interconnected nature +of language more effectively. Additionally, we explore the incorporation of +phrase-based features to enhance classification accuracy, especially in +low-resource settings. Our findings demonstrate that GCNs, augmented with these +features, offer a promising solution for more accurate and context-aware +question classification, bridging the gap between graph neural network research +and practical educational applications. + +
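+ A minimal dense GCN layer of the kind such question graphs are fed through is
+shown below (standard symmetric-normalized propagation); how the word and
+phrase nodes and their edges are built is up to the user, and this is not the
+authors' code.
+
+import torch
+import torch.nn as nn
+
+class GCNLayer(nn.Module):
+    """Graph convolution: relu(D^-1/2 (A + I) D^-1/2 X W)."""
+    def __init__(self, in_dim, out_dim):
+        super().__init__()
+        self.lin = nn.Linear(in_dim, out_dim)
+
+    def forward(self, X, A):                       # X: (N, in_dim), A: (N, N) 0/1 adjacency
+        A_hat = A + torch.eye(A.size(0), device=A.device)
+        d_inv_sqrt = torch.diag(A_hat.sum(dim=1).pow(-0.5))
+        return torch.relu(d_inv_sqrt @ A_hat @ d_inv_sqrt @ self.lin(X))
+
+# four word/phrase nodes with 8-dimensional features and a small adjacency
+X = torch.randn(4, 8)
+A = torch.tensor([[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 0], [0, 1, 0, 0]]).float()
+print(GCNLayer(8, 16)(X, A).shape)                 # torch.Size([4, 16])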
+
+
+
+
+ + ☆ A Comparative Study on Large Language Models for Log Parsing + + +
+ Background: Log messages provide valuable information about the status of
+software systems. This information is provided in an unstructured fashion and
+automated approaches are applied to extract relevant parameters. To ease this
+process, log parsing can be applied, which transforms log messages into
+structured log templates. Recent advances in language models have led to
+several studies that apply ChatGPT to the task of log parsing with promising
+results. However, the performance of other state-of-the-art large language
+models (LLMs) on the log parsing task remains unclear.
+ Aims: In this study, we investigate the current capability of
+state-of-the-art LLMs to perform log parsing.
+ Method: We select six recent LLMs, including two paid proprietary models
+(GPT-3.5, Claude 2.1) and four free-to-use open models, and compare their
+performance on system logs obtained from a selection of mature open-source
+projects. We design two different prompting approaches and apply the LLMs on
+1,354 log templates across 16 different projects. We evaluate their
+effectiveness in terms of the number of correctly identified templates and the
+syntactic similarity between the generated templates and the ground truth.
+ Results: We found that free-to-use models are able to compete with paid
+models, with CodeLlama extracting 10% more log templates correctly than
+GPT-3.5. Moreover, we provide qualitative insights into the usability of
+language models (e.g., how easy it is to use their responses).
+ Conclusions: Our results reveal that some of the smaller, free-to-use LLMs
+can considerably assist log parsing compared to their paid proprietary
+competitors, especially code-specialized models.
+
+ comment: Accepted for publication in the 18th ACM/IEEE International Symposium + on Empirical Software Engineering and Measurement (ESEM '24) +
+
+
+
+
+ + ☆ DetectiveQA: Evaluating Long-Context Reasoning on Detective Novels + + +
+ With the rapid advancement of Large Language Models (LLMs), long-context
+information understanding and processing have become a hot topic in academia
+and industry. However, benchmarks for evaluating the ability of LLMs to handle
+long-context information do not seem to have kept pace with the development of
+LLMs. Despite the emergence of various long-context evaluation benchmarks, the
+types of capability assessed are still limited, without new capability
+dimensions. In this paper, we introduce DetectiveQA, a narrative reasoning
+benchmark featuring an average context length of over 100K tokens. DetectiveQA
+focuses on evaluating the long-context reasoning ability of LLMs, which not
+only requires a full understanding of context but also requires extracting
+important evidence from the context and reasoning over that evidence to answer
+the given questions. This is a new dimension of capability evaluation, which
+is more in line with the current intelligence level of LLMs. We use detective
+novels as data sources, which naturally have various reasoning elements.
+Finally, we manually annotated 600 questions in Chinese and then also provided
+an English edition of the context information and questions. We evaluate many
+long-context LLMs on DetectiveQA, including commercial and open-sourced
+models, and the results indicate that existing long-context LLMs still require
+significant advancements to effectively process true long-context dependency
+questions.
+
+
+
+
+ + ☆ What is lost in Normalization? Exploring Pitfalls in Multilingual ASR + Model Evaluations EMNLP 2024 + + +
+ This paper explores the pitfalls in evaluating multilingual automatic speech +recognition (ASR) models, with a particular focus on Indic language scripts. We +investigate the text normalization routine employed by leading ASR models, +including OpenAI Whisper, Meta's MMS, Seamless, and Assembly AI's Conformer, +and their unintended consequences on performance metrics. Our research reveals +that current text normalization practices, while aiming to standardize ASR +outputs for fair comparison, by removing inconsistencies such as variations in +spelling, punctuation, and special characters, are fundamentally flawed when +applied to Indic scripts. Through empirical analysis using text similarity +scores and in-depth linguistic examination, we demonstrate that these flaws +lead to artificially inflated performance metrics for Indic languages. We +conclude by proposing a shift towards developing normalization routines that +leverage native linguistic expertise, ensuring more robust and accurate +evaluations of multilingual ASR models. + +
+
+ comment: Submitted to EMNLP 2024
+
+
+
+
+
+ + ☆ Large Language Models as Efficient Reward Function Searchers for + Custom-Environment Multi-Objective Reinforcement Learning + + +
+ Leveraging large language models (LLMs) for designing reward functions
+demonstrates significant potential. However, achieving effective design and
+improvement of reward functions in reinforcement learning (RL) tasks with
+complex custom environments and multiple requirements presents considerable
+challenges. In this paper, we enable LLMs to be effective white-box searchers,
+highlighting their advanced semantic understanding capabilities. Specifically,
+we generate reward components for each explicit user requirement and employ the
+reward critic to identify the correct code form. Then, LLMs assign weights to
+the reward components to balance their values and iteratively search and
+optimize these weights based on the context provided by the training log
+analyzer, while adaptively determining the search step size. We apply the
+framework to an underwater information collection RL task without direct human
+feedback or reward examples (zero-shot). The reward critic successfully
+corrects the reward code with only one round of feedback per requirement,
+effectively preventing irreparable errors that can occur when reward function
+feedback is provided in aggregate. The effective initialization of weights
+enables the acquisition of different reward functions within the Pareto
+solution set without weight search. Even in the case where a weight is 100
+times off, fewer than four iterations are needed to obtain solutions that meet
+user requirements. The framework also works well with most prompts when using
+GPT-3.5 Turbo, since it does not require advanced numerical understanding or
+calculation.
+
+
+
+
+
+ + ☆ Abstractive Text Summarization: State of the Art, Challenges, and + Improvements + + +
+ Specifically focusing on the landscape of abstractive text summarization, as
+opposed to extractive techniques, this survey presents a comprehensive
+overview, delving into state-of-the-art techniques, prevailing challenges, and
+prospective research directions. We categorize the techniques into traditional
+sequence-to-sequence models, pre-trained large language models, reinforcement
+learning, hierarchical methods, and multi-modal summarization. Unlike prior
+works that did not examine complexity, scalability, and comparisons of
+techniques in detail, this review takes a comprehensive approach, encompassing
+state-of-the-art methods, challenges, solutions, comparisons, and limitations,
+and charts out future improvements, providing researchers with an extensive
+overview to advance abstractive summarization research. We provide comparison
+tables across the categorized techniques, offering insights into model
+complexity, scalability, and appropriate applications. The paper highlights
+challenges such as inadequate meaning representation, factual consistency,
+controllable text summarization, cross-lingual summarization, and evaluation
+metrics, among others. Solutions leveraging knowledge incorporation and other
+innovative strategies are proposed to address these challenges. The paper
+concludes by highlighting emerging research areas like factual inconsistency,
+domain-specific, cross-lingual, multilingual, and long-document summarization,
+as well as handling noisy data. Our objective is to provide researchers and
+practitioners with a structured overview of the domain, enabling them to better
+understand the current landscape and identify potential areas for further
+research and improvement.
+
+
+ comment: 9 Tables, 7 Figures +
+
+
+
+
+ + ☆ Determination of language families using deep learning + + +
+ We use a c-GAN (convolutional generative adversarial) neural network to
+analyze transliterated text fragments of extant languages, dead but
+comprehensible languages, and one dead, undeciphered language (Cypro-Minoan) in
+order to establish linguistic affinities. The paper is agnostic with respect to
+translation and deciphering; however, we hope that the proposed approach can be
+useful for decipherment when combined with more sophisticated neural network
+techniques.
+
+
+ comment: First draft. Comments are welcome +
+
+
+
+
+ + ☆ Large Language Models and Cognitive Science: A Comprehensive Review of + Similarities, Differences, and Challenges + + +
+ This comprehensive review explores the intersection of Large Language Models +(LLMs) and cognitive science, examining similarities and differences between +LLMs and human cognitive processes. We analyze methods for evaluating LLMs +cognitive abilities and discuss their potential as cognitive models. The review +covers applications of LLMs in various cognitive fields, highlighting insights +gained for cognitive science research. We assess cognitive biases and +limitations of LLMs, along with proposed methods for improving their +performance. The integration of LLMs with cognitive architectures is examined, +revealing promising avenues for enhancing artificial intelligence (AI) +capabilities. Key challenges and future research directions are identified, +emphasizing the need for continued refinement of LLMs to better align with +human cognition. This review provides a balanced perspective on the current +state and future potential of LLMs in advancing our understanding of both +artificial and human intelligence. + +
+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ☆ STAB: Speech Tokenizer Assessment Benchmark + + +
+ Representing speech as discrete tokens provides a framework for transforming +speech into a format that closely resembles text, thus enabling the use of +speech as an input to the widely successful large language models (LLMs). +Currently, while several speech tokenizers have been proposed, there is +ambiguity regarding the properties that are desired from a tokenizer for +specific downstream tasks and its overall generalizability. Evaluating the +performance of tokenizers across different downstream tasks is a +computationally intensive effort that poses challenges for scalability. To +circumvent this requirement, we present STAB (Speech Tokenizer Assessment +Benchmark), a systematic evaluation framework designed to assess speech +tokenizers comprehensively and shed light on their inherent characteristics. +This framework provides a deeper understanding of the underlying mechanisms of +speech tokenization, thereby offering a valuable resource for expediting the +advancement of future tokenizer models and enabling comparative analysis using +a standardized benchmark. We evaluate the STAB metrics and correlate this with +downstream task performance across a range of speech tasks and tokenizer +choices. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ How Privacy-Savvy Are Large Language Models? A Case Study on Compliance + and Privacy Technical Review + + +
+ The recent advances in large language models (LLMs) have significantly +expanded their applications across various fields such as language generation, +summarization, and complex question answering. However, their application to +privacy compliance and technical privacy reviews remains under-explored, +raising critical concerns about their ability to adhere to global privacy +standards and protect sensitive user data. This paper seeks to address this gap +by providing a comprehensive case study evaluating LLMs' performance in +privacy-related tasks such as privacy information extraction (PIE), legal and +regulatory key point detection (KPD), and question answering (QA) with respect +to privacy policies and data protection regulations. We introduce a Privacy +Technical Review (PTR) framework, highlighting its role in mitigating privacy +risks during the software development life-cycle. Through an empirical +assessment, we investigate the capacity of several prominent LLMs, including +BERT, GPT-3.5, GPT-4, and custom models, in executing privacy compliance checks +and technical privacy reviews. Our experiments benchmark the models across +multiple dimensions, focusing on their precision, recall, and F1-scores in +extracting privacy-sensitive information and detecting key regulatory +compliance points. While LLMs show promise in automating privacy reviews and +identifying regulatory discrepancies, significant gaps persist in their ability +to fully comply with evolving legal standards. We provide actionable +recommendations for enhancing LLMs' capabilities in privacy compliance, +emphasizing the need for robust model improvements and better integration with +legal and regulatory requirements. This study underscores the growing +importance of developing privacy-aware LLMs that can both support businesses in +compliance efforts and safeguard user privacy rights. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Do Large Language Models Possess Sensitive to Sentiment? + + +
+ Large Language Models (LLMs) have recently displayed their extraordinary
+capabilities in language understanding. However, how to comprehensively assess
+the sentiment capabilities of LLMs continues to be a challenge. This paper
+investigates the ability of LLMs to detect and react to sentiment in the text
+modality. As the integration of LLMs into diverse applications is on the rise,
+it becomes highly critical to comprehend their sensitivity to emotional tone,
+as it can influence the user experience and the efficacy of sentiment-driven
+tasks. We conduct a series of experiments to evaluate the performance of
+several prominent LLMs in identifying and responding appropriately to
+sentiments like positive, negative, and neutral emotions. The models' outputs
+are analyzed across various sentiment benchmarks, and their responses are
+compared with human evaluations. Our discoveries indicate that although LLMs
+show a basic sensitivity to sentiment, there are substantial variations in
+their accuracy and consistency, emphasizing the requirement for further
+enhancements in their training processes to better capture subtle emotional
+cues. For example, in some cases the models wrongly classify a strongly
+positive sentiment as neutral, or fail to recognize sarcasm or irony in the
+text. Such misclassifications highlight the complexity of sentiment analysis
+and the areas where the models need to be refined. Another aspect is that
+different LLMs might perform differently on the same set of data, depending on
+their architecture and training datasets. This variance calls for a more
+in-depth study of the factors that contribute to the performance differences
+and how they can be optimized.
+
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ Diversify-verify-adapt: Efficient and Robust Retrieval-Augmented + Ambiguous Question Answering + + +
+ The retrieval augmented generation (RAG) framework addresses ambiguity in
+user queries in QA systems by retrieving passages that cover all plausible
+interpretations and generating comprehensive responses based on the passages.
+However, our preliminary studies reveal that a single retrieval process often
+suffers from low-quality results, as the retrieved passages frequently fail to
+capture all plausible interpretations. Although the iterative RAG approach has
+been proposed to address this problem, it comes at the cost of significantly
+reduced efficiency. To address these issues, we propose the
+diversify-verify-adapt (DIVA) framework. DIVA first diversifies the retrieved
+passages to encompass diverse interpretations. Subsequently, DIVA verifies the
+quality of the passages and adapts the most suitable approach tailored to their
+quality. This approach improves QA system accuracy and robustness by handling
+the low-quality retrieval issue for ambiguous questions, while also enhancing
+efficiency.
+
+
+
+
+
+ + ☆ NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for + Retrieval + + +
+ $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval) +from pre-trained embedding models is the predominant retrieval method for text +and images, as well as Retrieval-Augmented Generation (RAG) pipelines. In +practice, application developers often fine-tune the embeddings to improve +their accuracy on the dataset and query workload in hand. Existing approaches +either fine-tune the pre-trained model itself or, more efficiently, but at the +cost of accuracy, train adaptor models to transform the output of the +pre-trained model. We present NUDGE, a family of novel non-parametric embedding +fine-tuning approaches that are significantly more accurate and efficient than +both sets of existing approaches. NUDGE directly modifies the embeddings of +data records to maximize the accuracy of $k$-NN retrieval. We present a +thorough theoretical and experimental study of NUDGE's non-parametric approach. +We show that even though the underlying problem is NP-Hard, constrained +variations can be solved efficiently. These constraints additionally ensure +that the changes to the embeddings are modest, avoiding large distortions to +the semantics learned during pre-training. In experiments across five +pre-trained models and nine standard text and image retrieval datasets, NUDGE +runs in minutes and often improves NDCG@10 by more than 10% over existing +fine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase +in accuracy and runs 200x and 3x faster, respectively, over fine-tuning the +pre-trained model and training adaptors. + +
+
+
+
+
+ + ☆ Well, that escalated quickly: The Single-Turn Crescendo Attack (STCA) + + +
+ This paper explores a novel approach to adversarial attacks on large language
+models (LLMs): the Single-Turn Crescendo Attack (STCA). The STCA builds upon
+the multi-turn crescendo attack established by Mark Russinovich, Ahmed Salem,
+and Ronen Eldan. Traditional multi-turn adversarial strategies gradually
+escalate the context to elicit harmful or controversial responses from LLMs.
+However, this paper introduces a more efficient method where the escalation is
+condensed into a single interaction. By carefully crafting the prompt to
+simulate an extended dialogue, the attack bypasses typical content moderation
+systems, leading to the generation of responses that would normally be filtered
+out. I demonstrate this technique through a few case studies. The results
+highlight vulnerabilities in current LLMs and underscore the need for more
+robust safeguards. This work contributes to the broader discourse on
+responsible AI (RAI) safety and adversarial testing, providing insights and
+practical examples for researchers and developers. This method is unexplored in
+the literature, making it a novel contribution to the field.
+
+
+
+
+
+ + ☆ Probing self-attention in self-supervised speech models for + cross-linguistic differences + + +
+ Speech models have gained traction thanks to increases in accuracy from novel
+transformer architectures. While this impressive increase in performance across
+automatic speech recognition (ASR) benchmarks is noteworthy, there is still
+much that is unknown about the use of attention mechanisms for speech-related
+tasks. For example, while it is assumed that these models are learning
+language-independent (i.e., universal) speech representations, there has not
+yet been an in-depth exploration of what it would mean for the models to be
+language-independent. In the current paper, we explore this question within the
+realm of self-attention mechanisms of one small self-supervised speech
+transformer model (TERA). We find that even with a small model, the attention
+heads learned are diverse, ranging from almost entirely diagonal to almost
+entirely global, regardless of the training language. We highlight some notable
+differences in attention patterns between Turkish and English and demonstrate
+that the models do learn important phonological information during pretraining.
+We also present a head ablation study which shows that models across languages
+primarily rely on diagonal heads to classify phonemes.
+
+
+ comment: 10 pages, 18 figures +
+
+
+
+
+ + ☆ Quantification of stylistic differences in human- and ASR-produced + transcripts of African American English + + +
+ Common measures of accuracy used to assess the performance of automatic +speech recognition (ASR) systems, as well as human transcribers, conflate +multiple sources of error. Stylistic differences, such as verbatim vs +non-verbatim, can play a significant role in ASR performance evaluation when +differences exist between training and test datasets. The problem is compounded +for speech from underrepresented varieties, where the speech to orthography +mapping is not as standardized. We categorize the kinds of stylistic +differences between 6 transcription versions, 4 human- and 2 ASR-produced, of +10 hours of African American English (AAE) speech. Focusing on verbatim +features and AAE morphosyntactic features, we investigate the interactions of +these categories with how well transcripts can be compared via word error rate +(WER). The results, and overall analysis, help clarify how ASR outputs are a +function of the decisions made by the training data's human transcribers. + +
+
+ comment: Published in Interspeech 2024 Proceedings, 5 pages excluding + references, 5 figures +
+
+
+
+
+ + ☆ Oddballness: universal anomaly detection with language models + + +
+ We present a new method to detect anomalies in texts (in general: in +sequences of any data), using language models, in a totally unsupervised +manner. The method considers probabilities (likelihoods) generated by a +language model, but instead of focusing on low-likelihood tokens, it considers +a new metric introduced in this paper: oddballness. Oddballness measures how +``strange'' a given token is according to the language model. We demonstrate in +grammatical error detection tasks (a specific case of text anomaly detection) +that oddballness is better than just considering low-likelihood events, if a +totally unsupervised setup is assumed. + +
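+
+ The abstract does not define the oddballness metric itself, so the sketch
+below only shows the low-likelihood baseline it is compared against: scoring
+each token by its log-probability under a causal language model and flagging
+the least likely tokens as anomaly candidates. GPT-2 and the threshold value
+are stand-ins, not choices from the paper.
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+def token_logprobs(text):
+    """Return (token, log-probability) pairs for every token after the first."""
+    ids = tok(text, return_tensors="pt").input_ids
+    with torch.no_grad():
+        logits = model(ids).logits
+    logp = torch.log_softmax(logits[:, :-1], dim=-1)
+    scores = logp.gather(-1, ids[:, 1:, None]).squeeze(-1)[0]
+    return list(zip(tok.convert_ids_to_tokens(ids[0, 1:].tolist()), scores.tolist()))
+
+for token, lp in token_logprobs("She have went to the store yesterday."):
+    flag = "<-- suspicious" if lp < -8.0 else ""   # threshold is arbitrary
+    print(f"{token:>12} {lp:8.2f} {flag}")
+```
+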
+
+
+
+
+ + ☆ CLUE: Concept-Level Uncertainty Estimation for Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable proficiency in +various natural language generation (NLG) tasks. Previous studies suggest that +LLMs' generation process involves uncertainty. However, existing approaches to +uncertainty estimation mainly focus on sequence-level uncertainty, overlooking +individual pieces of information within sequences. These methods fall short in +separately assessing the uncertainty of each component in a sequence. In +response, we propose a novel framework for Concept-Level Uncertainty Estimation +(CLUE) for LLMs. We leverage LLMs to convert output sequences into +concept-level representations, breaking down sequences into individual concepts +and measuring the uncertainty of each concept separately. We conduct +experiments to demonstrate that CLUE can provide more interpretable uncertainty +estimation results compared with sentence-level uncertainty, and could be a +useful tool for various tasks such as hallucination detection and story +generation. + +
+
+
+
+
+ + ☆ Hallucination Detection in LLMs: Fast and Memory-Efficient Finetuned + Models + + +
+ Uncertainty estimation is a necessary component when implementing AI in
+high-risk settings, such as autonomous cars, medicine, or insurance. Large
+Language Models (LLMs) have seen a surge in popularity in recent years, but
+they are subject to hallucinations, which may cause serious harm in high-risk
+settings. Despite their success, LLMs are expensive to train and run: they
+require large amounts of computation and memory, preventing the use of
+ensembling methods in practice. In this work, we present a novel method that
+allows for fast and memory-friendly training of LLM ensembles. We show that the
+resulting ensembles can detect hallucinations and are a viable approach in
+practice as only one GPU is needed for training and inference.
+
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ NESTFUL: A Benchmark for Evaluating LLMs on Nested Sequences of API + Calls + + +
+ Autonomous agent applications powered by large language models (LLMs) have +recently risen to prominence as effective tools for addressing complex +real-world tasks. At their core, agentic workflows rely on LLMs to plan and +execute the use of tools and external Application Programming Interfaces (APIs) +in sequence to arrive at the answer to a user's request. Various benchmarks and +leaderboards have emerged to evaluate an LLM's capabilities for tool and API +use; however, most of these evaluations only track single or multiple isolated +API calling capabilities. In this paper, we present NESTFUL, a benchmark to +evaluate LLMs on nested sequences of API calls, i.e., sequences where the +output of one API call is passed as input to a subsequent call. NESTFUL has a +total of 300 human annotated samples divided into two types - executable and +non-executable. The executable samples are curated manually by crawling +Rapid-APIs whereas the non-executable samples are hand picked by human +annotators from data synthetically generated using an LLM. We evaluate +state-of-the-art LLMs with function calling abilities on NESTFUL. Our results +show that most models do not perform well on nested APIs in NESTFUL as compared +to their performance on the simpler problem settings available in existing +benchmarks. + +
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors. +Existing methods discover error slices by clustering the error-prone samples +with similar patterns or assigning discrete attributes to each sample for +post-hoc analysis. While these methods aim for interpretability and easier +mitigation through reweighting or rebalancing, they may not capture the full +complexity of error patterns due to incomplete or missing attributes. Contrary +to the existing approach, this paper utilizes the reasoning capabilities of the +Large Language Model (LLM) to analyze complex error patterns and generate +testable hypotheses. This paper proposes LADDER: Language Driven slice +Discovery and Error Rectification. It first projects the model's representation +into a language-aligned feature space (eg CLIP) to preserve semantics in the +original model feature space. This ensures the accurate retrieval of sentences +that highlight the model's errors. Next, the LLM utilizes the sentences and +generates hypotheses to discover error slices. Finally, we mitigate the error +by fine-tuning the classification head by creating a group-balanced dataset +using the hypotheses. Our entire method does not require any attribute +annotation, either explicitly or through external tagging models. We validate +our method with \textbf{five} image classification datasets. The code is +available (https://github.com/batmanlab/Ladder). + +
+
+
+
+
+ + ♻ ☆ The Need for Guardrails with Large Language Models in Medical + Safety-Critical Settings: An Artificial Intelligence Application in the + Pharmacovigilance Ecosystem + + +
+ Large language models (LLMs) are useful tools with the capacity for +performing specific types of knowledge work at an effective scale. However, LLM +deployments in high-risk and safety-critical domains pose unique challenges, +notably the issue of ``hallucination,'' where LLMs can generate fabricated +information. This is particularly concerning in settings such as drug safety, +where inaccuracies could lead to patient harm. To mitigate these risks, we have +developed and demonstrated a proof of concept suite of guardrails specifically +designed to mitigate certain types of hallucinations and errors for drug +safety, and potentially applicable to other medical safety-critical contexts. +These guardrails include mechanisms to detect anomalous documents to prevent +the ingestion of inappropriate data, identify incorrect drug names or adverse +event terms, and convey uncertainty in generated content. We integrated these +guardrails with an LLM fine-tuned for a text-to-text task, which involves +converting both structured and unstructured data within adverse event reports +into natural language. This method was applied to translate individual case +safety reports, demonstrating effective application in a pharmacovigilance +processing task. Our guardrail framework offers a set of tools with broad +applicability across various domains, ensuring LLMs can be safely used in +high-risk situations by eliminating the occurrence of key errors, including the +generation of incorrect pharmacovigilance-related terms, thus adhering to +stringent regulatory and quality standards in medical safety-critical +environments. + +
+
+ comment: 27 pages, 6 figures, 4 tables and supplementary material provided +
+
+
+
+
+ + ♻ ☆ Simple and Scalable Strategies to Continually Pre-train Large Language + Models + + +
+ Large language models (LLMs) are routinely pre-trained on billions of tokens, +only to start the process over again once new data becomes available. A much +more efficient solution is to continually pre-train these models, saving +significant compute compared to re-training. However, the distribution shift +induced by new data typically results in degraded performance on previous data +or poor adaptation to the new data. In this work, we show that a simple and +scalable combination of learning rate (LR) re-warming, LR re-decaying, and +replay of previous data is sufficient to match the performance of fully +re-training from scratch on all available data, as measured by the final loss +and the average score on several language model (LM) evaluation benchmarks. +Specifically, we show this for a weak but realistic distribution shift between +two commonly used LLM pre-training datasets (English$\rightarrow$English) and a +stronger distribution shift (English$\rightarrow$German) at the $405$M +parameter model scale with large dataset sizes (hundreds of billions of +tokens). Selecting the weak but realistic shift for larger-scale experiments, +we also find that our continual learning strategies match the re-training +baseline for a 10B parameter LLM. Our results demonstrate that LLMs can be +successfully updated via simple and scalable continual learning strategies, +matching the re-training baseline using only a fraction of the compute. +Finally, inspired by previous work, we propose alternatives to the cosine +learning rate schedule that help circumvent forgetting induced by LR re-warming +and that are not bound to a fixed token budget. + +
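+
+ As a rough sketch of the two ingredients described above, the snippet below
+re-warms and re-decays the learning rate at the start of a continual
+pretraining phase and mixes a small fraction of replayed old data into every
+batch. The warmup length, minimum learning rate, and replay fraction are
+placeholder values, not the paper's settings.
+
+```python
+import math
+import random
+
+def rewarmed_cosine_lr(step, total_steps, max_lr=3e-4, min_lr=3e-5, warmup=1000):
+    """Schedule restarted when continuing on new data: linear re-warming to
+    max_lr, then cosine re-decay down to min_lr."""
+    if step < warmup:
+        return max_lr * step / warmup
+    progress = (step - warmup) / max(1, total_steps - warmup)
+    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))
+
+def replay_batch(new_data, old_data, batch_size=8, replay_frac=0.05):
+    """Mix a small fraction of previous-dataset examples into each batch."""
+    n_old = max(1, int(batch_size * replay_frac))
+    return random.sample(old_data, n_old) + random.sample(new_data, batch_size - n_old)
+
+print(rewarmed_cosine_lr(step=500, total_steps=100_000))     # still re-warming
+print(rewarmed_cosine_lr(step=50_000, total_steps=100_000))  # mid re-decay
+```
+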
+
+
+
+
+ + ♻ ☆ LongRecipe: Recipe for Efficient Long Context Generalization in Large + Language Models + + +
+ Large language models (LLMs) face significant challenges in handling +long-context tasks because of their limited effective context window size +during pretraining, which restricts their ability to generalize over extended +sequences. Meanwhile, extending the context window in LLMs through +post-pretraining is highly resource-intensive. To address this, we introduce +LongRecipe, an efficient training strategy for extending the context window of +LLMs, including impactful token analysis, position index transformation, and +training optimization strategies. It simulates long-sequence inputs while +maintaining training efficiency and significantly improves the model's +understanding of long-range dependencies. Experiments on three types of LLMs +show that LongRecipe can utilize long sequences while requiring only 30% of the +target context window size, and reduces computational training resource over +85% compared to full sequence training. Furthermore, LongRecipe also preserves +the original LLM's capabilities in general tasks. Ultimately, we can extend the +effective context window of open-source LLMs from 8k to 128k, achieving +performance close to GPT-4 with just one day of dedicated training using a +single GPU with 80G memory. Our code is released at +https://github.com/zhiyuanhubj/LongRecipe. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ♻ ☆ Revisiting Character-level Adversarial Attacks for Language Models ICML 2024 + + +
+ Adversarial attacks in Natural Language Processing apply perturbations at the
+character or token level. Token-level attacks, gaining prominence for their use
+of gradient-based methods, are susceptible to altering sentence semantics,
+leading to invalid adversarial examples. While character-level attacks easily
+maintain semantics, they have received less attention as they cannot easily
+adopt popular gradient-based methods, and are thought to be easy to defend
+against. Challenging these beliefs, we introduce Charmer, an efficient
+query-based adversarial attack capable of achieving a high attack success rate
+(ASR) while generating highly similar adversarial examples. Our method
+successfully targets both small (BERT) and large (Llama 2) models.
+Specifically, on BERT with SST-2, Charmer improves the ASR by 4.84 percentage
+points and the USE similarity by 8 percentage points with respect to the prior
+art. Our implementation is available at https://github.com/LIONS-EPFL/Charmer.
+
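+
+ For intuition only, here is a generic greedy character-substitution loop in
+the spirit of query-based character-level attacks. It is not the Charmer
+algorithm, and `victim_proba` is a hypothetical black-box classifier that
+returns class probabilities for a text.
+
+```python
+import string
+
+def char_attack(text, true_label, victim_proba, max_edits=3):
+    """Greedily apply single-character substitutions that most reduce the
+    victim's confidence in the true label, querying the model per candidate."""
+    current = text
+    for _ in range(max_edits):
+        best_candidate = None
+        best_score = victim_proba(current)[true_label]
+        for i in range(len(current)):
+            for c in string.ascii_lowercase + " ":
+                if c == current[i]:
+                    continue
+                candidate = current[:i] + c + current[i + 1:]
+                score = victim_proba(candidate)[true_label]   # one query per candidate
+                if score < best_score:
+                    best_candidate, best_score = candidate, score
+        if best_candidate is None:
+            break
+        current = best_candidate
+        probs = victim_proba(current)
+        if max(range(len(probs)), key=probs.__getitem__) != true_label:
+            return current                                    # label flipped
+    return current
+```
+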
+
+ comment: Accepted in ICML 2024 +
+
+
+
+
+ + ♻ ☆ LogicGame: Benchmarking Rule-Based Reasoning Abilities of Large Language + Models + + +
+ Large Language Models (LLMs) have demonstrated notable capabilities across +various tasks, showcasing complex problem-solving abilities. Understanding and +executing complex rules, along with multi-step planning, are fundamental to +logical reasoning and critical for practical LLM agents and decision-making +systems. However, evaluating LLMs as effective rule-based executors and +planners remains underexplored. In this paper, we introduce LogicGame, a novel +benchmark designed to evaluate the comprehensive rule understanding, execution, +and planning capabilities of LLMs. Unlike traditional benchmarks, LogicGame +provides diverse games that contain a series of rules with an initial state, +requiring models to comprehend and apply predefined regulations to solve +problems. We create simulated scenarios in which models execute or plan +operations to achieve specific outcomes. These game scenarios are specifically +designed to distinguish logical reasoning from mere knowledge by relying +exclusively on predefined rules. This separation allows for a pure assessment +of rule-based reasoning capabilities. The evaluation considers not only final +outcomes but also intermediate steps, providing a comprehensive assessment of +model performance. Moreover, these intermediate steps are deterministic and can +be automatically verified. LogicGame defines game scenarios with varying +difficulty levels, from simple rule applications to complex reasoning chains, +in order to offer a precise evaluation of model performance on rule +understanding and multi-step execution. Utilizing LogicGame, we test various +LLMs and identify notable shortcomings in their rule-based logical reasoning +abilities. + +
+
+
+
+
+ + ♻ ☆ AI-generated text boundary detection with RoFT + + +
+ Due to the rapid development of large language models, people increasingly
+often encounter texts that may start as written by a human but continue as
+machine-generated. Detecting the boundary between human-written and
+machine-generated parts of such texts is a challenging problem that has not
+received much attention in the literature. We attempt to bridge this gap and
+examine several ways to adapt state-of-the-art artificial text detection
+classifiers to the boundary detection setting. We push all detectors to their
+limits, using the Real or Fake text benchmark that contains short texts on
+several topics and includes generations from various language models. We use
+this diversity to deeply examine the robustness of all detectors in
+cross-domain and cross-model settings to provide baselines and insights for
+future research. In particular, we find that perplexity-based approaches to
+boundary detection tend to be more robust to peculiarities of domain-specific
+data than supervised fine-tuning of the RoBERTa model; we also identify which
+features of the text confuse boundary detection algorithms and negatively
+influence their performance in cross-domain settings.
+
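+
+ A toy sketch of the perplexity-style idea the paper finds robust: given
+per-token log-probabilities from any language model, pick the boundary index
+that maximizes the change in average log-probability between the left and
+right segments. Actual boundary detectors on RoFT are more involved; this only
+illustrates the signal being exploited.
+
+```python
+import numpy as np
+
+def guess_boundary(token_logprobs, min_len=5):
+    """Return the index where mean log-probability differs most between the
+    prefix (assumed human-written) and the suffix (assumed machine-generated)."""
+    lp = np.asarray(token_logprobs, dtype=float)
+    best_i, best_gap = None, -np.inf
+    for i in range(min_len, len(lp) - min_len):
+        gap = lp[i:].mean() - lp[:i].mean()   # machine text tends to be more predictable
+        if gap > best_gap:
+            best_i, best_gap = i, gap
+    return best_i
+
+# toy example: suffix tokens are noticeably more predictable (higher log-prob)
+rng = np.random.default_rng(0)
+human_part = rng.normal(-5.0, 1.0, size=40)
+machine_part = rng.normal(-2.5, 0.5, size=40)
+print(guess_boundary(np.concatenate([human_part, machine_part])))  # close to 40
+```
+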
+
+ comment: Our official repository: + https://github.com/SilverSolver/ai_boundary_detection +
+
+
+
+
+ + ♻ ☆ Negation Blindness in Large Language Models: Unveiling the NO Syndrome + in Image Generation + + +
+ Foundational Large Language Models (LLMs) have changed the way we perceive
+technology. They have been shown to excel in tasks ranging from poem writing
+and coding to essay generation and puzzle solving. With the incorporation of
+image generation capability, they have become more comprehensive and versatile
+AI tools. At the same time, researchers are striving to identify the
+limitations of these tools to improve them further. Currently identified flaws
+include hallucination, biases, and bypassing restricted commands to generate
+harmful content. In the present work, we have identified a fundamental
+limitation related to the image generation ability of LLMs, and termed it The
+NO Syndrome. This negation blindness refers to LLMs' inability to correctly
+comprehend NO-related natural language prompts when generating the desired
+images. Interestingly, all tested LLMs, including GPT-4, Gemini, and Copilot,
+were found to suffer from this syndrome. To demonstrate the generalization of
+this limitation, we carried out simulation experiments and conducted
+entropy-based and benchmark statistical analysis tests on various LLMs in
+multiple languages, including English, Hindi, and French. We conclude that the
+NO syndrome is a significant flaw in current LLMs that needs to be addressed. A
+related finding of this study showed a consistent discrepancy between image and
+textual responses as a result of this NO syndrome. We posit that the
+introduction of a negation context-aware reinforcement learning based feedback
+loop between the LLM's textual response and the generated image could help
+ensure that the generated text is based on both the LLM's correct contextual
+understanding of the negation query and the generated visual output.
+
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Seeing Like an AI: How LLMs Apply (and Misapply) Wikipedia Neutrality + Norms + + +
+ Large language models (LLMs) are trained on broad corpora and then used in +communities with specialized norms. Is providing LLMs with community rules +enough for models to follow these norms? We evaluate LLMs' capacity to detect +(Task 1) and correct (Task 2) biased Wikipedia edits according to Wikipedia's +Neutral Point of View (NPOV) policy. LLMs struggled with bias detection, +achieving only 64% accuracy on a balanced dataset. Models exhibited contrasting +biases (some under- and others over-predicted bias), suggesting distinct priors +about neutrality. LLMs performed better at generation, removing 79% of words +removed by Wikipedia editors. However, LLMs made additional changes beyond +Wikipedia editors' simpler neutralizations, resulting in high-recall but +low-precision editing. Interestingly, crowdworkers rated AI rewrites as more +neutral (70%) and fluent (61%) than Wikipedia-editor rewrites. Qualitative +analysis found LLMs sometimes applied NPOV more comprehensively than Wikipedia +editors but often made extraneous non-NPOV-related changes (such as grammar). +LLMs may apply rules in ways that resonate with the public but diverge from +community experts. While potentially effective for generation, LLMs may reduce +editor agency and increase moderation workload (e.g., verifying additions). +Even when rules are easy to articulate, having LLMs apply them like community +members may still be difficult. + +
+
+
+
+
+ + ♻ ☆ A Causal Explainable Guardrails for Large Language Models + + +
+ Large Language Models (LLMs) have shown impressive performance in natural +language tasks, but their outputs can exhibit undesirable attributes or biases. +Existing methods for steering LLMs toward desired attributes often assume +unbiased representations and rely solely on steering prompts. However, the +representations learned from pre-training can introduce semantic biases that +influence the steering process, leading to suboptimal results. We propose +LLMGuardrail, a novel framework that incorporates causal analysis and +adversarial learning to obtain unbiased steering representations in LLMs. +LLMGuardrail systematically identifies and blocks the confounding effects of +biases, enabling the extraction of unbiased steering representations. +Additionally, it includes an explainable component that provides insights into +the alignment between the generated output and the desired direction. +Experiments demonstrate LLMGuardrail's effectiveness in steering LLMs toward +desired attributes while mitigating biases. Our work contributes to the +development of safe and reliable LLMs that align with desired attributes. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Parallel Speculative Decoding with Adaptive Draft Length + + +
+ Speculative decoding (SD), where an extra draft model is employed to provide
+multiple \textit{draft} tokens first and then the original target model
+verifies these tokens in parallel, has shown great power for LLM inference
+acceleration. However, existing SD methods suffer from the mutual waiting
+problem, i.e., the target model gets stuck when the draft model is
+\textit{guessing} tokens, and vice versa. This problem is directly incurred by
+the asynchronous execution of the draft model and the target model, and is
+exacerbated due to the fixed draft length in speculative decoding. To address
+these challenges, we propose a conceptually simple, flexible, and general
+framework to boost speculative decoding, namely \textbf{P}arallel
+sp\textbf{E}culative decoding with \textbf{A}daptive d\textbf{R}aft
+\textbf{L}ength (PEARL). Specifically, PEARL proposes \textit{pre-verify} to
+verify the first draft token in advance during the drafting phase, and
+\textit{post-verify} to generate more draft tokens during the verification
+phase. PEARL parallels the drafting phase and the verification phase via
+applying the two strategies, and achieves adaptive draft length for different
+scenarios, which effectively alleviates the mutual waiting problem. Moreover,
+we theoretically demonstrate that the mean number of accepted tokens of PEARL
+is higher than that of existing \textit{draft-then-verify} works. Experiments
+on various text generation benchmarks demonstrate the effectiveness of PEARL,
+leading to superior speedups of up to \textbf{3.79$\times$} and
+\textbf{1.52$\times$} compared to auto-regressive decoding and vanilla
+speculative decoding, respectively.
+
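+
+ For background, the sketch below shows vanilla draft-then-verify speculative
+decoding with greedy acceptance, which is the baseline PEARL improves upon;
+PEARL's pre-verify/post-verify parallelism and adaptive draft length are not
+reproduced here. `draft_next` and `target_argmax` are hypothetical greedy
+decoding callables for the draft and target models.
+
+```python
+def speculative_decode(prompt, draft_next, target_argmax, gamma=4, max_new=64):
+    """Vanilla draft-then-verify loop (greedy variant): the draft model proposes
+    `gamma` tokens, the target model scores them in one pass, and the longest
+    agreeing prefix is accepted plus one token from the target model."""
+    out = list(prompt)
+    while len(out) - len(prompt) < max_new:
+        # drafting phase: the small model proposes gamma tokens autoregressively
+        draft = []
+        for _ in range(gamma):
+            draft.append(draft_next(out + draft))
+        # verification phase: one target-model pass yields gamma + 1 greedy tokens,
+        # one for each drafted position plus a bonus continuation
+        targets = target_argmax(out, draft)
+        accepted = 0
+        while accepted < gamma and draft[accepted] == targets[accepted]:
+            accepted += 1
+        out += draft[:accepted]
+        out.append(targets[accepted])     # correction token, or bonus token if all accepted
+    return out
+```
+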
+
+
+
+
+ + ♻ ☆ HIRO: Hierarchical Information Retrieval Optimization + + +
+ Retrieval-Augmented Generation (RAG) has revolutionized natural language +processing by dynamically integrating external knowledge into Large Language +Models (LLMs), addressing their limitation of static training datasets. Recent +implementations of RAG leverage hierarchical data structures, which organize +documents at various levels of summarization and information density. This +complexity, however, can cause LLMs to "choke" on information overload, +necessitating more sophisticated querying mechanisms. In this context, we +introduce Hierarchical Information Retrieval Optimization (HIRO), a novel +querying approach that employs a Depth-First Search (DFS)-based recursive +similarity score calculation and branch pruning. This method uniquely minimizes +the context delivered to the LLM without informational loss, effectively +managing the challenge of excessive data. HIRO's refined approach is validated +by a 10.85% improvement in performance on the NarrativeQA dataset. + +
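+
+ A small sketch of the DFS-with-pruning idea: recurse down a hierarchical
+summary tree, score each node against the query, and stop descending whenever a
+branch falls below a threshold. The abstract only describes HIRO's recursive
+scoring at a high level, so the cosine score and fixed threshold here are
+assumptions for illustration.
+
+```python
+import numpy as np
+
+class Node:
+    def __init__(self, text, embedding, children=()):
+        self.text = text
+        self.embedding = np.asarray(embedding, dtype=float)
+        self.children = list(children)
+
+def cosine(a, b):
+    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+
+def hiro_collect(node, query_emb, threshold=0.3, results=None):
+    """Depth-first traversal of a summary -> section -> passage tree, pruning
+    any branch whose similarity to the query drops below the threshold."""
+    if results is None:
+        results = []
+    score = cosine(node.embedding, query_emb)
+    if score < threshold:
+        return results                       # prune this whole branch
+    if not node.children:
+        results.append((score, node.text))   # leaf passage survives pruning
+    for child in node.children:
+        hiro_collect(child, query_emb, threshold, results)
+    return sorted(results, reverse=True)     # highest-scoring passages first
+```
+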
+
+
+
+
+ + ♻ ☆ What Formal Languages Can Transformers Express? A Survey + + +
+ As transformers have gained prominence in natural language processing, some +researchers have investigated theoretically what problems they can and cannot +solve, by treating problems as formal languages. Exploring such questions can +help clarify the power of transformers relative to other models of computation, +their fundamental capabilities and limits, and the impact of architectural +choices. Work in this subarea has made considerable progress in recent years. +Here, we undertake a comprehensive survey of this work, documenting the diverse +assumptions that underlie different results and providing a unified framework +for harmonizing seemingly contradictory findings. + +
+
+ comment: One minor correction in {\S}5.1 +
+
+
+
+
+ + ♻ ☆ Large Language Models for Information Retrieval: A Survey + + +
+ As a primary means of information acquisition, information retrieval (IR) +systems, such as search engines, have integrated themselves into our daily +lives. These systems also serve as components of dialogue, question-answering, +and recommender systems. The trajectory of IR has evolved dynamically from its +origins in term-based methods to its integration with advanced neural models. +While the neural models excel at capturing complex contextual signals and +semantic nuances, thereby reshaping the IR landscape, they still face +challenges such as data scarcity, interpretability, and the generation of +contextually plausible yet potentially inaccurate responses. This evolution +requires a combination of both traditional methods (such as term-based sparse +retrieval methods with rapid response) and modern neural architectures (such as +language models with powerful language understanding capacity). Meanwhile, the +emergence of large language models (LLMs), typified by ChatGPT and GPT-4, has +revolutionized natural language processing due to their remarkable language +understanding, generation, generalization, and reasoning abilities. +Consequently, recent research has sought to leverage LLMs to improve IR +systems. Given the rapid evolution of this research trajectory, it is necessary +to consolidate existing methodologies and provide nuanced insights through a +comprehensive overview. In this survey, we delve into the confluence of LLMs +and IR systems, including crucial aspects such as query rewriters, retrievers, +rerankers, and readers. Additionally, we explore promising directions, such as +search agents, within this expanding field. + +
+
+ comment: updated to version 3 +
+
+
+
+
+ + ♻ ☆ Towards a Universal Method for Meaningful Signal Detection + + +
+ It is known that human speech and certain animal vocalizations can convey +meaningful content because we can decipher the content that a given utterance +does convey. This paper explores an alternative approach to determining whether +a signal is meaningful, one that analyzes only the signal itself and is +independent of what the conveyed meaning might be. We devise a method that +takes a waveform as input and outputs a score indicating its degree of +`meaningfulness`. We cluster contiguous portions of the input to minimize the +total description length, and then take the length of the code of the assigned +cluster labels as meaningfulness score. We evaluate our method empirically, +against several baselines, and show that it is the only one to give a high +score to human speech in various languages and with various speakers, a +moderate score to animal vocalizations from birds and orcas, and a low score to +ambient noise from various sources. + +
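+
+ The following is only a very loose toy version of the pipeline shape
+described: fixed-length frames are clustered and the code length of the label
+sequence is computed. It uses fixed-k k-means instead of description-length-
+minimizing clustering and a simple i.i.d. entropy code instead of the paper's
+coder, so the resulting scores should not be read as reproducing the paper's
+rankings.
+
+```python
+import numpy as np
+from sklearn.cluster import KMeans
+
+def label_code_length(waveform, frame=400, k=16):
+    """Cluster fixed-length frames and return the entropy-based code length
+    (bits per frame) of the resulting cluster-label sequence."""
+    waveform = np.asarray(waveform, dtype=float)
+    n = len(waveform) // frame
+    frames = waveform[: n * frame].reshape(n, frame)
+    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(frames)
+    counts = np.bincount(labels, minlength=k).astype(float)
+    probs = counts[counts > 0] / counts.sum()
+    return float(-(probs * np.log2(probs)).sum())
+
+rng = np.random.default_rng(0)
+noise = rng.normal(size=16000)                                   # unstructured signal
+tone = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)        # highly regular signal
+print(label_code_length(noise), label_code_length(tone))         # compare per-frame code lengths
+```
+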
+
+
+
+
+ + ♻ ☆ Open Implementation and Study of BEST-RQ for Speech Processing ICASSP 2024 + + +
+ Self-Supervised Learning (SSL) has proven to be useful in various speech
+tasks. However, these methods are generally very demanding in terms of data,
+memory, and computational resources. BERT-based Speech pre-Training with
+Random-projection Quantizer (BEST-RQ) is an SSL method that has shown great
+performance on Automatic Speech Recognition (ASR) while being simpler than
+other SSL methods, such as wav2vec 2.0. Despite BEST-RQ's great performance,
+details are lacking in the original paper, such as the amount of GPU/TPU hours
+used in pre-training, and there is no official easy-to-use open-source
+implementation. Furthermore, BEST-RQ has not been evaluated on downstream
+tasks other than ASR and speech translation. In this work, we describe a
+re-implementation of a random-projection quantizer and perform a preliminary
+study with a comparison to wav2vec 2.0 on four downstream tasks. We discuss the
+details and differences of our implementation. We show that a random-projection
+quantizer can achieve downstream performance similar to wav2vec 2.0 while
+decreasing training time by over a factor of two.
+
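+
+ To make the random-projection quantizer concrete, here is a minimal numpy
+sketch of the component as it is usually described: a frozen random projection
+of the speech features and a frozen random codebook, with nearest-neighbour
+lookup producing discrete targets for masked prediction. Dimensions and
+normalization details below are illustrative, not those of the referenced
+implementation.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+feat_dim, proj_dim, codebook_size = 80, 16, 8192
+
+# both the projection and the codebook are randomly initialized and never trained
+P = rng.normal(size=(feat_dim, proj_dim))
+codebook = rng.normal(size=(codebook_size, proj_dim))
+codebook /= np.linalg.norm(codebook, axis=1, keepdims=True)
+
+def quantize(features):
+    """Map (T, feat_dim) speech features to (T,) discrete pre-training targets."""
+    z = features @ P
+    z /= np.linalg.norm(z, axis=1, keepdims=True) + 1e-9   # unit-normalize projections
+    return np.argmax(z @ codebook.T, axis=1)               # nearest codeword by cosine
+
+mel_frames = rng.normal(size=(100, feat_dim))   # stand-in for log-mel features
+targets = quantize(mel_frames)                  # labels the SSL model learns to predict
+```
+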
+
+ comment: Accepted in IEEE ICASSP 2024 workshop on Self-supervision in Audio, + Speech and Beyond (SASB 2024) +
+
+
+
+
+ + ♻ ☆ Prompt Compression with Context-Aware Sentence Encoding for Fast and + Improved LLM Inference + + +
+ Large language models (LLMs) have triggered a new stream of research focusing +on compressing the context length to reduce the computational cost while +ensuring the retention of helpful information for LLMs to answer the given +question. Token-based removal methods are one of the most prominent approaches +in this direction, but risk losing the semantics of the context caused by +intermediate token removal, especially under high compression ratios, while +also facing challenges in computational efficiency. In this work, we propose +context-aware prompt compression (CPC), a sentence-level prompt compression +technique where its key innovation is a novel context-aware sentence encoder +that provides a relevance score for each sentence for a given question. To +train this encoder, we generate a new dataset consisting of questions, +positives, and negative pairs where positives are sentences relevant to the +question, while negatives are irrelevant context sentences. We train the +encoder in a contrastive setup to learn context-aware sentence representations. +Our method considerably outperforms prior works on prompt compression on +benchmark datasets and is up to 10.93x faster at inference compared to the best +token-level compression method. We also find better improvement for shorter +length constraints in most benchmarks, showing the effectiveness of our +proposed solution in the compression of relevant information in a shorter +context. Finally, we release the code and the dataset for quick reproducibility +and further development: https://github.com/Workday/cpc. + +
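+
+ A simplified sketch of sentence-level prompt compression as described: score
+each context sentence for relevance to the question and keep the highest-
+scoring ones within a budget. The paper's contrastively trained context-aware
+encoder is replaced here by a hypothetical `embed` function and plain cosine
+similarity.
+
+```python
+import re
+import numpy as np
+
+def compress_prompt(context, question, embed, keep_ratio=0.4):
+    """Keep the most question-relevant sentences, preserving their original order.
+    `embed` is any sentence encoder returning a 1-D vector."""
+    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", context) if s.strip()]
+    q = embed(question)
+    scores = []
+    for s in sentences:
+        v = embed(s)
+        scores.append(float(q @ v / (np.linalg.norm(q) * np.linalg.norm(v) + 1e-9)))
+    k = max(1, int(len(sentences) * keep_ratio))
+    keep = set(np.argsort(scores)[-k:])                      # indices of the top-k sentences
+    return " ".join(s for i, s in enumerate(sentences) if i in keep)
+```
+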
+
+
+
+
+ + ♻ ☆ CADGE: Context-Aware Dialogue Generation Enhanced with Graph-Structured + Knowledge Aggregation + + +
+ Commonsense knowledge is crucial to many natural language processing tasks. +Existing works usually incorporate graph knowledge with conventional graph +neural networks (GNNs), leading to the text and graph knowledge encoding +processes being separated in a serial pipeline. We argue that these separate +representation learning stages may be suboptimal for neural networks to learn +the overall context contained in both types of input knowledge. In this paper, +we propose a novel context-aware graph-attention model (Context-aware GAT), +which can effectively incorporate global features of relevant knowledge graphs +based on a context-enhanced knowledge aggregation process. Specifically, our +framework leverages a novel representation learning approach to process +heterogeneous features - combining flattened graph knowledge with text. To the +best of our knowledge, this is the first attempt at hierarchically applying +graph knowledge aggregation on a connected subgraph in addition to contextual +information to support commonsense dialogue generation. This framework shows +superior performance compared to conventional GNN-based language frameworks. +Both automatic and human evaluation demonstrates that our proposed model has +significant performance uplifts over state-of-the-art baselines. + +
+
+ comment: Accepted by INLG 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Sindhi Word Segmentation using Subword Representation Learning + and Position-aware Self-attention + + +
+ Sindhi word segmentation is a challenging task due to space omission and +insertion issues. The Sindhi language itself adds to this complexity. It's +cursive and consists of characters with inherent joining and non-joining +properties, independent of word boundaries. Existing Sindhi word segmentation +methods rely on designing and combining hand-crafted features. However, these +methods have limitations, such as difficulty handling out-of-vocabulary words, +limited robustness for other languages, and inefficiency with large amounts of +noisy or raw text. Neural network-based models, in contrast, can automatically +capture word boundary information without requiring prior knowledge. In this +paper, we propose a Subword-Guided Neural Word Segmenter (SGNWS) that addresses +word segmentation as a sequence labeling task. The SGNWS model incorporates +subword representation learning through a bidirectional long short-term memory +encoder, position-aware self-attention, and a conditional random field. Our +empirical results demonstrate that the SGNWS model achieves state-of-the-art +performance in Sindhi word segmentation on six datasets. + +
+
+ comment: Journal Paper, 14 pages +
+
+
+
+
+ + ♻ ☆ A Sentence is Worth a Thousand Pictures: Can Large Language Models + Understand Hum4n L4ngu4ge and the W0rld behind W0rds? + + +
+ Modern Artificial Intelligence applications show great potential for +language-related tasks that rely on next-word prediction. The current +generation of Large Language Models (LLMs) have been linked to claims about +human-like linguistic performance and their applications are hailed both as a +step towards artificial general intelligence and as a major advance in +understanding the cognitive, and even neural basis of human language. To assess +these claims, first we analyze the contribution of LLMs as theoretically +informative representations of a target cognitive system vs. atheoretical +mechanistic tools. Second, we evaluate the models' ability to see the bigger +picture, through top-down feedback from higher levels of processing, which +requires grounding in previous expectations and past world experience. We +hypothesize that since models lack grounded cognition, they cannot take +advantage of these features and instead solely rely on fixed associations +between represented words and word vectors. To assess this, we designed and ran +a novel 'leet task' (l33t t4sk), which requires decoding sentences in which +letters are systematically replaced by numbers. The results suggest that humans +excel in this task whereas models struggle, confirming our hypothesis. We +interpret the results by identifying the key abilities that are still missing +from the current state of development of these models, which require solutions +that go beyond increased system scaling. + +
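+
+ As a small illustration of how such a l33t-style stimulus can be generated,
+the snippet below systematically replaces letters with visually similar digits;
+the exact substitution map used in the paper may differ.
+
+```python
+LEET_MAP = str.maketrans({"a": "4", "e": "3", "i": "1", "o": "0", "s": "5", "t": "7"})
+
+def to_leet(sentence: str) -> str:
+    """Replace letters with visually similar digits (applied to lowercase text)."""
+    return sentence.lower().translate(LEET_MAP)
+
+print(to_leet("A sentence is worth a thousand pictures."))
+# -> "4 53n73nc3 15 w0r7h 4 7h0u54nd p1c7ur35."
+```
+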
+
+
+
+
+ + ♻ ☆ Exploring Interpretability of Independent Components of Word Embeddings + with Automated Word Intruder Test LREC + + +
+ Independent Component Analysis (ICA) is an algorithm originally developed for +finding separate sources in a mixed signal, such as a recording of multiple +people in the same room speaking at the same time. Unlike Principal Component +Analysis (PCA), ICA permits the representation of a word as an unstructured set +of features, without any particular feature being deemed more significant than +the others. In this paper, we used ICA to analyze word embeddings. We have +found that ICA can be used to find semantic features of the words, and these +features can easily be combined to search for words that satisfy the +combination. We show that most of the independent components represent such +features. To quantify the interpretability of the components, we use the word +intruder test, performed both by humans and by large language models. We +propose to use the automated version of the word intruder test as a fast and +inexpensive way of quantifying vector interpretability without the need for +human effort. + +
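+
+ A minimal sketch of the analysis pipeline: run FastICA over a word-embedding
+matrix and, for each independent component, inspect the top-loading words and
+build a word-intruder item from a word that loads weakly on that component. The
+preprocessing and intruder-selection rules in the paper may differ from the
+simple choices made here.
+
+```python
+import numpy as np
+from sklearn.decomposition import FastICA
+
+def component_top_words(embeddings, words, n_components=50, top_k=5):
+    """Return the ICA loading matrix and, per component, its top-loading words."""
+    ica = FastICA(n_components=n_components, random_state=0, max_iter=1000)
+    S = ica.fit_transform(embeddings)           # (n_words, n_components) loadings
+    tops = []
+    for c in range(n_components):
+        order = np.argsort(S[:, c])[::-1]
+        tops.append([words[i] for i in order[:top_k]])
+    return S, tops
+
+def intruder_item(S, words, component, top_k=5, seed=0):
+    """Word-intruder test item: top words of one component plus a low-loading intruder."""
+    rng = np.random.default_rng(seed)
+    order = np.argsort(S[:, component])[::-1]
+    top_words = [words[i] for i in order[:top_k]]
+    intruder = words[int(rng.choice(order[len(order) // 2:]))]
+    return top_words, intruder
+```
+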
+
+ comment: Presented at LREC-COLING 2024, cite this version please: + https://aclanthology.org/2024.lrec-main.605/ +
+
+
+
+
+ + ♻ ☆ Vision-Language and Large Language Model Performance in + Gastroenterology: GPT, Claude, Llama, Phi, Mistral, Gemma, and Quantized + Models + + +
+ Background and Aims: This study evaluates the medical reasoning performance +of large language models (LLMs) and vision language models (VLMs) in +gastroenterology. + Methods: We used 300 gastroenterology board exam-style multiple-choice +questions, 138 of which contain images to systematically assess the impact of +model configurations and parameters and prompt engineering strategies utilizing +GPT-3.5. Next, we assessed the performance of proprietary and open-source LLMs +(versions), including GPT (3.5, 4, 4o, 4omini), Claude (3, 3.5), Gemini (1.0), +Mistral, Llama (2, 3, 3.1), Mixtral, and Phi (3), across different interfaces +(web and API), computing environments (cloud and local), and model precisions +(with and without quantization). Finally, we assessed accuracy using a +semiautomated pipeline. + Results: Among the proprietary models, GPT-4o (73.7%) and Claude3.5-Sonnet +(74.0%) achieved the highest accuracy, outperforming the top open-source +models: Llama3.1-405b (64%), Llama3.1-70b (58.3%), and Mixtral-8x7b (54.3%). +Among the quantized open-source models, the 6-bit quantized Phi3-14b (48.7%) +performed best. The scores of the quantized models were comparable to those of +the full-precision models Llama2-7b, Llama2--13b, and Gemma2-9b. Notably, VLM +performance on image-containing questions did not improve when the images were +provided and worsened when LLM-generated captions were provided. In contrast, a +10% increase in accuracy was observed when images were accompanied by +human-crafted image descriptions. + Conclusion: In conclusion, while LLMs exhibit robust zero-shot performance in +medical reasoning, the integration of visual data remains a challenge for VLMs. +Effective deployment involves carefully determining optimal model +configurations, encouraging users to consider either the high performance of +proprietary models or the flexible adaptability of open-source models. + +
+
+ comment: Manuscript Pages: 34, Figures: 7, Tables: 2, Supplementary File + Pages: 35, Data Transparency Statement: Code is available at: + https://github.com/Sdamirsa/LLM-VLM-in-Gastroenterology . Study data from + American College of Gastroenterology (ACG) are restricted and available upon + request with ACG permission. Correction: updated abstract considering + Llama3.1 results +
+
+
+
+
+ + ♻ ☆ Towards Measuring and Modeling "Culture" in LLMs: A Survey + + +
+ We present a survey of more than 90 recent papers that aim to study cultural
+representation and inclusion in large language models (LLMs). We observe that
+none of the studies explicitly define "culture," which is a complex,
+multifaceted concept; instead, they probe the models on specially designed
+datasets that represent certain aspects of "culture." We call these aspects the
+proxies of culture, and organize them across two dimensions of demographic and
+semantic proxies. We also categorize the probing methods employed. Our analysis
+indicates that only certain aspects of "culture," such as values and
+objectives, have been studied, leaving several other interesting and important
+facets, especially the multitude of semantic domains (Thompson et al., 2020)
+and aboutness (Hershcovich et al., 2022), unexplored. Two other crucial gaps
+are the lack of robustness of probing techniques and situated studies on the
+impact of cultural mis- and under-representation in LLM-based applications.
+
+
+
+
+
+ + ♻ ☆ Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction + Retriever EMNLP + + +
+ Multi-vector dense models, such as ColBERT, have proven highly effective in
+information retrieval. ColBERT's late interaction scoring approximates the
+joint query-document attention seen in cross-encoders while maintaining
+inference efficiency closer to traditional dense retrieval models, thanks to
+its bi-encoder architecture and recent optimizations in indexing and search. In
+this paper, we introduce a novel architecture and a training framework to
+support long context windows and multilingual retrieval. Our new model,
+Jina-ColBERT-v2, demonstrates strong performance across a range of English and
+multilingual retrieval tasks.
+
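+ For context on the late interaction scoring mentioned above, the following is
+a minimal NumPy sketch of ColBERT-style MaxSim scoring over token-level
+embeddings; the shapes, normalization, and random vectors are illustrative
+only, not Jina-ColBERT-v2's implementation.
+
+    import numpy as np
+
+    def maxsim_score(query_emb, doc_emb):
+        # query_emb: (num_query_tokens, dim), doc_emb: (num_doc_tokens, dim), L2-normalized.
+        sim = query_emb @ doc_emb.T          # cosine similarity per token pair
+        return float(sim.max(axis=1).sum())  # best-matching doc token per query token
+
+    rng = np.random.default_rng(0)
+    q = rng.normal(size=(8, 128));   q /= np.linalg.norm(q, axis=1, keepdims=True)
+    d = rng.normal(size=(120, 128)); d /= np.linalg.norm(d, axis=1, keepdims=True)
+    print(maxsim_score(q, d))
+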
+
+ comment: 8 pages, references at pp7,8; EMNLP workshop submission +
+
+
+
+
+ + ♻ ☆ An Empirical Study on Information Extraction using Large Language Models + + +
+ Human-like large language models (LLMs), especially the most powerful and
+popular ones in OpenAI's GPT family, have proven to be very helpful for many
+natural language processing (NLP) related tasks. Therefore, various attempts
+have been made to apply LLMs to information extraction (IE), which is a
+fundamental NLP task that involves extracting information from unstructured
+plain text. To demonstrate the latest representative progress in LLMs'
+information extraction ability, we assess the IE ability of GPT-4 (the latest
+version of GPT at the time of writing this paper) from four perspectives:
+Performance, Evaluation Criteria, Robustness, and Error Types. Our results
+suggest a visible performance gap between GPT-4 and state-of-the-art (SOTA) IE
+methods. To alleviate this problem, considering the LLMs' human-like
+characteristics, we propose and analyze the effects of a series of simple
+prompt-based methods, which can be generalized to other LLMs and NLP tasks.
+Extensive experiments show the effectiveness of our methods and some of their
+remaining issues in improving GPT-4's information extraction ability.
+
+
+
+ comment: This article has an earlier arXiv version entitled "Is Information
+ Extraction Solved by ChatGPT? An Analysis of Performance, Evaluation
+ Criteria, Robustness and Errors", available at arXiv:2305.14450
+
+
+
+
+
+ + ♻ ☆ Can AI Replace Human Subjects? A Large-Scale Replication of + Psychological Experiments with LLMs + + +
+ Artificial Intelligence (AI) is increasingly being integrated into scientific +research, particularly in the social sciences, where understanding human +behavior is critical. Large Language Models (LLMs) like GPT-4 have shown +promise in replicating human-like responses in various psychological +experiments. However, the extent to which LLMs can effectively replace human +subjects across diverse experimental contexts remains unclear. Here, we conduct +a large-scale study replicating 154 psychological experiments from top social +science journals with 618 main effects and 138 interaction effects using GPT-4 +as a simulated participant. We find that GPT-4 successfully replicates 76.0 +percent of main effects and 47.0 percent of interaction effects observed in the +original studies, closely mirroring human responses in both direction and +significance. However, only 19.44 percent of GPT-4's replicated confidence +intervals contain the original effect sizes, with the majority of replicated +effect sizes exceeding the 95 percent confidence interval of the original +studies. Additionally, there is a 71.6 percent rate of unexpected significant +results where the original studies reported null findings, suggesting potential +overestimation or false positives. Our results demonstrate the potential of +LLMs as powerful tools in psychological research but also emphasize the need +for caution in interpreting AI-driven findings. While LLMs can complement human +studies, they cannot yet fully replace the nuanced insights provided by human +subjects. + +
+
+ comment: 5 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Enhancing Dialogue Generation in Werewolf Game Through Situation + Analysis and Persuasion Strategies + + +
+ Recent advancements in natural language processing, particularly with large
+language models (LLMs) like GPT-4, have significantly enhanced dialogue
+systems, enabling them to generate more natural and fluent conversations.
+Despite these improvements, challenges persist, such as managing continuous
+dialogues, memory retention, and minimizing hallucinations. The AIWolfDial2024
+workshop addresses these challenges by employing the Werewolf Game, an
+incomplete information game, to test the capabilities of LLMs in complex
+interactive environments. This paper introduces an LLM-based Werewolf Game AI,
+where each role is supported by situation analysis to aid response generation.
+Additionally, for the werewolf role, various persuasion strategies, including
+logical appeal, credibility appeal, and emotional appeal, are employed to
+effectively persuade other players to align with its actions.
+
+
+ comment: Accepted to the AIWolfDial2024 workshop at INLG 2024 +
+
+
+
+
+ + ♻ ☆ SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated + Responses + + +
+ Can LLMs consistently improve their previous outputs for better results? For
+this to be true, LLMs would need to be better at discriminating among
+previously generated alternatives than at generating initial responses. We
+explore the validity of this hypothesis in practice. We first formulate a
+unified framework that allows us to compare the generative and discriminative
+capability of any model on any task. In our resulting experimental analysis of
+several open-source and industrial LLMs, we observe that models are not
+reliably better at discriminating among previously generated alternatives than
+at generating initial responses. This finding challenges the notion that LLMs
+may be able to enhance their performance solely through their own judgment.
+
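+ To make the comparison concrete, here is a hedged sketch of how generative
+and discriminative accuracy can be contrasted under a unified framework like
+the one described above; the callables and task fields are placeholders, not
+the paper's implementation.
+
+    def generative_accuracy(generate, tasks):
+        # generate(prompt) -> answer string
+        return sum(generate(t["prompt"]) == t["gold"] for t in tasks) / len(tasks)
+
+    def discriminative_accuracy(choose, tasks):
+        # choose(prompt, candidates) -> the candidate the model judges best;
+        # each task carries candidates the same model generated earlier.
+        return sum(choose(t["prompt"], t["candidates"]) == t["gold"] for t in tasks) / len(tasks)
+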
+
+
+
+
+ + ♻ ☆ LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet + + +
+ Recent large language model (LLM) defenses have greatly improved models' +ability to refuse harmful queries, even when adversarially attacked. However, +LLM defenses are primarily evaluated against automated adversarial attacks in a +single turn of conversation, an insufficient threat model for real-world +malicious use. We demonstrate that multi-turn human jailbreaks uncover +significant vulnerabilities, exceeding 70% attack success rate (ASR) on +HarmBench against defenses that report single-digit ASRs with automated +single-turn attacks. Human jailbreaks also reveal vulnerabilities in machine +unlearning defenses, successfully recovering dual-use biosecurity knowledge +from unlearned models. We compile these results into Multi-Turn Human +Jailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks. +We publicly release MHJ alongside a compendium of jailbreak tactics developed +across dozens of commercial red teaming engagements, supporting research +towards stronger LLM defenses. + +
+
+
+
+
+ + ♻ ☆ Anchored Preference Optimization and Contrastive Revisions: Addressing + Underspecification in Alignment + + +
+ Large Language Models (LLMs) are often aligned using contrastive alignment +objectives and preference pair datasets. The interaction between model, paired +data, and objective makes alignment a complicated procedure, sometimes +producing subpar results. We study this and find that (i) preference data gives +a better learning signal when the underlying responses are contrastive, and +(ii) alignment objectives lead to better performance when they specify more +control over the model during training. Based on these insights, we introduce +Contrastive Learning from AI Revisions (CLAIR), a data-creation method which +leads to more contrastive preference pairs, and Anchored Preference +Optimization (APO), a controllable and more stable alignment objective. We +align Llama-3-8B-Instruct using various comparable datasets and alignment +objectives and measure MixEval-Hard scores, which correlate highly with human +judgments. The CLAIR preferences lead to the strongest performance out of all +datasets, and APO consistently outperforms less controllable objectives. Our +best model, trained on 32K CLAIR preferences with APO, improves +Llama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code +is available at https://github.com/ContextualAI/CLAIR_and_APO. + +
+
+
+
+
+ + ♻ ☆ RT-Surv: Improving Mortality Prediction After Radiotherapy with Large + Language Model Structuring of Large-Scale Unstructured Electronic Health + Records + + +
+ Accurate patient selection is critical in radiotherapy (RT) to prevent +ineffective treatments. Traditional survival prediction models, relying on +structured data, often lack precision. This study explores the potential of +large language models (LLMs) to structure unstructured electronic health record +(EHR) data, thereby improving survival prediction accuracy through +comprehensive clinical information integration. Data from 34,276 patients +treated with RT at Yonsei Cancer Center between 2013 and 2023 were analyzed, +encompassing both structured and unstructured data. An open-source LLM was used +to structure the unstructured EHR data via single-shot learning, with its +performance compared against a domain-specific medical LLM and a smaller +variant. Survival prediction models were developed using statistical, machine +learning, and deep learning approaches, incorporating both structured and +LLM-structured data. Clinical experts evaluated the accuracy of the +LLM-structured data. The open-source LLM achieved 87.5% accuracy in structuring +unstructured EHR data without additional training, significantly outperforming +the domain-specific medical LLM, which reached only 35.8% accuracy. Larger LLMs +were more effective, particularly in extracting clinically relevant features +like general condition and disease extent, which closely correlated with +patient survival. Incorporating LLM-structured clinical features into survival +prediction models significantly improved accuracy, with the C-index of deep +learning models increasing from 0.737 to 0.820. These models also became more +interpretable by emphasizing clinically significant factors. This study shows +that general-domain LLMs, even without specific medical training, can +effectively structure large-scale unstructured EHR data, substantially +enhancing the accuracy and interpretability of clinical predictive models. + +
+
+ comment: 23 pages, 2 tables, 4 figures +
+
+
+
+
+ + ♻ ☆ Learning to Ask: When LLMs Meet Unclear Instruction + + +
+ Equipped with the capability to call functions, modern large language models
+(LLMs) can leverage external tools for addressing a range of tasks unattainable
+through language skills alone. However, the effective execution of these tools
+relies heavily not just on the advanced capabilities of LLMs but also on
+precise user instructions, which often cannot be ensured in the real world. To
+evaluate the tool-use performance of LLMs under imperfect instructions, we
+meticulously examine the real-world instructions queried from users, analyze
+the error patterns, and build a challenging tool-use benchmark called Noisy
+ToolBench (NoisyToolBench). We find that due to the next-token prediction
+training objective, LLMs tend to arbitrarily generate missing arguments, which
+may lead to hallucinations and risks. To address this issue, we propose a
+novel framework, Ask-when-Needed (AwN), which prompts LLMs to ask questions to
+users whenever they encounter obstacles due to unclear instructions. Moreover,
+to reduce the manual labor involved in user-LLM interaction and assess LLMs'
+performance in tool utilization from both accuracy and efficiency perspectives,
+we design an automated evaluation tool named ToolEvaluator. Our experiments
+demonstrate that AwN significantly outperforms existing frameworks for tool
+learning on NoisyToolBench. We will release all related code and datasets
+to support future research.
+
+
+
+
+
+ + ♻ ☆ Predicting Drug-Gene Relations via Analogy Tasks with Word Embeddings + + +
+ Natural language processing (NLP) is utilized in a wide range of fields, +where words in text are typically transformed into feature vectors called +embeddings. BioConceptVec is a specific example of embeddings tailored for +biology, trained on approximately 30 million PubMed abstracts using models such +as skip-gram. Generally, word embeddings are known to solve analogy tasks +through simple vector arithmetic. For instance, $\mathrm{\textit{king}} - +\mathrm{\textit{man}} + \mathrm{\textit{woman}}$ predicts +$\mathrm{\textit{queen}}$. In this study, we demonstrate that BioConceptVec +embeddings, along with our own embeddings trained on PubMed abstracts, contain +information about drug-gene relations and can predict target genes from a given +drug through analogy computations. We also show that categorizing drugs and +genes using biological pathways improves performance. Furthermore, we +illustrate that vectors derived from known relations in the past can predict +unknown future relations in datasets divided by year. Despite the simplicity of +implementing analogy tasks as vector additions, our approach demonstrated +performance comparable to that of large language models such as GPT-4 in +predicting drug-gene relations. + +
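+ The analogy computation described above can be illustrated with a small
+sketch; the vocabulary and vectors below are random placeholders rather than
+BioConceptVec embeddings, and the drug/gene names are hypothetical.
+
+    import numpy as np
+
+    rng = np.random.default_rng(42)
+    vocab = ["drug_A", "gene_A", "drug_B", "gene_B", "gene_C"]
+    emb = {w: rng.normal(size=50) for w in vocab}
+
+    def nearest(vec, exclude=()):
+        cos = lambda a, b: a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
+        return max((w for w in vocab if w not in exclude), key=lambda w: cos(vec, emb[w]))
+
+    # Transfer a known relation (drug_A -> gene_A) to drug_B via vector arithmetic:
+    query = emb["gene_A"] - emb["drug_A"] + emb["drug_B"]
+    print(nearest(query, exclude={"drug_A", "gene_A", "drug_B"}))
+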
+
+
+
+
+ + ♻ ☆ Optimizing Byte-level Representation for End-to-end ASR + + +
+ We propose a novel approach to optimizing a byte-level representation for +end-to-end automatic speech recognition (ASR). Byte-level representation is +often used by large scale multilingual ASR systems when the character set of +the supported languages is large. The compactness and universality of +byte-level representation allow the ASR models to use smaller output +vocabularies and therefore, provide more flexibility. UTF-8 is a commonly used +byte-level representation for multilingual ASR, but it is not designed to +optimize machine learning tasks directly. By using auto-encoder and vector +quantization, we show that we can optimize a byte-level representation for ASR +and achieve better accuracy. Our proposed framework can incorporate information +from different modalities, and provides an error correction mechanism. In an +English/Mandarin dictation task, we show that a bilingual ASR model built with +this approach can outperform UTF-8 representation by 5% relative in error rate. + +
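+ As a small illustration of the byte-level representation discussed above, any
+multilingual string maps to a sequence over a fixed 256-symbol vocabulary and
+can be decoded back losslessly (the paper then optimizes beyond plain UTF-8):
+
+    text = "hello 你好"
+    byte_ids = list(text.encode("utf-8"))    # sequence over a 256-symbol vocabulary
+    print(byte_ids)
+    print(bytes(byte_ids).decode("utf-8"))   # lossless round trip back to the string
+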
+
+ comment: 5 pages, 1 figure, IEEE SLT 2024 +
+
+
+
+
+ + ♻ ☆ Booster: Tackling Harmful Fine-tuning for Large Language Models via + Attenuating Harmful Perturbation + + +
+ The harmful fine-tuning issue \citep{qi2023fine} poses serious safety concerns
+for large language models' fine-tuning-as-a-service. While existing defenses
+\citep{huang2024vaccine,rosati2024representation} have been proposed to
+mitigate the issue, their performance is still far from satisfactory, and the
+root cause of the problem has not been fully uncovered. For the first time in
+the literature, we show in this paper that \textit{harmful perturbation} of the
+model weights is the root cause of the broken alignment that follows harmful
+fine-tuning. In order to attenuate the negative impact of harmful perturbation,
+we propose an alignment-stage solution, dubbed Booster. Technically, along with
+the original alignment loss, we append a loss regularizer in the alignment
+stage's optimization. The regularizer ensures that the model's harmful loss
+reduction before/after simulated harmful perturbation is attenuated, thereby
+mitigating the subsequent fine-tuning risk. Empirical results show that Booster
+can effectively reduce the harmful score of the fine-tuned models while
+maintaining the performance of downstream tasks. Our code is available at
+\url{https://github.com/git-disl/Booster}.
+
+
+
+
+
+ + ♻ ☆ Language-Guided World Models: A Model-Based Approach to AI Control ACL 2024 + + +
+ This paper introduces the concept of Language-Guided World Models (LWMs) -- +probabilistic models that can simulate environments by reading texts. Agents +equipped with these models provide humans with more extensive and efficient +control, allowing them to simultaneously alter agent behaviors in multiple +tasks via natural verbal communication. In this work, we take initial steps in +developing robust LWMs that can generalize to compositionally novel language +descriptions. We design a challenging world modeling benchmark based on the +game of MESSENGER (Hanjie et al., 2021), featuring evaluation settings that +require varying degrees of compositional generalization. Our experiments reveal +the lack of generalizability of the state-of-the-art Transformer model, as it +offers marginal improvements in simulation quality over a no-text baseline. We +devise a more robust model by fusing the Transformer with the EMMA attention +mechanism (Hanjie et al., 2021). Our model substantially outperforms the +Transformer and approaches the performance of a model with an oracle semantic +parsing and grounding capability. To demonstrate the practicality of this model +in improving AI safety and transparency, we simulate a scenario in which the +model enables an agent to present plans to a human before execution, and to +revise plans based on their language feedback. + +
+
+ comment: SpLU-RoboNLP workshop at ACL 2024 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 136 + +
+
+
+ + ☆ HiPrompt: Tuning-free Higher-Resolution Generation with Hierarchical + MLLM Prompts + + +
+ The potential for higher-resolution image generation using pretrained
+diffusion models is immense, yet these models often struggle with issues of
+object repetition and structural artifacts, especially when scaling to 4K
+resolution and higher. We find that the problem arises because a single prompt
+used for generation at multiple scales provides insufficient guidance. In
+response, we propose HiPrompt, a new tuning-free solution that tackles the
+above problems by introducing hierarchical prompts. The hierarchical prompts
+offer both global and local guidance. Specifically, the global guidance comes
+from the user input that describes the overall content, while the local
+guidance utilizes patch-wise descriptions from MLLMs to elaborately guide the
+regional structure and texture generation. Furthermore, during the inverse
+denoising process, the generated noise is decomposed into low- and
+high-frequency spatial components. These components are conditioned on
+multiple prompt levels, including detailed patch-wise descriptions and broader
+image-level prompts, facilitating prompt-guided denoising under hierarchical
+semantic guidance. It further allows the generation to focus more on local
+spatial regions and ensures the generated images maintain coherent local and
+global semantics, structures, and textures with high definition. Extensive
+experiments demonstrate that HiPrompt outperforms state-of-the-art works in
+higher-resolution image generation, significantly reducing object repetition
+and enhancing structural quality.
+
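+ The low/high-frequency split of a spatial map mentioned above can be sketched
+as follows; the Gaussian-blur decomposition and kernel settings are
+illustrative assumptions, not HiPrompt's exact procedure.
+
+    import torch
+    import torch.nn.functional as F
+
+    def split_frequencies(x, kernel_size=9, sigma=3.0):
+        # x: (B, C, H, W) -> (low, high) with low + high == x, via a depthwise Gaussian blur.
+        coords = torch.arange(kernel_size, dtype=x.dtype, device=x.device) - (kernel_size - 1) / 2
+        g = torch.exp(-(coords ** 2) / (2 * sigma ** 2))
+        g = g / g.sum()
+        kernel = (g[:, None] * g[None, :]).repeat(x.shape[1], 1, 1, 1)
+        low = F.conv2d(x, kernel, padding=kernel_size // 2, groups=x.shape[1])
+        return low, x - low
+
+    noise = torch.randn(1, 4, 64, 64)
+    low, high = split_frequencies(noise)
+    print(torch.allclose(noise, low + high))  # True
+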
+
+
+
+
+ + ☆ UC-NeRF: Uncertainty-aware Conditional Neural Radiance Fields from + Endoscopic Sparse Views + + +
+ Visualizing surgical scenes is crucial for revealing internal anatomical +structures during minimally invasive procedures. Novel View Synthesis is a +vital technique that offers geometry and appearance reconstruction, enhancing +understanding, planning, and decision-making in surgical scenes. Despite the +impressive achievements of Neural Radiance Field (NeRF), its direct application +to surgical scenes produces unsatisfying results due to two challenges: +endoscopic sparse views and significant photometric inconsistencies. In this +paper, we propose uncertainty-aware conditional NeRF for novel view synthesis +to tackle the severe shape-radiance ambiguity from sparse surgical views. The +core of UC-NeRF is to incorporate the multi-view uncertainty estimation to +condition the neural radiance field for modeling the severe photometric +inconsistencies adaptively. Specifically, our UC-NeRF first builds a +consistency learner in the form of multi-view stereo network, to establish the +geometric correspondence from sparse views and generate uncertainty estimation +and feature priors. In neural rendering, we design a base-adaptive NeRF network +to exploit the uncertainty estimation for explicitly handling the photometric +inconsistencies. Furthermore, an uncertainty-guided geometry distillation is +employed to enhance geometry learning. Experiments on the SCARED and Hamlyn +datasets demonstrate our superior performance in rendering appearance and +geometry, consistently outperforming the current state-of-the-art approaches. +Our code will be released at \url{https://github.com/wrld/UC-NeRF}. + +
+
+
+
+
+ + ☆ Can LVLMs Obtain a Driver's License? A Benchmark Towards Reliable AGI + for Autonomous Driving + + +
+ Large Vision-Language Models (LVLMs) have recently garnered significant +attention, with many efforts aimed at harnessing their general knowledge to +enhance the interpretability and robustness of autonomous driving models. +However, LVLMs typically rely on large, general-purpose datasets and lack the +specialized expertise required for professional and safe driving. Existing +vision-language driving datasets focus primarily on scene understanding and +decision-making, without providing explicit guidance on traffic rules and +driving skills, which are critical aspects directly related to driving safety. +To bridge this gap, we propose IDKB, a large-scale dataset containing over one +million data items collected from various countries, including driving +handbooks, theory test data, and simulated road test data. Much like the +process of obtaining a driver's license, IDKB encompasses nearly all the +explicit knowledge needed for driving from theory to practice. In particular, +we conducted comprehensive tests on 15 LVLMs using IDKB to assess their +reliability in the context of autonomous driving and provided extensive +analysis. We also fine-tuned popular models, achieving notable performance +improvements, which further validate the significance of our dataset. The +project page can be found at: +\url{https://4dvlab.github.io/project_page/idkb.html} + +
+
+
+
+
+ + ☆ SITAR: Semi-supervised Image Transformer for Action Recognition ICPR 2024 + + +
+ Recognizing actions from a limited set of labeled videos remains a challenge
+as annotating visual data is not only tedious but can also be expensive due to
+its classified nature. Moreover, handling spatio-temporal data using deep $3$D
+transformers for this can introduce significant computational complexity. In
+this paper, our objective is to address video action recognition in a
+semi-supervised setting by leveraging only a handful of labeled videos along
+with a collection of unlabeled videos in a compute-efficient manner.
+Specifically, we rearrange multiple frames from the input videos in row-column
+form to construct super images. Subsequently, we capitalize on the vast pool of
+unlabeled samples and employ contrastive learning on the encoded super images.
+Our proposed approach employs two pathways to generate representations for
+temporally augmented super images originating from the same video.
+Specifically, we utilize a 2D image-transformer to generate representations and
+apply a contrastive loss function to minimize the similarity between
+representations from different videos while maximizing the similarity between
+representations of the same video. Our method demonstrates superior performance
+compared to existing state-of-the-art approaches for semi-supervised action
+recognition across various benchmark datasets, all while significantly reducing
+computational costs.
+
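+ The row-column super-image construction described above can be sketched as
+follows; the near-square grid layout and zero-padding are illustrative
+assumptions rather than the paper's exact arrangement.
+
+    import math
+    import numpy as np
+
+    def make_super_image(frames):
+        # frames: (T, H, W, C) -> one (rows*H, cols*W, C) super image in row-column form.
+        t, h, w, c = frames.shape
+        cols = math.ceil(math.sqrt(t))
+        rows = math.ceil(t / cols)
+        pad = rows * cols - t
+        if pad:
+            frames = np.concatenate([frames, np.zeros((pad, h, w, c), frames.dtype)])
+        grid = frames.reshape(rows, cols, h, w, c)
+        return grid.transpose(0, 2, 1, 3, 4).reshape(rows * h, cols * w, c)
+
+    clip = np.random.rand(9, 32, 32, 3)
+    print(make_super_image(clip).shape)  # (96, 96, 3)
+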
+
+ comment: Accepted at ICPR 2024 +
+
+
+
+
+ + ☆ LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via + Hybrid Architecture + + +
+ Expanding the long-context capabilities of Multi-modal Large Language +Models~(MLLMs) is crucial for video understanding, high-resolution image +understanding, and multi-modal agents. This involves a series of systematic +optimizations, including model architecture, data construction and training +strategy, particularly addressing challenges such as \textit{degraded +performance with more images} and \textit{high computational costs}. In this +paper, we adapt the model architecture to a hybrid of Mamba and Transformer +blocks, approach data construction with both temporal and spatial dependencies +among multiple images and employ a progressive training strategy. The released +model \textbf{LongLLaVA}~(\textbf{Long}-Context \textbf{L}arge +\textbf{L}anguage \textbf{a}nd \textbf{V}ision \textbf{A}ssistant) is the first +hybrid MLLM, which achieved a better balance between efficiency and +effectiveness. LongLLaVA not only achieves competitive results across various +benchmarks, but also maintains high throughput and low memory consumption. +Especially, it could process nearly a thousand images on a single A100 80GB +GPU, showing promising application prospects for a wide range of tasks. + +
+
+ comment: 19 pages, 7 figures, 6 tables +
+
+
+
+
+ + ☆ CanvOI, an Oncology Intelligence Foundation Model: Scaling FLOPS + Differently + + +
+ The rapidly evolving field of digital oncopathology faces significant +challenges, including the need to address diverse and complex clinical +questions, often involving rare conditions, with limited availability of +labeled data. These limitations hinder the development of robust AI-driven +tools in the biomedical space, where accuracy in probabilistic determinations +is of utmost importance. To address this, digital pathology foundation models +have begun to emerge, typically developed with the size and diversity of the +pre-training dataset and model parameters in mind. Here, we present CanvOI, a +ViT-g/10-based foundation model designed to enhance the capabilities of digital +pathology by addressing these challenges through a different approach. +Considering the unique nature of oncologic histopathological images and the +requirements from the embeddings to provide meaningful representations for +Multiple Instance Learning (MIL) downstream models, we chose to modify the +input image characteristics. By introducing larger tile sizes (380 x 380 +pixels) and smaller patch sizes (10 x 10 pixels), we were able to optimize the +model's performance, pushing computational resources in a new direction and +achieving state-of-the-art performance on cancer-related benchmarks. CanvOI +demonstrated a 1.5-7.4% improvement in averaged AUC compared to other leading +foundation models built for digital pathology. Moreover, our results +demonstrate that CanvOI significantly outperformed the other models, with the +performance gap widening substantially when trained on just 10% of the initial +cohort. This work highlights an alternative approach that, if integrated with +traditional development approaches, has the potential to advance Oncology +Intelligence (OI), overcome some of the current barriers and ultimately improve +the clinical outcome of cancer patients. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Multi-stream deep learning framework to predict mild cognitive + impairment with Rey Complex Figure Test + + +
+ Drawing tests like the Rey Complex Figure Test (RCFT) are widely used to +assess cognitive functions such as visuospatial skills and memory, making them +valuable tools for detecting mild cognitive impairment (MCI). Despite their +utility, existing predictive models based on these tests often suffer from +limitations like small sample sizes and lack of external validation, which +undermine their reliability. We developed a multi-stream deep learning +framework that integrates two distinct processing streams: a multi-head +self-attention based spatial stream using raw RCFT images and a scoring stream +employing a previously developed automated scoring system. Our model was +trained on data from 1,740 subjects in the Korean cohort and validated on an +external hospital dataset of 222 subjects from Korea. The proposed multi-stream +model demonstrated superior performance over baseline models (AUC = 0.872, +Accuracy = 0.781) in external validation. The integration of both spatial and +scoring streams enables the model to capture intricate visual details from the +raw images while also incorporating structured scoring data, which together +enhance its ability to detect subtle cognitive impairments. This dual approach +not only improves predictive accuracy but also increases the robustness of the +model, making it more reliable in diverse clinical settings. Our model has +practical implications for clinical settings, where it could serve as a +cost-effective tool for early MCI screening. + +
+
+ comment: 20 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Benchmarking Spurious Bias in Few-Shot Image Classifiers ECCV 2024 + + +
+ Few-shot image classifiers are designed to recognize and classify new data +with minimal supervision and limited data but often show reliance on spurious +correlations between classes and spurious attributes, known as spurious bias. +Spurious correlations commonly hold in certain samples and few-shot classifiers +can suffer from spurious bias induced from them. There is an absence of an +automatic benchmarking system to assess the robustness of few-shot classifiers +against spurious bias. In this paper, we propose a systematic and rigorous +benchmark framework, termed FewSTAB, to fairly demonstrate and quantify varied +degrees of robustness of few-shot classifiers to spurious bias. FewSTAB creates +few-shot evaluation tasks with biased attributes so that using them for +predictions can demonstrate poor performance. To construct these tasks, we +propose attribute-based sample selection strategies based on a pre-trained +vision-language model, eliminating the need for manual dataset curation. This +allows FewSTAB to automatically benchmark spurious bias using any existing test +data. FewSTAB offers evaluation results in a new dimension along with a new +design guideline for building robust classifiers. Moreover, it can benchmark +spurious bias in varied degrees and enable designs for varied degrees of +robustness. Its effectiveness is demonstrated through experiments on ten +few-shot learning methods across three datasets. We hope our framework can +inspire new designs of robust few-shot classifiers. Our code is available at +https://github.com/gtzheng/FewSTAB. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ The Impact of Balancing Real and Synthetic Data on Accuracy and Fairness + in Face Recognition ECCV 2024 + + +
+ Over the recent years, the advancements in deep face recognition have fueled +an increasing demand for large and diverse datasets. Nevertheless, the +authentic data acquired to create those datasets is typically sourced from the +web, which, in many cases, can lead to significant privacy issues due to the +lack of explicit user consent. Furthermore, obtaining a demographically +balanced, large dataset is even more difficult because of the natural imbalance +in the distribution of images from different demographic groups. In this paper, +we investigate the impact of demographically balanced authentic and synthetic +data, both individually and in combination, on the accuracy and fairness of +face recognition models. Initially, several generative methods were used to +balance the demographic representations of the corresponding synthetic +datasets. Then a state-of-the-art face encoder was trained and evaluated using +(combinations of) synthetic and authentic images. Our findings emphasized two +main points: (i) the increased effectiveness of training data generated by +diffusion-based models in enhancing accuracy, whether used alone or combined +with subsets of authentic data, and (ii) the minimal impact of incorporating +balanced data from pre-trained generative methods on fairness (in nearly all +tested scenarios using combined datasets, fairness scores remained either +unchanged or worsened, even when compared to unbalanced authentic datasets). +Source code and data are available at \url{https://cutt.ly/AeQy1K5G} for +reproducibility. + +
+
+ comment: Accepted at Synthetic Data for Computer Vision Workshop - Side Event + at ECCV 2024 +
+
+
+
+
+ + ☆ Hybrid-Segmentor: A Hybrid Approach to Automated Fine-Grained Crack + Segmentation in Civil Infrastructure + + +
+ Detecting and segmenting cracks in infrastructure, such as roads and
+buildings, is crucial for safety and cost-effective maintenance. In spite of
+the potential of deep learning, there are challenges in achieving precise
+results and handling diverse crack types. With the proposed dataset and model,
+we aim to enhance crack detection and infrastructure maintenance. We introduce
+Hybrid-Segmentor, an encoder-decoder based approach that is capable of
+extracting both fine-grained local and global crack features. This allows the
+model to improve its generalization capabilities in distinguishing various
+types of crack shapes, surfaces, and sizes. To keep the computational cost low
+for practical purposes while maintaining the high generalization capability of
+the model, we incorporate a self-attention model at the encoder level and
+reduce the complexity of the decoder component. The proposed model outperforms
+existing benchmark models across 5 quantitative metrics (accuracy 0.971,
+precision 0.804, recall 0.744, F1-score 0.770, and IoU score 0.630), achieving
+state-of-the-art status.
+
+
+ comment: 25 pages, 6 figures +
+
+
+
+
+ + ☆ Human-VDM: Learning Single-Image 3D Human Gaussian Splatting from Video + Diffusion Models + + +
+ Generating lifelike 3D humans from a single RGB image remains a challenging +task in computer vision, as it requires accurate modeling of geometry, +high-quality texture, and plausible unseen parts. Existing methods typically +use multi-view diffusion models for 3D generation, but they often face +inconsistent view issues, which hinder high-quality 3D human generation. To +address this, we propose Human-VDM, a novel method for generating 3D human from +a single RGB image using Video Diffusion Models. Human-VDM provides temporally +consistent views for 3D human generation using Gaussian Splatting. It consists +of three modules: a view-consistent human video diffusion module, a video +augmentation module, and a Gaussian Splatting module. First, a single image is +fed into a human video diffusion module to generate a coherent human video. +Next, the video augmentation module applies super-resolution and video +interpolation to enhance the textures and geometric smoothness of the generated +video. Finally, the 3D Human Gaussian Splatting module learns lifelike humans +under the guidance of these high-resolution and view-consistent images. +Experiments demonstrate that Human-VDM achieves high-quality 3D human from a +single image, outperforming state-of-the-art methods in both generation quality +and quantity. Project page: https://human-vdm.github.io/Human-VDM/ + +
+
+ comment: 14 Pages, 8 figures, Project page: + https://human-vdm.github.io/Human-VDM/ +
+
+
+
+
+ + ☆ MaDis-Stereo: Enhanced Stereo Matching via Distilled Masked Image + Modeling + + +
+ In stereo matching, CNNs have traditionally served as the predominant
+architectures. Although Transformer-based stereo models have been studied
+recently, their performance still lags behind CNN-based stereo models due to
+the inherent data scarcity issue in the stereo matching task. In this paper, we
+propose a Masked Image Modeling Distilled Stereo matching model, termed
+MaDis-Stereo, that enhances locality inductive bias by leveraging Masked Image
+Modeling (MIM) in training a Transformer-based stereo model. Given randomly
+masked stereo images as inputs, our method attempts to conduct both image
+reconstruction and depth prediction tasks. While this strategy is beneficial to
+resolving the data scarcity issue, the dual challenge of reconstructing masked
+tokens and subsequently performing stereo matching poses significant
+challenges, particularly in terms of training stability. To address this, we
+propose to use an auxiliary network (teacher), updated via Exponential Moving
+Average (EMA), along with the original stereo model (student), where teacher
+predictions serve as pseudo supervisory signals to effectively distill
+knowledge into the student model. State-of-the-art performance is achieved with
+the proposed method on several stereo matching benchmarks, such as ETH3D and
+KITTI 2015. Additionally, to demonstrate that our model effectively leverages
+locality inductive bias, we provide the attention distance measurement.
+
+
+
+
+
+ + ☆ iConFormer: Dynamic Parameter-Efficient Tuning with Input-Conditioned + Adaptation + + +
+ Transfer learning based on full fine-tuning (FFT) of the pre-trained encoder
+and task-specific decoder becomes increasingly complex as deep models grow
+exponentially. Parameter efficient fine-tuning (PEFT) approaches using adapters
+consisting of small learnable layers have emerged as an alternative to FFT,
+achieving comparable performance while maintaining high training efficiency.
+However, the inflexibility of the adapter with respect to input instances
+limits its capability of learning task-specific information in diverse
+downstream tasks. In this paper, we propose a novel PEFT approach,
+input-Conditioned transFormer, termed iConFormer, that leverages a dynamic
+adapter conditioned on the input instances. To secure flexible learning ability
+on input instances in various downstream tasks, we introduce an
+input-Conditioned Network (iCoN) in the dynamic adapter that enables
+instance-level feature transformation. To be specific, iCoN generates
+channel-wise convolutional kernels for each feature and transforms it using an
+adaptive convolution process to effectively capture task-specific and
+fine-grained details tailored to downstream tasks. Experimental results
+demonstrate that by tuning just 1.6% to 2.8% of the Transformer backbone
+parameters, iConFormer achieves performance comparable to FFT in monocular
+depth estimation and semantic segmentation, while outperforming it in image
+classification and instance segmentation. Also, the proposed method
+consistently outperforms recent PEFT methods for all the tasks mentioned above.
+
+
+
+
+
+ + ☆ ExpLLM: Towards Chain of Thought for Facial Expression Recognition + + +
+ Facial expression recognition (FER) is a critical task in multimedia with +significant implications across various domains. However, analyzing the causes +of facial expressions is essential for accurately recognizing them. Current +approaches, such as those based on facial action units (AUs), typically provide +AU names and intensities but lack insight into the interactions and +relationships between AUs and the overall expression. In this paper, we propose +a novel method called ExpLLM, which leverages large language models to generate +an accurate chain of thought (CoT) for facial expression recognition. +Specifically, we have designed the CoT mechanism from three key perspectives: +key observations, overall emotional interpretation, and conclusion. The key +observations describe the AU's name, intensity, and associated emotions. The +overall emotional interpretation provides an analysis based on multiple AUs and +their interactions, identifying the dominant emotions and their relationships. +Finally, the conclusion presents the final expression label derived from the +preceding analysis. Furthermore, we also introduce the Exp-CoT Engine, designed +to construct this expression CoT and generate instruction-description data for +training our ExpLLM. Extensive experiments on the RAF-DB and AffectNet datasets +demonstrate that ExpLLM outperforms current state-of-the-art FER methods. +ExpLLM also surpasses the latest GPT-4o in expression CoT generation, +particularly in recognizing micro-expressions where GPT-4o frequently fails. + +
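+ The three-part CoT structure described above (key observations, overall
+emotional interpretation, conclusion) can be sketched as a simple text
+template; the field names and wording below are illustrative, not the Exp-CoT
+Engine's actual format.
+
+    def build_expression_cot(observations, interpretation, label):
+        obs_lines = "\n".join(
+            f"- {o['au']} (intensity: {o['intensity']}): associated with {o['emotion']}"
+            for o in observations
+        )
+        return (
+            "Key observations:\n" + obs_lines + "\n\n"
+            "Overall emotional interpretation:\n" + interpretation + "\n\n"
+            "Conclusion: the expression is " + label + "."
+        )
+
+    print(build_expression_cot(
+        [{"au": "AU12 (lip corner puller)", "intensity": "high", "emotion": "happiness"}],
+        "Strong AU12 activation dominates, indicating a positive emotion.",
+        "happiness",
+    ))
+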
+
+ comment: project page: https://starhiking.github.io/ExpLLM_Page/ +
+
+
+
+
+ + ☆ Automatic facial axes standardization of 3D fetal ultrasound images + + +
+ Craniofacial anomalies indicate early developmental disturbances and are
+usually linked to many genetic syndromes. Early diagnosis is critical, yet
+ultrasound (US) examinations often fail to identify these features. This study
+presents an AI-driven tool to assist clinicians in standardizing fetal facial
+axes/planes in 3D US, reducing sonographer workload and facilitating facial
+evaluation. Our network, structured into three blocks (feature extractor,
+rotation and translation regression, and spatial transformer), processes three
+orthogonal 2D slices to estimate the necessary transformations for
+standardizing the facial planes in the 3D US. These transformations are applied
+to the original 3D US using a differentiable module (the spatial transformer
+block), yielding a standardized 3D US and the corresponding 2D facial standard
+planes. The dataset used consists of 1180 fetal facial 3D US images acquired
+between weeks 20 and 35 of gestation. Results show that our network
+considerably reduces inter-observer rotation variability in the test set, with
+a mean geodesic angle difference of 14.12$^{\circ}$ $\pm$ 18.27$^{\circ}$ and
+an Euclidean angle error of 7.45$^{\circ}$ $\pm$ 14.88$^{\circ}$. These
+findings demonstrate the network's ability to effectively standardize facial
+axes, crucial for consistent fetal facial assessments. In conclusion, the
+proposed network demonstrates potential for improving the consistency and
+accuracy of fetal facial assessments in clinical settings, facilitating early
+evaluation of craniofacial anomalies.
+
+
+
+
+
+ + ☆ Deep Learning Meets Satellite Images -- An Evaluation on Handcrafted and + Learning-based Features for Multi-date Satellite Stereo Images ECCV2024 + + +
+ A critical step in digital surface model (DSM) generation is feature
+matching. Off-track (or multi-date) satellite stereo images, in particular, can
+challenge the performance of feature matching due to spectral distortions
+between images, long baseline, and wide intersection angles. Feature matching
+methods have evolved over the years from handcrafted methods (e.g., SIFT) to
+learning-based methods (e.g., SuperPoint and SuperGlue). In this paper, we
+compare the performance of different features, also known as feature extraction
+and matching methods, applied to satellite imagery. A wide range of stereo
+pairs (~500) covering two separate study sites is used. SIFT, as a widely used
+classic feature extraction and matching algorithm, is compared with seven
+deep-learning matching methods: SuperGlue, LightGlue, LoFTR, ASpanFormer, DKM,
+GIM-LightGlue, and GIM-DKM. Results demonstrate that traditional matching
+methods are still competitive in this age of deep learning, although for
+particular scenarios learning-based methods are very promising.
+
+
+ comment: ECCV2024 Workshop - TradiCV +
+
+
+
+
+ + ☆ MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding + Benchmark + + +
+ This paper introduces MMMU-Pro, a robust version of the Massive +Multi-discipline Multimodal Understanding and Reasoning (MMMU) benchmark. +MMMU-Pro rigorously assesses multimodal models' true understanding and +reasoning capabilities through a three-step process based on MMMU: (1) +filtering out questions answerable by text-only models, (2) augmenting +candidate options, and (3) introducing a vision-only input setting where +questions are embedded within images. This setting challenges AI to truly "see" +and "read" simultaneously, testing a fundamental human cognitive skill of +seamlessly integrating visual and textual information. Results show that model +performance is substantially lower on MMMU-Pro than on MMMU, ranging from 16.8% +to 26.9% across models. We explore the impact of OCR prompts and Chain of +Thought (CoT) reasoning, finding that OCR prompts have minimal effect while CoT +generally improves performance. MMMU-Pro provides a more rigorous evaluation +tool, closely mimicking real-world scenarios and offering valuable directions +for future research in multimodal AI. + +
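+ The first filtering step described above can be sketched as follows; the
+voting threshold and the model interfaces are illustrative assumptions, not
+the benchmark's exact protocol.
+
+    def filter_text_answerable(questions, text_only_models, max_correct=0):
+        # Keep only questions that the text-only models fail to answer.
+        kept = []
+        for q in questions:
+            n_correct = sum(
+                model(q["question"], q["options"]) == q["answer"]
+                for model in text_only_models
+            )
+            if n_correct <= max_correct:
+                kept.append(q)
+        return kept
+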
+
+
+
+
+ + ☆ UnLearning from Experience to Avoid Spurious Correlations + + +
+ While deep neural networks can achieve state-of-the-art performance in many +tasks, these models are more fragile than they appear. They are prone to +learning spurious correlations in their training data, leading to surprising +failure cases. In this paper, we propose a new approach that addresses the +issue of spurious correlations: UnLearning from Experience (ULE). Our method is +based on using two classification models trained in parallel: student and +teacher models. Both models receive the same batches of training data. The +student model is trained with no constraints and pursues the spurious +correlations in the data. The teacher model is trained to solve the same +classification problem while avoiding the mistakes of the student model. As +training is done in parallel, the better the student model learns the spurious +correlations, the more robust the teacher model becomes. The teacher model uses +the gradient of the student's output with respect to its input to unlearn +mistakes made by the student. We show that our method is effective on the +Waterbirds, CelebA, Spawrious and UrbanCars datasets. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Validation of musculoskeletal segmentation model with uncertainty + estimation for bone and muscle assessment in hip-to-knee clinical CT images + + +
+ Deep learning-based image segmentation has allowed for the fully automated,
+accurate, and rapid analysis of musculoskeletal (MSK) structures from medical
+images. However, current approaches were either applied only to 2D
+cross-sectional images, addressed few structures, or were validated on small
+datasets, which limits their application in large-scale databases. This study
+aimed to validate an improved deep learning model for volumetric MSK
+segmentation of the hip and thigh with uncertainty estimation from clinical
+computed tomography (CT) images. Databases of CT images from multiple
+manufacturers/scanners, disease status, and patient positioning were used. The
+segmentation accuracy and the accuracy in estimating the structures' volume and
+density, i.e., mean HU, were evaluated. An approach for segmentation failure
+detection based on predictive uncertainty was also investigated. The model has
+shown an overall improvement with respect to all segmentation accuracy and
+structure volume/density evaluation metrics. The predictive uncertainty yielded
+large areas under the receiver operating characteristic (AUROC) curves
+(AUROC >= 0.95) in detecting inaccurate and failed segmentations. The high
+segmentation and muscle volume/density estimation accuracy, along with the high
+accuracy in failure detection based on the predictive uncertainty, exhibited
+the model's reliability for analyzing individual MSK structures in large-scale
+CT databases.
+
+
+ comment: 29 pages, 7+10supp figures, 8 tables +
+
+
+
+
+ + ☆ CLDA: Collaborative Learning for Enhanced Unsupervised Domain Adaptation + + +
+ Unsupervised Domain Adaptation (UDA) endeavors to bridge the gap between a
+model trained on a labeled source domain and its deployment in an unlabeled
+target domain. However, current high-performance models demand significant
+resources, resulting in prohibitive deployment costs and highlighting the need
+for small yet effective models. For UDA of lightweight models, Knowledge
+Distillation (KD) in a Teacher-Student framework can be a common approach, but
+we find that domain shift in UDA leads to a significant increase in non-salient
+parameters in the teacher model, degrading the model's generalization ability
+and transferring misleading information to the student model. Interestingly, we
+observed that this phenomenon occurs considerably less in the student model.
+Driven by this insight, we introduce Collaborative Learning, a method that
+updates the teacher's non-salient parameters using the student model and, at
+the same time, enhances the student's performance using the updated teacher
+model. Experiments across various tasks and datasets show consistent
+performance improvements for both student and teacher models. For example, in
+semantic segmentation, CLDA achieves an improvement of +0.7% mIoU for the
+teacher and +1.4% mIoU for the student compared to the baseline model on the
+GTA to Cityscapes benchmark. On Synthia to Cityscapes, it achieves an
+improvement of +0.8% mIoU for the teacher and +2.0% mIoU for the student.
+
+
+
+
+
+ + ☆ Rethinking HTG Evaluation: Bridging Generation and Recognition + + +
+ The evaluation of generative models for natural image tasks has been +extensively studied. Similar protocols and metrics are used in cases with +unique particularities, such as Handwriting Generation, even if they might not +be completely appropriate. In this work, we introduce three measures tailored +for HTG evaluation, $ \text{HTG}_{\text{HTR}} $, $ \text{HTG}_{\text{style}} $, +and $ \text{HTG}_{\text{OOV}} $, and argue that they are more expedient to +evaluate the quality of generated handwritten images. The metrics rely on the +recognition error/accuracy of Handwriting Text Recognition and Writer +Identification models and emphasize writing style, textual content, and +diversity as the main aspects that adhere to the content of handwritten images. +We conduct comprehensive experiments on the IAM handwriting database, +showcasing that widely used metrics such as FID fail to properly quantify the +diversity and the practical utility of generated handwriting samples. Our +findings show that our metrics are richer in information and underscore the +necessity of standardized evaluation protocols in HTG. The proposed metrics +provide a more robust and informative protocol for assessing HTG quality, +contributing to improved performance in HTR. Code for the evaluation protocol +is available at: https://github.com/koninik/HTG_evaluation. + +
+
+
+
+
+ + ☆ Improved Single Camera BEV Perception Using Multi-Camera Training SC 2024 + + +
+ Bird's Eye View (BEV) map prediction is essential for downstream autonomous
+driving tasks like trajectory prediction. In the past, this was accomplished
+through the use of a sophisticated sensor configuration that captured a
+surround view from multiple cameras. However, in large-scale production, cost
+efficiency is an optimization goal, so using fewer cameras becomes more
+relevant. The trade-off is that fewer input images lead to a performance drop.
+This raises the problem of developing a BEV perception model that provides
+sufficient performance on a low-cost sensor setup. Although primarily relevant
+for inference on production cars, this cost restriction is less problematic on
+a test vehicle during training. Therefore, the objective of our approach is to
+reduce the aforementioned performance drop as much as possible using a modern
+multi-camera surround view model reduced for single-camera inference. The
+approach includes three features: a modern masking technique, a cyclic Learning
+Rate (LR) schedule, and a feature reconstruction loss for supervising the
+transition from six-camera inputs to one-camera input during training. Our
+method outperforms versions trained strictly with one camera or strictly with
+six-camera surround view for single-camera inference, resulting in reduced
+hallucination and better quality of the BEV map.
+
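+ Of the three training features listed above, the cyclic learning-rate
+schedule is the most generic; a minimal PyTorch sketch using the built-in
+CyclicLR scheduler follows, with illustrative hyperparameters and a placeholder
+model rather than the paper's settings.
+
+    import torch
+
+    model = torch.nn.Linear(10, 2)                    # placeholder for the BEV model
+    optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
+    scheduler = torch.optim.lr_scheduler.CyclicLR(
+        optimizer, base_lr=1e-5, max_lr=1e-3, step_size_up=500, mode="triangular"
+    )
+
+    for step in range(2000):                          # training loop placeholder
+        optimizer.step()                              # loss.backward() would precede this
+        scheduler.step()
+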
+
+ comment: This Paper has been accepted to the 27th IEEE International + Conference on Intelligent Transportation Systems (ITSC 2024) +
+
+
+
+
+ + ☆ Multi-Head Attention Residual Unfolded Network for Model-Based + Pansharpening + + +
+ The objective of pansharpening and hypersharpening is to accurately combine a +high-resolution panchromatic (PAN) image with a low-resolution multispectral +(MS) or hyperspectral (HS) image, respectively. Unfolding fusion methods +integrate the powerful representation capabilities of deep learning with the +robustness of model-based approaches. These techniques involve unrolling the +steps of the optimization scheme derived from the minimization of an energy +into a deep learning framework, resulting in efficient and highly interpretable +architectures. In this paper, we propose a model-based deep unfolded method for +satellite image fusion. Our approach is based on a variational formulation that +incorporates the classic observation model for MS/HS data, a high-frequency +injection constraint based on the PAN image, and an arbitrary convex prior. For +the unfolding stage, we introduce upsampling and downsampling layers that use +geometric information encoded in the PAN image through residual networks. The +backbone of our method is a multi-head attention residual network (MARNet), +which replaces the proximity operator in the optimization scheme and combines +multiple head attentions with residual learning to exploit image +self-similarities via nonlocal operators defined in terms of patches. +Additionally, we incorporate a post-processing module based on the MARNet +architecture to further enhance the quality of the fused images. Experimental +results on PRISMA, Quickbird, and WorldView2 datasets demonstrate the superior +performance of our method and its ability to generalize across different sensor +configurations and varying spatial and spectral resolutions. The source code +will be available at https://github.com/TAMI-UIB/MARNet. + +
+
+
+
+
+ + ☆ Standing on the Shoulders of Giants: Reprogramming Visual-Language Model + for General Deepfake Detection + + +
+ The proliferation of deepfake faces poses huge potential negative impacts on +our daily lives. Despite substantial advancements in deepfake detection over +these years, the generalizability of existing methods against forgeries from +unseen datasets or created by emerging generative models remains constrained. +In this paper, inspired by the zero-shot advantages of Vision-Language Models +(VLMs), we propose a novel approach that repurposes a well-trained VLM for +general deepfake detection. Motivated by the model reprogramming paradigm that +manipulates the model prediction via data perturbations, our method can +reprogram a pretrained VLM model (e.g., CLIP) solely based on manipulating its +input without tuning the inner parameters. Furthermore, we insert a pseudo-word +guided by facial identity into the text prompt. Extensive experiments on +several popular benchmarks demonstrate that (1) the cross-dataset and +cross-manipulation performances of deepfake detection can be significantly and +consistently improved (e.g., over 88% AUC in cross-dataset setting from FF++ to +WildDeepfake) using a pre-trained CLIP model with our proposed reprogramming +method; (2) our superior performances are at less cost of trainable parameters, +making it a promising approach for real-world applications. + +
+
+
+
+
+ + ☆ PoseTalk: Text-and-Audio-based Pose Control and Motion Refinement for + One-Shot Talking Head Generation + + +
+ While previous audio-driven talking head generation (THG) methods generate +head poses from driving audio, the generated poses or lips cannot match the +audio well or are not editable. In this study, we propose \textbf{PoseTalk}, a +THG system that can freely generate lip-synchronized talking head videos with +free head poses conditioned on text prompts and audio. The core insight of our +method is using head pose to connect visual, linguistic, and audio signals. +First, we propose to generate poses from both audio and text prompts, where the +audio offers short-term variations and rhythm correspondence of the head +movements and the text prompts describe the long-term semantics of head +motions. To achieve this goal, we devise a Pose Latent Diffusion (PLD) model to +generate motion latent from text prompts and audio cues in a pose latent space. +Second, we observe a loss-imbalance problem: the loss for the lip region +contributes less than 4\% of the total reconstruction loss caused by both pose +and lip, making optimization lean towards head movements rather than lip +shapes. To address this issue, we propose a refinement-based learning strategy +to synthesize natural talking videos using two cascaded networks, i.e., +CoarseNet, and RefineNet. The CoarseNet estimates coarse motions to produce +animated images in novel poses and the RefineNet focuses on learning finer lip +motions by progressively estimating lip motions from low-to-high resolutions, +yielding improved lip-synchronization performance. Experiments demonstrate our +pose prediction strategy achieves better pose diversity and realness compared +to text-only or audio-only, and our video generator model outperforms +state-of-the-art methods in synthesizing talking videos with natural head +motions. Project: https://junleen.github.io/projects/posetalk. + +
+
+ comment: 7+5 pages, 15 figures +
+
+
+
+
+ + ☆ Skip-and-Play: Depth-Driven Pose-Preserved Image Generation for Any + Objects + + +
+ The emergence of diffusion models has enabled the generation of diverse +high-quality images solely from text, prompting subsequent efforts to enhance +the controllability of these models. Despite the improvement in +controllability, pose control remains limited to specific objects (e.g., +humans) or poses (e.g., frontal view) due to the fact that pose is generally +controlled via camera parameters (e.g., rotation angle) or keypoints (e.g., +eyes, nose). Specifically, camera parameters-conditional pose control models +generate unrealistic images depending on the object, owing to the small size of +3D datasets for training. Also, keypoint-based approaches encounter challenges +in acquiring reliable keypoints for various objects (e.g., church) or poses +(e.g., back view). To address these limitations, we propose depth-based pose +control, as depth maps are easily obtainable from a single depth estimation +model regardless of objects and poses, unlike camera parameters and keypoints. +However, depth-based pose control confronts issues of shape dependency, as +depth maps influence not only the pose but also the shape of the generated +images. To tackle this issue, we propose Skip-and-Play (SnP), designed via +analysis of the impact of three components of depth-conditional ControlNet on +the pose and the shape of the generated images. To be specific, based on the +analysis, we selectively skip parts of the components to mitigate shape +dependency on the depth map while preserving the pose. Through various +experiments, we demonstrate the superiority of SnP over baselines and showcase +the ability of SnP to generate images of diverse objects and poses. Remarkably, +SnP exhibits the ability to generate images even when the objects in the +condition (e.g., a horse) and the prompt (e.g., a hedgehog) differ from each +other. + +
+
+
+
+
+ + ☆ Creating a Microstructure Latent Space with Rich Material Information + for Multiphase Alloy Design + + +
+ The intricate microstructure serves as the cornerstone for the +composition/processing-structure-property (CPSP) connection in multiphase +alloys. Traditional alloy design methods often overlook microstructural +details, which diminishes the reliability and effectiveness of the outcomes. +This study introduces an improved alloy design algorithm that integrates +authentic microstructural information to establish precise CPSP relationships. +The approach utilizes a deep-learning framework based on a variational +autoencoder to map real microstructural data to a latent space, enabling the +prediction of composition, processing steps, and material properties from the +latent space vector. By integrating this deep learning model with a specific +sampling strategy in the latent space, a novel, microstructure-centered +algorithm for multiphase alloy design is developed. This algorithm is +demonstrated through the design of a unified dual-phase steel, and the results +are assessed at three performance levels. Moreover, an exploration into the +latent vector space of the model highlights its seamless interpolation ability +and its rich material information content. Notably, the current configuration +of the latent space is particularly advantageous for alloy design, offering an +exhaustive representation of microstructure, composition, processing, and +property variations essential for multiphase alloys. + +
+
+
+
+
+ + ☆ Learning-Based Error Detection System for Advanced Vehicle Instrument + Cluster Rendering + + +
+ The automotive industry is currently expanding digital display options with +every new model that comes onto the market. This entails not just an expansion +in dimensions, resolution, and customization choices, but also the capability +to employ novel display effects like overlays while assembling the content of +the display cluster. Unfortunately, this raises the need for appropriate +monitoring systems that can detect rendering errors and apply appropriate +countermeasures when required. Classical solutions such as Cyclic Redundancy +Checks (CRC) will soon no longer be viable, as any sort of alpha blending, +warping, or scaling of content can cause unwanted CRC violations. Therefore, we +propose a novel monitoring approach to verify the correctness of displayed +content, using telltales (e.g. warning signs) as an example. It uses a +learning-based approach to separate "good" telltales, i.e. those that a human +driver will understand correctly, from "corrupted" telltales, i.e. those that +will not be visible or perceived correctly. As a result, it possesses inherent +resilience against individual pixel errors and implicitly supports changing +backgrounds, overlay, or scaling effects. This is underlined by our +experimental study, in which all "corrupted" test patterns were correctly +classified while no false alarms were triggered. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ MADiff: Motion-Aware Mamba Diffusion Models for Hand Trajectory + Prediction on Egocentric Videos + + +
+ Understanding human intentions and actions through egocentric videos is +important on the path to embodied artificial intelligence. As a branch of +egocentric vision techniques, hand trajectory prediction plays a vital role in +comprehending human motion patterns, benefiting downstream tasks in extended +reality and robot manipulation. However, capturing high-level human intentions +consistent with reasonable temporal causality is challenging when only +egocentric videos are available. This difficulty is exacerbated under camera +egomotion interference and the absence of affordance labels to explicitly guide +the optimization of hand waypoint distribution. In this work, we propose a +novel hand trajectory prediction method dubbed MADiff, which forecasts future +hand waypoints with diffusion models. The devised denoising operation in the +latent space is achieved by our proposed motion-aware Mamba, where the camera +wearer's egomotion is integrated to achieve motion-driven selective scan +(MDSS). To discern the relationship between hands and scenarios without +explicit affordance supervision, we leverage a foundation model that fuses +visual and language features to capture high-level semantics from video clips. +Comprehensive experiments conducted on five public datasets with the existing +and our proposed new evaluation metrics demonstrate that MADiff predicts +comparably reasonable hand trajectories compared to the state-of-the-art +baselines, and achieves real-time performance. We will release our code and +pretrained models of MADiff at the project page: +https://irmvlab.github.io/madiff.github.io. + +
+
+
+
+
+ + ☆ Loopy: Taming Audio-Driven Portrait Avatar with Long-Term Motion + Dependency + + +
+ With the introduction of diffusion-based video generation techniques, +audio-conditioned human video generation has recently achieved significant +breakthroughs in both the naturalness of motion and the synthesis of portrait +details. Due to the limited control of audio signals in driving human motion, +existing methods often add auxiliary spatial signals to stabilize movements, +which may compromise the naturalness and freedom of motion. In this paper, we +propose an end-to-end audio-only conditioned video diffusion model named Loopy. +Specifically, we designed an inter- and intra-clip temporal module and an +audio-to-latents module, enabling the model to leverage long-term motion +information from the data to learn natural motion patterns and improving +audio-portrait movement correlation. This method removes the need for manually +specified spatial motion templates used in existing methods to constrain motion +during inference. Extensive experiments show that Loopy outperforms recent +audio-driven portrait diffusion models, delivering more lifelike and +high-quality results across various scenarios. + +
+
+
+
+
+ + ☆ AdvSecureNet: A Python Toolkit for Adversarial Machine Learning + + +
+ Machine learning models are vulnerable to adversarial attacks. Several tools +have been developed to research these vulnerabilities, but they often lack +comprehensive features and flexibility. We introduce AdvSecureNet, a +PyTorch-based toolkit for adversarial machine learning that is the first to +natively support multi-GPU setups for attacks, defenses, and evaluation. It is +also the first toolkit to support both CLI and API interfaces as well as +external YAML configuration files, enhancing versatility and reproducibility. +The toolkit includes multiple attacks, defenses, and evaluation metrics. +Rigorous software engineering practices are followed to ensure high code +quality and maintainability. The project is available as an open-source project +on GitHub at https://github.com/melihcatal/advsecurenet and installable via +PyPI. + +
+
+
+
+
+ + ☆ GoT-CQA: Graph-of-Thought Guided Compositional Reasoning for Chart + Question Answering + + +
+ Chart Question Answering (CQA) aims at answering questions based on the +visual chart content, which plays an important role in chart summarization, +business data analysis, and data report generation. CQA is a challenging +multi-modal task because of the strong context dependence and complex reasoning +requirements. The former refers to answering a question strictly based on the +analysis of the visual content or internal data of the given chart, while the +latter emphasizes the various logical and numerical reasoning involved in the +answer prediction process. In this paper, we focus on the complex reasoning in +the CQA task, and propose a novel Graph-of-Thought (GoT) guided compositional +reasoning model called GoT-CQA to overcome this problem. First, we transform +the chart-oriented question into a directed acyclic GoT composed of multiple +operator nodes, including localization, numerical, and logical operators. This +intuitively reflects the human reasoning process for solving the question. +After that, we design an efficient auto-compositional reasoning framework +guided by the GoT to execute the multi-step reasoning operations required by +various types of questions. Comprehensive experiments on the ChartQA and +PlotQA-D datasets show that GoT-CQA achieves outstanding performance, +especially on complex human-written and reasoning questions, compared with the +latest popular baselines. + +
+
+
+
+
+ + ☆ A Medical Multimodal Large Language Model for Pediatric Pneumonia + + +
+ Pediatric pneumonia is the leading cause of death among children under five +years worldwide, imposing a substantial burden on affected families. Currently, +there are three significant hurdles in diagnosing and treating pediatric +pneumonia. Firstly, pediatric pneumonia shares similar symptoms with other +respiratory diseases, making rapid and accurate differential diagnosis +challenging. Secondly, primary hospitals often lack sufficient medical +resources and experienced doctors. Lastly, providing personalized diagnostic +reports and treatment recommendations is labor-intensive and time-consuming. To +tackle these challenges, we proposed a Medical Multimodal Large Language Model +for Pediatric Pneumonia (P2Med-MLLM). It was capable of handling diverse +clinical tasks, such as generating free-text radiology reports and medical +records within a unified framework. Specifically, P2Med-MLLM can process both +pure text and image-text data, trained on an extensive and large-scale dataset +(P2Med-MD), including real clinical information from 163,999 outpatient and +8,684 inpatient cases. This dataset comprised 2D chest X-ray images, 3D chest +CT images, corresponding radiology reports, and outpatient and inpatient +records. We designed a three-stage training strategy to enable P2Med-MLLM to +comprehend medical knowledge and follow instructions for various clinical +tasks. To rigorously evaluate P2Med-MLLM's performance, we developed +P2Med-MBench, a benchmark consisting of 642 meticulously verified samples by +pediatric pulmonology specialists, covering six clinical decision-support tasks +and a balanced variety of diseases. The automated scoring results demonstrated +the superiority of P2Med-MLLM. This work plays a crucial role in assisting +primary care doctors with prompt disease diagnosis and treatment planning, +reducing severe symptom mortality rates, and optimizing the allocation of +medical resources. + +
+
+ comment: 18 pages, 10 figures +
+
+
+
+
+ + ☆ A Fashion Item Recommendation Model in Hyperbolic Space CVPR 2024 + + +
+ In this work, we propose a fashion item recommendation model that +incorporates hyperbolic geometry into user and item representations. Using +hyperbolic space, our model aims to capture implicit hierarchies among items +based on their visual data and users' purchase history. During training, we +apply a multi-task learning framework that considers both hyperbolic and +Euclidean distances in the loss function. Our experiments on three data sets +show that our model performs better than previous models trained in Euclidean +space only, confirming the effectiveness of our model. Our ablation studies +show that multi-task learning plays a key role, and removing the Euclidean loss +substantially deteriorates the model performance. + +
+
+ comment: This work was presented at the CVFAD Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ SurgTrack: CAD-Free 3D Tracking of Real-world Surgical Instruments + + +
+ Vision-based surgical navigation has received increasing attention due to its +non-invasive, cost-effective, and flexible advantages. In particular, a +critical element of a vision-based navigation system is tracking surgical +instruments. Compared with 2D instrument tracking methods, 3D instrument +tracking has broader value in clinical practice, but is also more challenging +due to weak texture, occlusion, and the lack of Computer-Aided Design (CAD) +models for 3D registration. To solve these challenges, we propose SurgTrack, a +two-stage 3D instrument tracking method for CAD-free and robust real-world +applications. In the first registration stage, we incorporate an Instrument +Signed Distance Field (SDF) modeling the 3D representation of instruments, +achieving CAD-free 3D registration. This allows us to obtain the location and +orientation of instruments in 3D space by matching the video stream with the +registered SDF model. In the second tracking stage, we devise a posture graph +optimization module, leveraging the historical tracking results of the posture +memory pool to optimize the tracking results and improve occlusion robustness. +Furthermore, we collect the Instrument3D dataset to comprehensively evaluate +the 3D tracking of surgical instruments. Extensive experiments validate the +superiority and scalability of SurgTrack, which outperforms the +state-of-the-art methods by a remarkable margin. The code and dataset are +available at https://github.com/wenwucode/SurgTrack. + +
+
+
+
+
+ + ☆ BMI Prediction from Handwritten English Characters Using a Convolutional + Neural Network + + +
+ A person's Body Mass Index, or BMI, is the most widely used parameter for +assessing their health. BMI is a crucial predictor of potential diseases that +may arise at higher body fat levels because it is correlated with body fat. +Conversely, a community's or an individual's nutritional status can be +determined using the BMI. Although deep learning models are used in several +studies to estimate BMI from face photos and other data, no previous research +has established a clear connection between deep learning techniques for +handwriting analysis and BMI prediction. This article addresses this research +gap with a deep learning approach to estimating BMI from handwritten characters +by developing a convolutional neural network (CNN). A dataset of lowercase +English handwriting samples from 48 people was collected for the BMI prediction +task. The proposed CNN-based approach reports a commendable accuracy of 99.92%. +A performance comparison with other popular CNN architectures reveals that +AlexNet and InceptionV3 achieve the second- and third-best performance, with +accuracies of 99.69% and 99.53%, respectively. + +
+
+
+
+
+ + ☆ Object Gaussian for Monocular 6D Pose Estimation from Sparse Views + + +
+ Monocular object pose estimation, as a pivotal task in computer vision and +robotics, heavily depends on accurate 2D-3D correspondences, which often demand +costly CAD models that may not be readily available. Object 3D reconstruction +methods offer an alternative, among which recent advancements in 3D Gaussian +Splatting (3DGS) offer compelling potential. Yet its performance still suffers, +and it tends to overfit when few input views are available. Embracing this +challenge, we introduce SGPose, a novel framework for sparse view object pose +estimation using Gaussian-based methods. Given as few as ten views, SGPose +generates a geometry-aware representation by starting with a random cuboid +initialization, eschewing reliance on Structure-from-Motion (SfM) +pipeline-derived geometry as required by traditional 3DGS methods. SGPose +removes the dependence on CAD models by regressing dense 2D-3D correspondences +between images and the reconstructed model from sparse input and random +initialization, with geometry-consistent depth supervision and online synthetic +view warping being key to its success. Experiments on typical benchmarks, +especially on the Occlusion LM-O dataset, demonstrate that SGPose outperforms +existing methods even under sparse view constraints, underscoring its potential +in real-world applications. + +
+
+
+
+
+ + ☆ Solving Video Inverse Problems Using Image Diffusion Models + + +
+ Recently, diffusion model-based inverse problem solvers (DIS) have emerged as +state-of-the-art approaches for addressing inverse problems, including image +super-resolution, deblurring, inpainting, etc. However, their application to +video inverse problems arising from spatio-temporal degradation remains largely +unexplored due to the challenges in training video diffusion models. To address +this issue, here we introduce an innovative video inverse solver that leverages +only image diffusion models. Specifically, by drawing inspiration from the +success of the recent decomposed diffusion sampler (DDS), our method treats the +time dimension of a video as the batch dimension of image diffusion models and +solves spatio-temporal optimization problems within denoised spatio-temporal +batches derived from each image diffusion model. Moreover, we introduce a +batch-consistent diffusion sampling strategy that encourages consistency across +batches by synchronizing the stochastic noise components in image diffusion +models. Our approach synergistically combines batch-consistent sampling with +simultaneous optimization of denoised spatio-temporal batches at each reverse +diffusion step, resulting in a novel and efficient diffusion sampling strategy +for video inverse problems. Experimental results demonstrate that our method +effectively addresses various spatio-temporal degradations in video inverse +problems, achieving state-of-the-art reconstructions. Project page: +https://solving-video-inverse.github.io/main/ + +
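+ A minimal sketch of the batch-consistent sampling idea described above (sharing the stochastic noise across the temporal/batch dimension so every frame sees the same noise realization); the function name is ours and the surrounding DDS/reverse-diffusion machinery is not reproduced.

```python
from typing import Optional

import torch

def batch_consistent_noise(frames: int, channels: int, height: int, width: int,
                           generator: Optional[torch.Generator] = None) -> torch.Tensor:
    """Draw one noise realization and share it across the temporal batch."""
    shared = torch.randn(1, channels, height, width, generator=generator)
    return shared.expand(frames, -1, -1, -1).clone()

# Inside a reverse-diffusion step, this tensor would be used wherever fresh
# per-sample noise is normally drawn, e.g.
#   z = batch_consistent_noise(T, C, H, W)
#   x_prev = mean + sigma_t * z
```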
+
+ comment: 22 pages, 16 figures +
+
+
+
+
+ + ☆ Evaluation Study on SAM 2 for Class-agnostic Instance-level Segmentation + + +
+ Segment Anything Model (SAM) has demonstrated powerful zero-shot segmentation +performance in natural scenes. The recently released Segment Anything Model 2 +(SAM2) has further heightened researchers' expectations of image segmentation +capabilities. To evaluate the performance of SAM2 on class-agnostic +instance-level segmentation tasks, we adopt different prompt strategies for +SAM2 to cope with instance-level tasks in three relevant scenarios: Salient +Instance Segmentation (SIS), Camouflaged Instance Segmentation (CIS), and +Shadow Instance Detection (SID). In addition, to further explore the +effectiveness of SAM2 in segmenting granular object structures, we also conduct +detailed tests on the high-resolution Dichotomous Image Segmentation (DIS) +benchmark to assess its fine-grained segmentation capability. Qualitative and +quantitative experimental results indicate that the performance of SAM2 varies +significantly across different scenarios. Moreover, SAM2 is not particularly +strong at segmenting high-resolution fine details. We hope this technical +report can drive the emergence of SAM2-based adapters, aiming to enhance the +performance ceiling of large vision models on class-agnostic instance +segmentation tasks. + +
+
+
+
+
+ + ☆ How Do You Perceive My Face? Recognizing Facial Expressions in + Multi-Modal Context by Modeling Mental Representations + + +
+ Facial expression perception in humans inherently relies on prior knowledge +and contextual cues, contributing to efficient and flexible processing. For +instance, multi-modal emotional context (such as voice color, affective text, +body pose, etc.) can prompt people to perceive emotional expressions in +objectively neutral faces. Drawing inspiration from this, we introduce a novel +approach for facial expression classification that goes beyond simple +classification tasks. Our model accurately classifies a perceived face and +synthesizes the corresponding mental representation perceived by a human when +observing a face in context. With this, our model offers visual insights into +its internal decision-making process. We achieve this by learning two +independent representations of content and context using a VAE-GAN +architecture. Subsequently, we propose a novel attention mechanism for +context-dependent feature adaptation. The adapted representation is used for +classification and to generate a context-augmented expression. We evaluate +synthesized expressions in a human study, showing that our model effectively +produces approximations of human mental representations. We achieve +State-of-the-Art classification accuracies of 81.01% on the RAVDESS dataset and +79.34% on the MEAD dataset. We make our code publicly available. + +
+
+ comment: GCPR 2024 +
+
+
+
+
+ + ☆ Interacting Multiple Model-based Joint Homography Matrix and Multiple + Object State Estimation + + +
+ A novel MOT algorithm, IMM Joint Homography State Estimation (IMM-JHSE), is +proposed. By jointly modelling the camera projection matrix as part of track +state vectors, IMM-JHSE removes the explicit influence of camera motion +compensation techniques on predicted track position states, which was prevalent +in previous approaches. Expanding upon this, static and dynamic camera motion +models are combined through the use of an IMM filter. A simple bounding box +motion model is used to predict bounding box positions to incorporate image +plane information. In addition to applying an IMM to camera motion, a +non-standard IMM approach is applied where bounding-box-based BIoU scores are +mixed with ground-plane-based Mahalanobis distances in an IMM-like fashion to +perform association only. Finally, IMM-JHSE makes use of dynamic process and +measurement noise estimation techniques. IMM-JHSE improves upon related +techniques on the DanceTrack and KITTI-car datasets, increasing HOTA by 2.64 +and 2.11, respectively, while offering competitive performance on the MOT17, +MOT20 and KITTI-pedestrian datasets. + +
+
+ comment: Preprint submitted to Information Fusion +
+
+
+
+
+ + ☆ Low-Resolution Object Recognition with Cross-Resolution Relational + Contrastive Distillation + + +
+ Recognizing objects in low-resolution images is a challenging task due to the +lack of informative details. Recent studies have shown that knowledge +distillation approaches can effectively transfer knowledge from a +high-resolution teacher model to a low-resolution student model by aligning +cross-resolution representations. However, these approaches still face +limitations in adapting to the situation where the recognized objects exhibit +significant representation discrepancies between training and testing images. +In this study, we propose a cross-resolution relational contrastive +distillation approach to facilitate low-resolution object recognition. Our +approach enables the student model to mimic the behavior of a well-trained +teacher model which delivers high accuracy in identifying high-resolution +objects. To extract sufficient knowledge, the student learning is supervised +with contrastive relational distillation loss, which preserves the similarities +in various relational structures in contrastive representation space. In this +manner, the capability of recovering missing details of familiar low-resolution +objects can be effectively enhanced, leading to a better knowledge transfer. +Extensive experiments on low-resolution object classification and +low-resolution face recognition clearly demonstrate the effectiveness and +adaptability of our approach. + +
+
+ comment: This paper is accepted by IEEE Transactions on Circuits and Systems + for Video Technology (TCSVT) +
+
+
+
+
+ + ☆ Real-Time Dynamic Scale-Aware Fusion Detection Network: Take Road Damage + Detection as an example + + +
+ Unmanned Aerial Vehicle (UAV)-based Road Damage Detection (RDD) is important +for daily maintenance and safety in cities, especially in terms of +significantly reducing labor costs. However, current UAV-based RDD research +still faces many challenges. For example, damage with irregular size and +orientation, the masking of damage by the background, and the difficulty of +distinguishing damage from the background all significantly affect the ability +of UAVs to detect road damage during daily inspection. To solve these problems +and improve the performance of UAVs in real-time road damage detection, we +design and propose three corresponding modules: a feature extraction module +that flexibly adapts to shape and background; a module that fuses multi-scale +perception and adapts to shape and background; and an efficient downsampling +module. Based on these modules, we design a multi-scale, adaptive road damage +detection model that automatically suppresses background interference, called +the Dynamic Scale-Aware Fusion Detection Model (RT-DSAFDet). Experimental +results on the public UAV-PDD2023 dataset show that RT-DSAFDet achieves a mAP50 +of 54.2%, which is 11.1% higher than that of YOLOv10-m, an efficient variant of +the latest real-time object detection model YOLOv10, while the number of +parameters is reduced to 1.8M and the FLOPs to 4.6G, decreases of 88% and 93%, +respectively. Furthermore, results on the large general object detection +dataset MS COCO2017 also show the superiority of our model: its mAP50-95 +matches that of YOLOv9-t, with 0.5% higher mAP50, 10% fewer parameters, and 40% +fewer FLOPs. + +
+
+
+
+
+ + ☆ UniTT-Stereo: Unified Training of Transformer for Enhanced Stereo + Matching + + +
+ Unlike other vision tasks where Transformer-based approaches are becoming +increasingly common, stereo depth estimation is still dominated by +convolution-based approaches. This is mainly due to the limited availability of +real-world ground truth for stereo matching, which is a limiting factor in +improving the performance of Transformer-based stereo approaches. In this +paper, we propose UniTT-Stereo, a method to maximize the potential of +Transformer-based stereo architectures by unifying self-supervised learning +used for pre-training with stereo matching framework based on supervised +learning. To be specific, we explore the effectiveness of reconstructing +features of masked portions in an input image and at the same time predicting +corresponding points in another image from the perspective of locality +inductive bias, which is crucial in training models with limited training data. +Moreover, to address these challenging tasks of reconstruction-and-prediction, +we present a new strategy to vary a masking ratio when training the stereo +model with stereo-tailored losses. State-of-the-art performance of UniTT-Stereo +is validated on various benchmarks such as ETH3D, KITTI 2012, and KITTI 2015 +datasets. Lastly, to investigate the advantages of the proposed approach, we +provide a frequency analysis of feature maps and the analysis of locality +inductive bias based on attention maps. + +
+
+
+
+
+ + ☆ StyleTokenizer: Defining Image Style by a Single Instance for + Controlling Diffusion Models ECCV2024 + + +
+ Despite the burst of innovative methods for controlling the diffusion +process, effectively controlling image styles in text-to-image generation +remains a challenging task. Many adapter-based methods impose image +representation conditions on the denoising process to accomplish image control. +However these conditions are not aligned with the word embedding space, leading +to interference between image and text control conditions and the potential +loss of semantic information from the text prompt. Addressing this issue +involves two key challenges. Firstly, how to inject the style representation +without compromising the effectiveness of text representation in control. +Secondly, how to obtain the accurate style representation from a single +reference image. To tackle these challenges, we introduce StyleTokenizer, a +zero-shot style control image generation method that aligns style +representation with text representation using a style tokenizer. This alignment +effectively minimizes the impact on the effectiveness of text prompts. +Furthermore, we collect a well-labeled style dataset named Style30k to train a +style feature extractor capable of accurately representing style while +excluding other content information. Experimental results demonstrate that our +method fully grasps the style characteristics of the reference image, +generating appealing images that are consistent with both the target image +style and text prompt. The code and dataset are available at +https://github.com/alipay/style-tokenizer. + +
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ☆ Sample what you can't compress + +
+ For learned image representations, basic autoencoders often produce blurry +results. Reconstruction quality can be improved by incorporating additional +penalties such as adversarial (GAN) and perceptual losses. Arguably, these +approaches lack a principled interpretation. Concurrently, in generative +settings diffusion has demonstrated a remarkable ability to create crisp, high +quality results and has solid theoretical underpinnings (from variational +inference to direct study as the Fisher Divergence). Our work combines +autoencoder representation learning with diffusion and is, to our knowledge, +the first to demonstrate the efficacy of jointly learning a continuous encoder +and decoder under a diffusion-based loss. We demonstrate that this approach +yields better reconstruction quality as compared to GAN-based autoencoders +while being easier to tune. We also show that the resulting representation is +easier to model with a latent diffusion model as compared to the representation +obtained from a state-of-the-art GAN-based loss. Since our decoder is +stochastic, it can generate details not encoded in the otherwise deterministic +latent representation; we therefore name our approach "Sample what you can't +compress", or SWYCC for short. + +
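+ As a rough illustration of what a diffusion-based reconstruction loss for a jointly trained continuous encoder/decoder pair can look like (the noise schedule, the epsilon-prediction parameterization, and the conditioning interface below are assumptions, not the paper's recipe):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def diffusion_recon_loss(encoder: nn.Module, denoiser: nn.Module,
                         x: torch.Tensor, T: int = 1000) -> torch.Tensor:
    """Joint training signal: the decoder is a conditional denoiser that must
    recover x from a noised copy plus the (continuous) latent code."""
    z = encoder(x)                                         # latent code
    t = torch.randint(0, T, (x.shape[0],), device=x.device)
    # Illustrative linear alpha-bar schedule; real systems use tuned schedules.
    alpha_bar = (1.0 - (t.float() + 1) / T).clamp(min=1e-3).view(-1, 1, 1, 1)
    eps = torch.randn_like(x)
    x_t = alpha_bar.sqrt() * x + (1 - alpha_bar).sqrt() * eps
    eps_hat = denoiser(x_t, t, z)                          # conditioned on z
    return F.mse_loss(eps_hat, eps)
```

+ At inference, the decoder would run a reverse-diffusion pass conditioned on the latent, which is where the stochastic synthesis of details not encoded in the latent comes from.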
+
+
+
+
+ + ☆ SG-MIM: Structured Knowledge Guided Efficient Pre-training for Dense + Prediction + + +
+ Masked Image Modeling (MIM) techniques have redefined the landscape of +computer vision, enabling pre-trained models to achieve exceptional performance +across a broad spectrum of tasks. Despite their success, the full potential of +MIM-based methods in dense prediction tasks, particularly in depth estimation, +remains untapped. Existing MIM approaches primarily rely on single-image +inputs, which makes it challenging to capture the crucial structured +information, leading to suboptimal performance in tasks requiring fine-grained +feature representation. To address these limitations, we propose SG-MIM, a +novel Structured knowledge Guided Masked Image Modeling framework designed to +enhance dense prediction tasks by utilizing structured knowledge alongside +images. SG-MIM employs a lightweight relational guidance framework, allowing it +to guide structured knowledge individually at the feature level rather than +naively combining at the pixel level within the same architecture, as is common +in traditional multi-modal pre-training methods. This approach enables the +model to efficiently capture essential information while minimizing +discrepancies between pre-training and downstream tasks. Furthermore, SG-MIM +employs a selective masking strategy to incorporate structured knowledge, +maximizing the synergy between general representation learning and structured +knowledge-specific learning. Our method requires no additional annotations, +making it a versatile and efficient solution for a wide range of applications. +Our evaluations on the KITTI, NYU-v2, and ADE20k datasets demonstrate SG-MIM's +superiority in monocular depth estimation and semantic segmentation. + +
+
+
+
+
+ + ☆ TLD: A Vehicle Tail Light signal Dataset and Benchmark + + +
+ Understanding other drivers' intentions is crucial for safe driving. The role +of taillights in conveying these intentions is underemphasized in current +autonomous driving systems. Accurately identifying taillight signals is +essential for predicting vehicle behavior and preventing collisions. +Open-source taillight datasets are scarce, often small and inconsistently +annotated. To address this gap, we introduce a new large-scale taillight +dataset called TLD. Sourced globally, our dataset covers diverse traffic +scenarios. To our knowledge, TLD is the first dataset to separately annotate +brake lights and turn signals in real driving scenarios. We collected 17.78 +hours of driving videos from the internet. This dataset consists of 152k +labeled image frames sampled at a rate of 2 Hz, along with 1.5 million +unlabeled frames interspersed throughout. Additionally, we have developed a +two-stage vehicle light detection model consisting of two primary modules: a +vehicle detector and a taillight classifier. Initially, YOLOv10 and DeepSORT +captured consecutive vehicle images over time. Subsequently, the two +classifiers work simultaneously to determine the states of the brake lights and +turn signals. A post-processing procedure is then used to eliminate noise +caused by misidentifications and provide the taillight states of the vehicle +within a given time frame. Our method shows exceptional performance on our +dataset, establishing a benchmark for vehicle taillight detection. The dataset +is available at https://huggingface.co/datasets/ChaiJohn/TLD/tree/main + +
+
+
+
+
+ + ☆ A Learnable Color Correction Matrix for RAW Reconstruction BMVC2024 + + +
+ Autonomous driving algorithms usually employ sRGB images as model input due +to their compatibility with the human visual system. However, visually pleasing +sRGB images are possibly sub-optimal for downstream tasks when compared to RAW +images. The availability of RAW images is constrained by the difficulties in +collecting real-world driving data and the associated challenges of annotation. +To address this limitation and support research in RAW-domain driving +perception, we design a novel and ultra-lightweight RAW reconstruction method. +The proposed model introduces a learnable color correction matrix (CCM), which +uses only a single convolutional layer to approximate the complex inverse image +signal processor (ISP). Experimental results demonstrate that simulated RAW +(simRAW) images generated by our method provide performance improvements +equivalent to those produced by more complex inverse ISP methods when +pretraining RAW-domain object detectors, which highlights the effectiveness and +practicality of our approach. + +
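+ Concretely, a color correction matrix implemented as a single convolutional layer can be as small as a 1x1 convolution over the three color channels; the sketch below is one reading of that description (the kernel size, the bias, and the sRGB-to-simRAW direction are assumptions, not the paper's exact configuration).

```python
import torch
import torch.nn as nn

class LearnableCCM(nn.Module):
    """A single 1x1 convolution acting as a learnable 3x3 color matrix
    (plus bias), mapping sRGB images toward a simulated RAW-like space."""
    def __init__(self):
        super().__init__()
        self.ccm = nn.Conv2d(3, 3, kernel_size=1, bias=True)

    def forward(self, srgb: torch.Tensor) -> torch.Tensor:
        return self.ccm(srgb)  # per-pixel linear color transform

# simraw = LearnableCCM()(srgb_batch)  # srgb_batch: (N, 3, H, W) in [0, 1]
```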
+
+ comment: Accepted by BMVC2024 +
+
+
+
+
+ + ☆ Plane2Depth: Hierarchical Adaptive Plane Guidance for Monocular Depth + Estimation + + +
+ Monocular depth estimation aims to infer a dense depth map from a single +image, which is a fundamental and prevalent task in computer vision. Many +previous works have shown impressive depth estimation results through carefully +designed network structures, but they usually ignore planar information and +therefore perform poorly in low-texture areas of indoor scenes. In this paper, +we propose Plane2Depth, which adaptively utilizes plane information to improve +depth prediction within a hierarchical framework. Specifically, in the proposed +plane guided depth generator (PGDG), we design a set of plane queries as +prototypes to softly model planes in the scene and predict per-pixel plane +coefficients. The predicted plane coefficients can then be converted into +metric depth values with the pinhole camera model. In the proposed adaptive +plane query aggregation (APGA) module, we introduce a novel feature interaction +approach to improve the aggregation of multi-scale plane features in a top-down +manner. Extensive experiments show that our method achieves outstanding +performance, especially in low-texture or repetitive areas. Furthermore, under +the same backbone network, our method outperforms the state-of-the-art methods +on the NYU-Depth-v2 dataset, achieves competitive results with state-of-the-art +methods on the KITTI dataset, and generalizes to unseen scenes effectively. + +
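+ The plane-to-depth conversion mentioned above follows standard pinhole geometry; under one common parameterization (textbook geometry, not necessarily the paper's exact notation), a pixel with homogeneous coordinates $\tilde{\mathbf{p}}$ lying on a plane $\mathbf{n}^{\top}X = d$ observed with intrinsics $K$ has depth $z = d / (\mathbf{n}^{\top} K^{-1} \tilde{\mathbf{p}})$, so a network predicting per-pixel plane coefficients $(\mathbf{n}, d)$ (or $\mathbf{q} = \mathbf{n}/d$) yields metric depth directly once $K$ is known.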
+
+ comment: 14 pages, 12 figures, 8 tables +
+
+
+
+
+ + ☆ Reliable Deep Diffusion Tensor Estimation: Rethinking the Power of + Data-Driven Optimization Routine + + +
+ Diffusion tensor imaging (DTI) holds significant importance in clinical +diagnosis and neuroscience research. However, conventional model-based fitting +methods often suffer from sensitivity to noise, leading to decreased accuracy +in estimating DTI parameters. While traditional data-driven deep learning +methods have shown potential in terms of accuracy and efficiency, their limited +generalization to out-of-training-distribution data impedes their broader +application due to the diverse scan protocols used across centers, scanners, +and studies. This work aims to tackle these challenges and promote the use of +DTI by introducing a data-driven optimization-based method termed DoDTI. DoDTI +combines the weighted linear least squares fitting algorithm with the +regularization-by-denoising technique. The former fits DW images from diverse +acquisition settings into a diffusion tensor field, while the latter applies a +deep learning-based denoiser to regularize the diffusion tensor field instead +of the DW images, which frees the network from the limitation of fixed-channel +assignment. The optimization objective is solved using the alternating +direction method of multipliers and then unrolled to construct a deep neural +network, leveraging a data-driven strategy to learn the network parameters. +Extensive validation experiments are conducted utilizing both internally +simulated datasets and externally obtained in-vivo datasets. The results, +encompassing both qualitative and quantitative analyses, show that the proposed +method attains state-of-the-art performance in DTI parameter estimation. +Notably, it demonstrates superior generalization, accuracy, and efficiency, +rendering it highly reliable for widespread application in the field. + +
+
+
+
+
+ + ☆ TP-GMOT: Tracking Generic Multiple Object by Textual Prompt with + Motion-Appearance Cost (MAC) SORT + + +
+ While Multi-Object Tracking (MOT) has made substantial advancements, it is +limited by heavy reliance on prior knowledge and limited to predefined +categories. In contrast, Generic Multiple Object Tracking (GMOT), tracking +multiple objects with similar appearance, requires less prior information about +the targets but faces challenges with variants like viewpoint, lighting, +occlusion, and resolution. Our contributions commence with the introduction of +the \textbf{\text{Refer-GMOT dataset}} a collection of videos, each accompanied +by fine-grained textual descriptions of their attributes. Subsequently, we +introduce a novel text prompt-based open-vocabulary GMOT framework, called +\textbf{\text{TP-GMOT}}, which can track never-seen object categories with zero +training examples. Within \text{TP-GMOT} framework, we introduce two novel +components: (i) {\textbf{\text{TP-OD}}, an object detection by a textual +prompt}, for accurately detecting unseen objects with specific characteristics. +(ii) Motion-Appearance Cost SORT \textbf{\text{MAC-SORT}}, a novel object +association approach that adeptly integrates motion and appearance-based +matching strategies to tackle the complex task of tracking multiple generic +objects with high similarity. Our contributions are benchmarked on the +\text{Refer-GMOT} dataset for GMOT task. Additionally, to assess the +generalizability of the proposed \text{TP-GMOT} framework and the effectiveness +of \text{MAC-SORT} tracker, we conduct ablation studies on the DanceTrack and +MOT20 datasets for the MOT task. Our dataset, code, and models will be publicly +available at: https://fsoft-aic.github.io/TP-GMOT + +
+
+
+
+
+ + ☆ Boosting Generalizability towards Zero-Shot Cross-Dataset Single-Image + Indoor Depth by Meta-Initialization IROS 2024 + + +
+ Indoor robots rely on depth to perform tasks like navigation or obstacle +detection, and single-image depth estimation is widely used to assist +perception. Most indoor single-image depth prediction work focuses less on +model generalizability to unseen datasets, which concerns in-the-wild +robustness for system deployment. This work leverages gradient-based +meta-learning to gain higher generalizability on zero-shot cross-dataset +inference. Unlike the most-studied meta-learning of image classification +associated with explicit class labels, no explicit task boundaries exist for +continuous depth values tied to highly varying indoor environments regarding +object arrangement and scene composition. We propose a fine-grained task +formulation that treats each RGB-D mini-batch as a task in our meta-learning +formulation. We first show that, on limited data, our method induces a much +better prior (up to 27.8% in RMSE). Then, finetuning on the meta-learned +initialization consistently outperforms baselines trained without the meta +approach. Aiming at generalization, we propose zero-shot cross-dataset +protocols and validate the higher generalizability induced by our +meta-initialization, which serves as a simple and useful plugin to many +existing depth estimation methods. This work at the intersection of depth +estimation and meta-learning potentially drives both fields a step closer to +practical robotic and machine perception usage. + +
+
+ comment: IROS 2024. The version supersedes 2305.07269. arXiv admin note: text + overlap with arXiv:2305.07269 +
+
+
+
+
+ + ☆ TASAR: Transferable Attack on Skeletal Action Recognition + + +
+ Skeletal sequences, as well-structured representations of human behaviors, +are crucial in Human Activity Recognition (HAR). The transferability of +adversarial skeletal sequences enables attacks in real-world HAR scenarios, +such as autonomous driving, intelligent surveillance, and human-computer +interactions. However, existing Skeleton-based HAR (S-HAR) attacks exhibit weak +adversarial transferability and, therefore, cannot be considered true +transfer-based S-HAR attacks. More importantly, the reason for this failure +remains unclear. In this paper, we study this phenomenon through the lens of +loss surface, and find that its sharpness contributes to the poor +transferability in S-HAR. Inspired by this observation, we assume and +empirically validate that smoothening the rugged loss landscape could +potentially improve adversarial transferability in S-HAR. To this end, we +propose the first Transfer-based Attack on Skeletal Action Recognition, TASAR. +TASAR explores the smoothed model posterior without re-training the pre-trained +surrogates, which is achieved by a new post-train Dual Bayesian optimization +strategy. Furthermore, unlike previous transfer-based attacks that treat each +frame independently and overlook temporal coherence within sequences, TASAR +incorporates motion dynamics into the Bayesian attack gradient, effectively +disrupting the spatial-temporal coherence of S-HARs. To exhaustively evaluate +the effectiveness of existing methods and our method, we build the first +large-scale robust S-HAR benchmark, comprising 7 S-HAR models, 10 attack +methods, 3 S-HAR datasets and 2 defense models. Extensive results demonstrate +the superiority of TASAR. Our benchmark enables easy comparisons for future +studies, with the code available in the supplementary material. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2407.08572 +
+
+
+
+
+ + ☆ Volumetric Surfaces: Representing Fuzzy Geometries with Multiple Meshes + + +
+ High-quality real-time view synthesis methods are based on volume rendering, +splatting, or surface rendering. While surface-based methods generally are the +fastest, they cannot faithfully model fuzzy geometry like hair. In turn, +alpha-blending techniques excel at representing fuzzy materials but require an +unbounded number of samples per ray (P1). Further overheads are induced by +empty space skipping in volume rendering (P2) and sorting input primitives in +splatting (P3). These problems are exacerbated on low-performance graphics +hardware, e.g. on mobile devices. We present a novel representation for +real-time view synthesis where the (P1) number of sampling locations is small +and bounded, (P2) sampling locations are efficiently found via rasterization, +and (P3) rendering is sorting-free. We achieve this by representing objects as +semi-transparent multi-layer meshes, rendered in fixed layer order from +outermost to innermost. We model mesh layers as SDF shells with optimal spacing +learned during training. After baking, we fit UV textures to the corresponding +meshes. We show that our method can represent challenging fuzzy objects while +achieving higher frame rates than volume-based and splatting-based methods on +low-end and mobile devices. + +
+
+
+
+
+ + ☆ FrameCorr: Adaptive, Autoencoder-based Neural Compression for Video + Reconstruction in Resource and Timing Constrained Network Settings + + +
+ Despite the growing adoption of video processing via Internet of Things (IoT) +devices due to their cost-effectiveness, transmitting captured data to nearby +servers poses challenges due to varying timing constraints and scarcity of +network bandwidth. Existing video compression methods face difficulties in +recovering compressed data when incomplete data is provided. Here, we introduce +FrameCorr, a deep-learning-based solution that utilizes previously received +data to predict the missing segments of a frame, enabling the reconstruction of +a frame from partially received data. + +
+
+
+
+
+ + ☆ Detecting Korean Food Using Image using Hierarchical Model + + +
+ A solution is presented for Korean food lovers with dietary restrictions to +identify a Korean dish before consuming it. Simply by uploading a clear photo +of the dish, users can find out what they are eating. Image processing +techniques combined with machine learning were used to build this solution. + +
+
+
+
+
+ + ☆ Non-target Divergence Hypothesis: Toward Understanding Domain Gaps in + Cross-Modal Knowledge Distillation + + +
+ Compared to single-modal knowledge distillation, cross-modal knowledge +distillation faces more severe challenges due to domain gaps between +modalities. Although various methods have proposed various solutions to +overcome these challenges, there is still limited research on how domain gaps +affect cross-modal knowledge distillation. This paper provides an in-depth +analysis and evaluation of this issue. We first introduce the Non-Target +Divergence Hypothesis (NTDH) to reveal the impact of domain gaps on cross-modal +knowledge distillation. Our key finding is that domain gaps between modalities +lead to distribution differences in non-target classes, and the smaller these +differences, the better the performance of cross-modal knowledge distillation. +Subsequently, based on Vapnik-Chervonenkis (VC) theory, we derive the upper and +lower bounds of the approximation error for cross-modal knowledge distillation, +thereby theoretically validating the NTDH. Finally, experiments on five +cross-modal datasets further confirm the validity, generalisability, and +applicability of the NTDH. + +
+
+
+
+
+ + ☆ Training-free Color-Style Disentanglement for Constrained Text-to-Image + Synthesis + + +
+ We consider the problem of independently, in a disentangled fashion, +controlling the outputs of text-to-image diffusion models with color and style +attributes of a user-supplied reference image. We present the first +training-free, test-time-only method to disentangle and condition text-to-image +models on color and style attributes from reference image. To realize this, we +propose two key innovations. Our first contribution is to transform the latent +codes at inference time using feature transformations that make the covariance +matrix of current generation follow that of the reference image, helping +meaningfully transfer color. Next, we observe that there exists a natural +disentanglement between color and style in the LAB image space, which we +exploit to transform the self-attention feature maps of the image being +generated with respect to those of the reference computed from its L channel. +Both these operations happen purely at test time and can be done independently +or merged. This results in a flexible method where color and style information +can come from the same reference image or two different sources, and a new +generation can seamlessly fuse them in either scenario. + +
+
+ comment: 16 pages, 17 figures +
+
+
+
+
+ + ☆ Diffusion Models Learn Low-Dimensional Distributions via Subspace + Clustering + + +
+ Recent empirical studies have demonstrated that diffusion models can +effectively learn the image distribution and generate new samples. Remarkably, +these models can achieve this even with a small number of training samples +despite a large image dimension, circumventing the curse of dimensionality. In +this work, we provide theoretical insights into this phenomenon by leveraging +key empirical observations: (i) the low intrinsic dimensionality of image data, +(ii) a union of manifold structure of image data, and (iii) the low-rank +property of the denoising autoencoder in trained diffusion models. These +observations motivate us to assume the underlying data distribution of image +data as a mixture of low-rank Gaussians and to parameterize the denoising +autoencoder as a low-rank model according to the score function of the assumed +distribution. With these setups, we rigorously show that optimizing the +training loss of diffusion models is equivalent to solving the canonical +subspace clustering problem over the training samples. Based on this +equivalence, we further show that the minimal number of samples required to +learn the underlying distribution scales linearly with the intrinsic dimensions +under the above data and model assumptions. This insight sheds light on why +diffusion models can break the curse of dimensionality and exhibit the phase +transition in learning distributions. Moreover, we empirically establish a +correspondence between the subspaces and the semantic representations of image +data, facilitating image editing. We validate these results with corroborated +experimental results on both simulated distributions and image datasets. + +
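+ For reference, the score of the assumed mixture-of-low-rank-Gaussians model has a closed form (standard Gaussian-mixture algebra, not the paper's notation): for $p(\mathbf{x}) = \sum_k \pi_k \mathcal{N}(\mathbf{x}; \boldsymbol{\mu}_k, \Sigma_k)$ with low-rank covariances $\Sigma_k = U_k U_k^{\top} + \sigma^2 I$, the score is $\nabla_{\mathbf{x}} \log p(\mathbf{x}) = \sum_k w_k(\mathbf{x})\, \Sigma_k^{-1} (\boldsymbol{\mu}_k - \mathbf{x})$ with responsibilities $w_k(\mathbf{x}) \propto \pi_k \mathcal{N}(\mathbf{x}; \boldsymbol{\mu}_k, \Sigma_k)$, which is why a low-rank parameterization of the denoising autoencoder is a natural fit for this data model.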
+
+ comment: 39 pages, 9 figures +
+
+
+
+
+ + ☆ MOSMOS: Multi-organ segmentation facilitated by medical report + supervision + + +
+ Owing to a large amount of multi-modal data in modern medical systems, such +as medical images and reports, Medical Vision-Language Pre-training (Med-VLP) +has demonstrated incredible achievements in coarse-grained downstream tasks +(i.e., medical classification, retrieval, and visual question answering). +However, the problem of transferring knowledge learned from Med-VLP to +fine-grained multi-organ segmentation tasks has barely been investigated. +Multi-organ segmentation is challenging mainly due to the lack of large-scale +fully annotated datasets and the wide variation in the shape and size of the +same organ between individuals with different diseases. In this paper, we +propose a novel pre-training & fine-tuning framework for Multi-Organ +Segmentation by harnessing Medical repOrt Supervision (MOSMOS). Specifically, +we first introduce global contrastive learning to maximally align the medical +image-report pairs in the pre-training stage. To remedy the granularity +discrepancy, we further leverage multi-label recognition to implicitly learn +the semantic correspondence between image pixels and organ tags. More +importantly, our pre-trained models can be transferred to any segmentation +model by introducing the pixel-tag attention maps. Different network settings, +i.e., 2D U-Net and 3D UNETR, are utilized to validate the generalization. We +have extensively evaluated our approach using different diseases and modalities +on BTCV, AMOS, MMWHS, and BRATS datasets. Experimental results in various +settings demonstrate the effectiveness of our framework. This framework can +serve as the foundation to facilitate future research on automatic annotation +tasks under the supervision of medical reports. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Local map Construction Methods with SD map: A Novel Survey + + +
+ In recent years, significant academic advancements have been made in the +field of autonomous vehicles, with Local maps emerging as a crucial component +of autonomous driving technology. Local maps not only provide intricate details +of road networks but also serve as fundamental inputs for critical tasks such +as vehicle localization, navigation, and decision-making. Given the +characteristics of SD map (Standard Definition Map), which include low cost, +ease of acquisition, and high versatility, perception methods that integrate SD +map as prior information have demonstrated significant potential in the field +of Local map perception. The purpose of this paper is to provide researchers +with a comprehensive overview and summary of the latest advancements in the +integration of SD map as prior information for Local map perception methods. +This review begins by introducing the task definition and general pipeline of +local map perception methods that incorporate SD maps as prior information, +along with relevant public datasets. And then it focuses on the representation +and encoding methods of multi-source information, as well as the methods for +fusing multi-source information. In response to this burgeoning trend, this +article presents a comprehensive and meticulous overview of the diverse +research efforts in this particular field. Finally, the article addresses +pertinent issues and future challenges with the aim of guiding researchers in +understanding the current trends and methodologies prevalent in the field. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ☆ Hadamard Row-Wise Generation Algorithm + + +
+ In this paper, we introduce an efficient algorithm for generating specific +Hadamard rows, addressing the memory demands of pre-computing the entire +matrix. Leveraging Sylvester's recursive construction, our method generates the +required $i$-th row on demand, significantly reducing computational resources. +The algorithm uses the Kronecker product to construct the desired row from the +binary representation of the index, without creating the full matrix. This +approach is particularly useful for single-pixel imaging systems that need only +one row at a time. + +
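+ The row-on-demand construction described above follows directly from the Sylvester recursion (row $i$ of $H_{2^n}$ is the Kronecker product of $[1,1]$ or $[1,-1]$ factors selected by the bits of $i$); the short sketch below is an independent illustration of that fact, not the authors' implementation, and the function name is made up.

```python
import numpy as np

def hadamard_row(i: int, n: int) -> np.ndarray:
    """Row i of the 2**n x 2**n Sylvester-Hadamard matrix, built on demand
    without materializing the full matrix."""
    row = np.array([1], dtype=np.int8)
    for k in reversed(range(n)):              # bits of i, most significant first
        factor = np.array([1, -1] if (i >> k) & 1 else [1, 1], dtype=np.int8)
        row = np.kron(row, factor)
    return row

if __name__ == "__main__":
    # Sanity check against the full construction for a small order.
    from scipy.linalg import hadamard
    n = 4
    H = hadamard(2 ** n)
    assert all(np.array_equal(hadamard_row(i, n), H[i]) for i in range(2 ** n))
```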
+
+
+
+
+ + ☆ Neural Dynamics Model of Visual Decision-Making: Learning from Human + Experts + + +
+ Uncovering the fundamental neural correlates of biological intelligence, +developing mathematical models, and conducting computational simulations are +critical for advancing new paradigms in artificial intelligence (AI). In this +study, we implemented a comprehensive visual decision-making model that spans +from visual input to behavioral output, using a neural dynamics modeling +approach. Drawing inspiration from the key components of the dorsal visual +pathway in primates, our model not only aligns closely with human behavior and +reflects neural activities in primates, but also achieves accuracy comparable +to convolutional neural networks (CNNs). Moreover, magnetic resonance imaging +(MRI) identified key neuroimaging features, such as structural connections and +functional connectivity, that are associated with performance in perceptual +decision-making tasks. A neuroimaging-informed fine-tuning approach was +introduced and applied to the model, leading to performance improvements that +paralleled the behavioral variations observed among subjects. Compared to +classical deep learning models, our model more accurately replicates the +behavioral performance of biological intelligence, relies on the structural +characteristics of biological neural networks rather than extensive training +data, and demonstrates enhanced resilience to perturbation. + +
+
+
+
+
+ + ☆ Multi-modal Situated Reasoning in 3D Scenes + + +
+ Situation awareness is essential for understanding and reasoning about 3D +scenes in embodied AI agents. However, existing datasets and benchmarks for +situated understanding are limited in data modality, diversity, scale, and task +scope. To address these limitations, we propose Multi-modal Situated Question +Answering (MSQA), a large-scale multi-modal situated reasoning dataset, +scalably collected leveraging 3D scene graphs and vision-language models (VLMs) +across a diverse range of real-world 3D scenes. MSQA includes 251K situated +question-answering pairs across 9 distinct question categories, covering +complex scenarios within 3D scenes. We introduce a novel interleaved +multi-modal input setting in our benchmark to provide text, image, and point +cloud for situation and question description, resolving ambiguity in previous +single-modality convention (e.g., text). Additionally, we devise the +Multi-modal Situated Next-step Navigation (MSNN) benchmark to evaluate models' +situated reasoning for navigation. Comprehensive evaluations on MSQA and MSNN +highlight the limitations of existing vision-language models and underscore the +importance of handling multi-modal interleaved inputs and situation modeling. +Experiments on data scaling and cross-domain transfer further demonstrate the +efficacy of leveraging MSQA as a pre-training dataset for developing more +powerful situated reasoning models. + +
+
+ comment: Project page: https://msr3d.github.io/ +
+
+
+
+
+ + ☆ Unified Framework with Consistency across Modalities for Human Activity + Recognition BMVC 2024 + + +
+ Recognizing human activities in videos is challenging due to the +spatio-temporal complexity and context-dependence of human interactions. Prior +studies often rely on single input modalities, such as RGB or skeletal data, +limiting their ability to exploit the complementary advantages across +modalities. Recent studies focus on combining these two modalities using simple +feature fusion techniques. However, due to the inherent disparities in +representation between these input modalities, designing a unified neural +network architecture to effectively leverage their complementary information +remains a significant challenge. To address this, we propose a comprehensive +multimodal framework for robust video-based human activity recognition. Our key +contribution is the introduction of a novel compositional query machine, called +COMPUTER ($\textbf{COMP}ositional h\textbf{U}man-cen\textbf{T}ric +qu\textbf{ER}y$ machine), a generic neural architecture that models the +interactions between a human of interest and its surroundings in both space and +time. Thanks to its versatile design, COMPUTER can be leveraged to distill +distinctive representations for various input modalities. Additionally, we +introduce a consistency loss that enforces agreement in prediction between +modalities, exploiting the complementary information from multimodal inputs for +robust human movement recognition. Through extensive experiments on action +localization and group activity recognition tasks, our approach demonstrates +superior performance when compared with state-of-the-art methods. Our code is +available at: https://github.com/tranxuantuyen/COMPUTER. + +
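+ One plausible form of a cross-modal consistency loss of the kind described
+above is a symmetric KL divergence between the softened predictions of two
+modality branches; the sketch below (PyTorch) is illustrative and not the
+paper's actual implementation.
+
+    import torch
+    import torch.nn.functional as F
+
+    def consistency_loss(logits_rgb, logits_skel, tau: float = 1.0):
+        """Symmetric KL divergence encouraging the two modality heads to agree."""
+        p = F.log_softmax(logits_rgb / tau, dim=-1)
+        q = F.log_softmax(logits_skel / tau, dim=-1)
+        kl_pq = F.kl_div(q, p.exp(), reduction="batchmean")  # KL(p || q)
+        kl_qp = F.kl_div(p, q.exp(), reduction="batchmean")  # KL(q || p)
+        return 0.5 * (kl_pq + kl_qp)
+
+    loss = consistency_loss(torch.randn(8, 60), torch.randn(8, 60))  # batch of 8, 60 classes
+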
+
+ comment: Accepted to BMVC 2024 +
+
+
+
+
+ + ☆ GGS: Generalizable Gaussian Splatting for Lane Switching in Autonomous + Driving + + +
+ We propose GGS, a Generalizable Gaussian Splatting method for Autonomous
+Driving, which can achieve realistic rendering under large viewpoint changes.
+Previous generalizable 3D Gaussian splatting methods are limited to rendering
+novel views that are very close to the original pair of images, and cannot
+handle large differences in viewpoint. Especially in autonomous driving
+scenarios, images are typically collected from a single lane. The limited
+training perspective makes rendering images of a different lane very
+challenging. To further improve the rendering capability of GGS under large
+viewpoint changes, we introduce a novel virtual lane generation module into
+the GGS method to enable high-quality lane switching even without a
+multi-lane dataset. In addition, we design a diffusion loss to supervise the
+generation of virtual lane images, further addressing the lack of data for
+the virtual lanes. Finally, we also propose a depth refinement module to
+optimize depth estimation in the GGS model. Extensive validation of our
+method against existing approaches demonstrates state-of-the-art performance.
+
+
+
+
+
+
+ + ☆ Coral Model Generation from Single Images for Virtual Reality + Applications + + +
+ With the rapid development of VR technology, the demand for high-quality 3D
+models is increasing. Traditional methods struggle with efficiency and
+quality in large-scale customization. This paper introduces a deep-learning
+framework that generates high-precision 3D coral models from a single image.
+Using the Coral dataset, the framework extracts geometric and texture
+features, performs 3D reconstruction, and optimizes design and material
+blending. Advanced optimization and polygon count control ensure shape
+accuracy, detail retention, and flexible output for various complexities,
+catering to high-quality rendering and real-time interaction needs. The
+project incorporates Explainable AI (XAI) to transform AI-generated models
+into interactive "artworks," best viewed in VR and XR. This enhances model
+interpretability and human-machine collaboration. Real-time feedback in VR
+interactions displays information such as coral species and habitat,
+enriching the user experience. The generated models surpass traditional
+methods in detail, visual quality, and efficiency. This research offers an
+intelligent approach to 3D content creation for VR, lowering production
+barriers and promoting widespread VR applications. Additionally, integrating
+XAI provides new insights into AI-generated visual content and advances
+research in 3D vision interpretability.
+
+
+
+ comment: In Proceedings of Explainable AI for the Arts Workshop 2024 (XAIxArts + 2024) arXiv:2406.14485 +
+
+
+
+
+ + ☆ Exploring Low-Dimensional Subspaces in Diffusion Models for Controllable + Image Editing + + +
+ Recently, diffusion models have emerged as a powerful class of generative
+models. Despite their success, there is still limited understanding of their
+semantic spaces. This makes it challenging to achieve precise and
+disentangled image generation without additional training, especially in an
+unsupervised way. In this work, we improve the understanding of their
+semantic spaces from intriguing observations: within a certain range of noise
+levels, (1) the learned posterior mean predictor (PMP) in the diffusion model
+is locally linear, and (2) the singular vectors of its Jacobian lie in
+low-dimensional semantic subspaces. We provide a solid theoretical basis to
+justify the linearity and low-rankness of the PMP. These insights allow us to
+propose an unsupervised, single-step, training-free LOw-rank COntrollable
+image editing (LOCO Edit) method for precise local editing in diffusion
+models. LOCO Edit identifies editing directions with desirable properties:
+homogeneity, transferability, composability, and linearity. These properties
+of LOCO Edit benefit greatly from the low-dimensional semantic subspace. Our
+method can further be extended to unsupervised or text-supervised editing in
+various text-to-image diffusion models (T-LOCO Edit). Finally, extensive
+empirical experiments demonstrate the effectiveness and efficiency of LOCO
+Edit. The code will be released at https://github.com/ChicyChen/LOCO-Edit.
+
+
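+ As a rough illustration of the idea of editing along the singular vectors of
+the predictor's Jacobian, the sketch below uses a toy network as a stand-in
+for the posterior mean predictor; the network, step size, and dimensions are
+hypothetical and this is not the authors' implementation.
+
+    import torch
+
+    # Toy stand-in for the posterior mean predictor x0_hat = f(x_t) at one noise level.
+    pmp = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.Tanh(),
+                              torch.nn.Linear(32, 16))
+
+    x_t = torch.randn(16)
+    J = torch.autograd.functional.jacobian(pmp, x_t)  # local linear map of f at x_t
+    U, S, Vh = torch.linalg.svd(J)                    # right singular vectors span candidate edit directions
+    direction = Vh[0]                                 # top direction in the input space
+    edited = x_t + 0.5 * direction                    # a single-step, training-free edit of x_t
+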
+
+
+
+
+ + ☆ Unfolding Videos Dynamics via Taylor Expansion + + +
+ Taking inspiration from physical motion, we present a new self-supervised +dynamics learning strategy for videos: Video Time-Differentiation for Instance +Discrimination (ViDiDi). ViDiDi is a simple and data-efficient strategy, +readily applicable to existing self-supervised video representation learning +frameworks based on instance discrimination. At its core, ViDiDi observes +different aspects of a video through various orders of temporal derivatives of +its frame sequence. These derivatives, along with the original frames, support +the Taylor series expansion of the underlying continuous dynamics at discrete +times, where higher-order derivatives emphasize higher-order motion features. +ViDiDi learns a single neural network that encodes a video and its temporal +derivatives into consistent embeddings following a balanced alternating +learning algorithm. By learning consistent representations for original frames +and derivatives, the encoder is steered to emphasize motion features over +static backgrounds and uncover the hidden dynamics in original frames. Hence, +video representations are better separated by dynamic features. We integrate +ViDiDi into existing instance discrimination frameworks (VICReg, BYOL, and +SimCLR) for pretraining on UCF101 or Kinetics and test on standard benchmarks +including video retrieval, action recognition, and action detection. The +performances are enhanced by a significant margin without the need for large +models or extensive datasets. + +
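+ A minimal sketch of the core idea of pairing a clip with its temporal
+derivatives, assuming PyTorch and simple first-order finite differences; the
+function name and shapes are illustrative, not the authors' code.
+
+    import torch
+
+    def temporal_derivatives(frames: torch.Tensor, order: int = 2) -> list:
+        """Return [frames, 1st-order diff, ..., order-th diff] along the time axis (dim 0)."""
+        views = [frames]
+        current = frames
+        for _ in range(order):
+            current = current[1:] - current[:-1]  # finite difference approximates d/dt
+            views.append(current)
+        return views
+
+    video = torch.randn(16, 3, 224, 224)   # T x C x H x W clip
+    views = temporal_derivatives(video)    # each view would feed the same encoder for consistent embeddings
+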
+
+
+
+
+ + ☆ Pluralistic Salient Object Detection + + +
+ We introduce pluralistic salient object detection (PSOD), a novel task aimed
+at generating multiple plausible salient segmentation results for a given
+input image. Unlike conventional SOD methods that produce a single
+segmentation mask for salient objects, this new setting recognizes the
+inherent complexity of real-world images, which comprise multiple objects,
+and the ambiguity in defining salient objects due to different user
+intentions. To study this task, we present two new SOD datasets, "DUTS-MM"
+and "DUTS-MQ", along with newly designed evaluation metrics. DUTS-MM builds
+upon the DUTS dataset but enriches the ground-truth mask annotations in three
+aspects: 1) improving the mask quality, especially for boundaries and
+fine-grained structures; 2) alleviating the annotation inconsistency issue;
+and 3) providing multiple ground-truth masks for images with saliency
+ambiguity. DUTS-MQ consists of approximately 100K image-mask pairs with
+human-annotated preference scores, enabling the learning of real human
+preferences in measuring mask quality. Building upon these two datasets, we
+propose a simple yet effective pluralistic SOD baseline based on a
+Mixture-of-Experts (MOE) design. Equipped with two prediction heads, it
+simultaneously predicts multiple masks using different query prompts and
+predicts human preference scores for each mask candidate. Extensive
+experiments and analyses underscore the significance of our proposed datasets
+and affirm the effectiveness of our PSOD framework.
+
+
+
+
+
+
+ + ☆ Developing, Analyzing, and Evaluating Self-Drive Algorithms Using + Drive-by-Wire Electric Vehicles + + +
+ Reliable lane-following algorithms are essential for safe and effective +autonomous driving. This project was primarily focused on developing and +evaluating different lane-following programs to find the most reliable +algorithm for a Vehicle to Everything (V2X) project. The algorithms were first +tested on a simulator and then with real vehicles equipped with a drive-by-wire +system using ROS (Robot Operating System). Their performance was assessed +through reliability, comfort, speed, and adaptability metrics. The results show +that the two most reliable approaches detect both lane lines and use +unsupervised learning to separate them. These approaches proved to be robust in +various driving scenarios, making them suitable candidates for integration into +the V2X project. + +
+
+ comment: Supported by the National Science Foundation under Grants No. 2150292 + and 2150096 +
+
+
+
+
+ + ☆ MSTT-199: MRI Dataset for Musculoskeletal Soft Tissue Tumor Segmentation + + +
+ Accurate musculoskeletal soft tissue tumor segmentation is vital for +assessing tumor size, location, diagnosis, and response to treatment, thereby +influencing patient outcomes. However, segmentation of these tumors requires +clinical expertise, and an automated segmentation model would save valuable +time for both clinician and patient. Training an automatic model requires a +large dataset of annotated images. In this work, we describe the collection of +an MR imaging dataset of 199 musculoskeletal soft tissue tumors from 199 +patients. We trained segmentation models on this dataset and then benchmarked +them on a publicly available dataset. Our model achieved the state-of-the-art +dice score of 0.79 out of the box without any fine tuning, which shows the +diversity and utility of our curated dataset. We analyzed the model predictions +and found that its performance suffered on fibrous and vascular tumors due to +their diverse anatomical location, size, and intensity heterogeneity. The code +and models are available in the following github repository, +https://github.com/Reasat/mstt + +
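+ For reference, the Dice score quoted above is the standard overlap measure
+2|A∩B| / (|A| + |B|); a minimal NumPy sketch (not the authors' evaluation
+code):
+
+    import numpy as np
+
+    def dice_score(pred: np.ndarray, target: np.ndarray, eps: float = 1e-7) -> float:
+        """Dice coefficient between two binary segmentation masks."""
+        pred, target = pred.astype(bool), target.astype(bool)
+        return float((2.0 * np.logical_and(pred, target).sum() + eps)
+                     / (pred.sum() + target.sum() + eps))
+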
+
+ comment: Dataset will be made publicly available after the acceptance of the + paper +
+
+
+
+
+ + ☆ Spatial Diffusion for Cell Layout Generation MICCAI 2024 + + +
+ Generative models, such as GANs and diffusion models, have been used to
+augment training sets and boost performance on different tasks. We focus on
+generative models for cell detection instead, i.e., locating and classifying
+cells in given pathology images. One important piece of information that has
+been largely overlooked is the spatial pattern of the cells. In this paper,
+we propose a spatial-pattern-guided generative model for cell layout
+generation. Specifically, we propose a novel diffusion model that is guided
+by spatial features and generates realistic cell layouts. We explore
+different density models as spatial features for the diffusion model. In
+downstream tasks, we show that the generated cell layouts can be used to
+guide the generation of high-quality pathology images. Augmenting with these
+images can significantly boost the performance of SOTA cell detection
+methods. The code is available at
+https://github.com/superlc1995/Diffusion-cell.
+
+
+
+ comment: 12 pages, 4 figures, accepted by MICCAI 2024 +
+
+
+
+
+ + ☆ Coupling AI and Citizen Science in Creation of Enhanced Training Dataset + for Medical Image Segmentation + + +
+ Recent advancements in medical imaging and artificial intelligence (AI) have +greatly enhanced diagnostic capabilities, but the development of effective deep +learning (DL) models is still constrained by the lack of high-quality annotated +datasets. The traditional manual annotation process by medical experts is time- +and resource-intensive, limiting the scalability of these datasets. In this +work, we introduce a robust and versatile framework that combines AI and +crowdsourcing to improve both the quality and quantity of medical image +datasets across different modalities. Our approach utilises a user-friendly +online platform that enables a diverse group of crowd annotators to label +medical images efficiently. By integrating the MedSAM segmentation AI with this +platform, we accelerate the annotation process while maintaining expert-level +quality through an algorithm that merges crowd-labelled images. Additionally, +we employ pix2pixGAN, a generative AI model, to expand the training dataset +with synthetic images that capture realistic morphological features. These +methods are combined into a cohesive framework designed to produce an enhanced +dataset, which can serve as a universal pre-processing pipeline to boost the +training of any medical deep learning segmentation model. Our results +demonstrate that this framework significantly improves model performance, +especially when training data is limited. + +
+
+
+
+
+ + ☆ MobileUNETR: A Lightweight End-To-End Hybrid Vision Transformer For + Efficient Medical Image Segmentation ECCV 2024 + + +
+ Skin cancer segmentation poses a significant challenge in medical image
+analysis. Numerous existing solutions, predominantly CNN-based, face issues
+related to a lack of global contextual understanding. Alternatively, some
+approaches resort to large-scale Transformer models to bridge the global
+contextual gaps, but at the expense of model size and computational
+complexity. Finally, many Transformer-based approaches rely primarily on
+CNN-based decoders, overlooking the benefits of Transformer-based decoding
+models. Recognizing these limitations, we address the need for efficient,
+lightweight solutions by introducing MobileUNETR, which aims to overcome the
+performance constraints associated with both CNNs and Transformers while
+minimizing model size, presenting a promising stride towards efficient image
+segmentation. MobileUNETR has 3 main features. 1) MobileUNETR comprises a
+lightweight hybrid CNN-Transformer encoder to help balance local and global
+contextual feature extraction in an efficient manner; 2) a novel hybrid
+decoder that simultaneously utilizes low-level and global features at
+different resolutions within the decoding stage for accurate mask generation;
+3) surpassing large and complex architectures, MobileUNETR achieves superior
+performance with 3 million parameters and a computational complexity of 1.3
+GFLOPs, resulting in 10x and 23x reductions in parameters and FLOPs,
+respectively. Extensive experiments have been conducted to validate the
+effectiveness of our proposed method on four publicly available skin lesion
+segmentation datasets: ISIC 2016, ISIC 2017, ISIC 2018, and PH2. The code
+will be publicly available at: https://github.com/OSUPCVLab/MobileUNETR.git
+
+
+
+ comment: Accepted at ECCV 2024 - BioImage Computing Workshop (Oral) +
+
+
+
+
+ + ☆ Incorporating dense metric depth into neural 3D representations for view + synthesis and relighting + + +
+ Synthesizing accurate geometry and photo-realistic appearance of small scenes +is an active area of research with compelling use cases in gaming, virtual +reality, robotic-manipulation, autonomous driving, convenient product capture, +and consumer-level photography. When applying scene geometry and appearance +estimation techniques to robotics, we found that the narrow cone of possible +viewpoints due to the limited range of robot motion and scene clutter caused +current estimation techniques to produce poor quality estimates or even fail. +On the other hand, in robotic applications, dense metric depth can often be +measured directly using stereo and illumination can be controlled. Depth can +provide a good initial estimate of the object geometry to improve +reconstruction, while multi-illumination images can facilitate relighting. In +this work we demonstrate a method to incorporate dense metric depth into the +training of neural 3D representations and address an artifact observed while +jointly refining geometry and appearance by disambiguating between texture and +geometry edges. We also discuss a multi-flash stereo camera system developed to +capture the necessary data for our pipeline and show results on relighting and +view synthesis with a few training views. + +
+
+ comment: Project webpage: https://stereomfc.github.io +
+
+
+
+
+ + ☆ Can Your Generative Model Detect Out-of-Distribution Covariate Shift? ECCV 2024 + + +
+ Detecting Out-of-Distribution (OOD) sensory data and covariate distribution
+shift aims to identify new test examples whose high-level image statistics
+differ from the captured, normal, In-Distribution (ID) set. The existing OOD
+detection literature largely focuses on semantic shift, with little-to-no
+consensus on covariate shift. Generative models capture the ID data in an
+unsupervised manner, enabling them to effectively identify samples that
+deviate significantly from this learned distribution, irrespective of the
+downstream task. In this work, we elucidate the ability of generative models
+to detect and quantify domain-specific covariate shift through extensive
+analyses that involve a variety of models. To this end, we conjecture that it
+is sufficient to detect the most commonly occurring sensory faults (anomalies
+and deviations in global signal statistics) by solely modeling high-frequency
+signal-dependent and independent details. We propose a novel method,
+CovariateFlow, for OOD detection, specifically tailored to covariate
+heteroscedastic high-frequency image components using conditional Normalizing
+Flows (cNFs). Our results on CIFAR10 vs. CIFAR10-C and ImageNet200 vs.
+ImageNet200-C demonstrate the effectiveness of the method by accurately
+detecting OOD covariate shift. This work contributes to enhancing the
+fidelity of imaging systems and aiding machine learning models in OOD
+detection in the presence of covariate shift.
+
+
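+ The method operates on high-frequency image components; below is a minimal
+sketch of one simple way to isolate such a component (subtracting a
+Gaussian-blurred copy), assuming NumPy and SciPy. The decomposition and sigma
+value are illustrative and not necessarily the paper's exact choice.
+
+    import numpy as np
+    from scipy.ndimage import gaussian_filter
+
+    def high_frequency_component(image: np.ndarray, sigma: float = 2.0) -> np.ndarray:
+        """Remove the low-frequency (blurred) part of an image and return the residual."""
+        low = gaussian_filter(image.astype(np.float32), sigma=sigma)
+        return image.astype(np.float32) - low  # residual carries noise/texture statistics
+
+    image = np.random.rand(64, 64)
+    hf = high_frequency_component(image)       # input on which an OOD score could be computed
+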
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ MDNF: Multi-Diffusion-Nets for Neural Fields on Meshes + + +
+ We propose a novel framework for representing neural fields on triangle +meshes that is multi-resolution across both spatial and frequency domains. +Inspired by the Neural Fourier Filter Bank (NFFB), our architecture decomposes +the spatial and frequency domains by associating finer spatial resolution +levels with higher frequency bands, while coarser resolutions are mapped to +lower frequencies. To achieve geometry-aware spatial decomposition we leverage +multiple DiffusionNet components, each associated with a different spatial +resolution level. Subsequently, we apply a Fourier feature mapping to encourage +finer resolution levels to be associated with higher frequencies. The final +signal is composed in a wavelet-inspired manner using a sine-activated MLP, +aggregating higher-frequency signals on top of lower-frequency ones. Our +architecture attains high accuracy in learning complex neural fields and is +robust to discontinuities, exponential scale variations of the target field, +and mesh modification. We demonstrate the effectiveness of our approach through +its application to diverse neural fields, such as synthetic RGB functions, UV +texture coordinates, and vertex normals, illustrating different challenges. To +validate our method, we compare its performance against two alternatives, +showcasing the advantages of our multi-resolution architecture. + +
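+ A minimal NumPy sketch of a random Fourier feature mapping of the kind the
+architecture builds on, where finer levels would use larger frequency scales;
+the band count and scale below are illustrative, not the paper's settings.
+
+    import numpy as np
+
+    def fourier_features(x: np.ndarray, num_bands: int = 8, scale: float = 10.0) -> np.ndarray:
+        """Map coordinates x of shape (N, d) to [sin(2*pi*xB), cos(2*pi*xB)] with a random Gaussian B."""
+        rng = np.random.default_rng(0)
+        B = rng.normal(0.0, scale, size=(x.shape[1], num_bands))
+        proj = 2.0 * np.pi * x @ B
+        return np.concatenate([np.sin(proj), np.cos(proj)], axis=-1)
+
+    coords = np.random.rand(100, 3)    # e.g., per-vertex positions or features on a mesh
+    feats = fourier_features(coords)   # (100, 2 * num_bands) frequency-aware encoding
+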
+
+
+
+
+ + ☆ A General Albedo Recovery Approach for Aerial Photogrammetric Images + through Inverse Rendering SP + + +
+ Modeling outdoor scenes for the synthetic 3D environment requires the +recovery of reflectance/albedo information from raw images, which is an +ill-posed problem due to the complicated unmodeled physics in this process +(e.g., indirect lighting, volume scattering, specular reflection). The problem +remains unsolved in a practical context. The recovered albedo can facilitate +model relighting and shading, which can further enhance the realism of rendered +models and the applications of digital twins. Typically, photogrammetric 3D +models simply take the source images as texture materials, which inherently +embed unwanted lighting artifacts (at the time of capture) into the texture. +Therefore, these polluted textures are suboptimal for a synthetic environment +to enable realistic rendering. In addition, these embedded environmental +lightings further bring challenges to photo-consistencies across different +images that cause image-matching uncertainties. This paper presents a general +image formation model for albedo recovery from typical aerial photogrammetric +images under natural illuminations and derives the inverse model to resolve the +albedo information through inverse rendering intrinsic image decomposition. Our +approach builds on the fact that both the sun illumination and scene geometry +are estimable in aerial photogrammetry, thus they can provide direct inputs for +this ill-posed problem. This physics-based approach does not require additional +input other than data acquired through the typical drone-based photogrammetric +collection and was shown to favorably outperform existing approaches. We also +demonstrate that the recovered albedo image can in turn improve typical image +processing tasks in photogrammetry such as feature and dense matching, edge, +and line extraction. + +
+
+ comment: ISPRS Journal of Photogrammetry and Remote Sensing +
+
+
+
+
+ + ☆ No Detail Left Behind: Revisiting Self-Retrieval for Fine-Grained Image + Captioning + + +
+ Image captioning systems are unable to generate fine-grained captions as
+they are trained on data that is either noisy (alt-text) or generic (human
+annotations). This is further exacerbated by maximum likelihood training that
+encourages generation of frequently occurring phrases. Previous works have
+tried to address this limitation by fine-tuning captioners with a
+self-retrieval (SR) reward. However, we find that SR fine-tuning has a
+tendency to reduce caption faithfulness and even hallucinate. In this work,
+we circumvent this bottleneck by improving the MLE initialization of the
+captioning system and designing a curriculum for the SR fine-tuning process.
+To this end, we present (1) Visual Caption Boosting, a novel framework to
+instill fine-grainedness in generic image captioning datasets while remaining
+anchored in human annotations; and (2) BagCurri, a carefully designed
+training curriculum that more optimally leverages the contrastive nature of
+the self-retrieval reward. Jointly, they enable the captioner to describe
+fine-grained aspects in the image while preserving faithfulness to
+ground-truth captions. Our approach outperforms previous work by +8.9% on SR
+against 99 random distractors (RD100) (Dessi et al., 2023) and by +7.6% on
+ImageCoDe. Additionally, existing metrics to evaluate captioning systems fail
+to reward diversity or evaluate a model's fine-grained understanding ability.
+Our third contribution addresses this by proposing self-retrieval from the
+lens of evaluation. We introduce TrueMatch, a benchmark comprising bags of
+highly similar images that uses SR to assess the captioner's ability to
+capture subtle visual distinctions. We evaluate and compare several
+state-of-the-art open-source MLLMs on TrueMatch, and find that our SR
+approach outperforms them all by a significant margin (e.g., +4.8%-7.1% over
+Cambrian) while having 1-2 orders of magnitude fewer parameters.
+
+
+
+
+
+
+ + ☆ Boundless: Generating Photorealistic Synthetic Data for Object Detection + in Urban Streetscapes + + +
+ We introduce Boundless, a photo-realistic synthetic data generation system +for enabling highly accurate object detection in dense urban streetscapes. +Boundless can replace massive real-world data collection and manual +ground-truth object annotation (labeling) with an automated and configurable +process. Boundless is based on the Unreal Engine 5 (UE5) City Sample project +with improvements enabling accurate collection of 3D bounding boxes across +different lighting and scene variability conditions. + We evaluate the performance of object detection models trained on the dataset +generated by Boundless when used for inference on a real-world dataset acquired +from medium-altitude cameras. We compare the performance of the +Boundless-trained model against the CARLA-trained model and observe an +improvement of 7.8 mAP. The results we achieved support the premise that +synthetic data generation is a credible methodology for training/fine-tuning +scalable object detection models for urban scenes. + +
+
+
+
+
+ + ☆ Design and Evaluation of Camera-Centric Mobile Crowdsourcing + Applications + + +
+ The data that underlies automated methods in computer vision and machine +learning, such as image retrieval and fine-grained recognition, often comes +from crowdsourcing. In contexts that rely on the intrinsic motivation of users, +we seek to understand how the application design affects a user's willingness +to contribute and the quantity and quality of the data they capture. In this +project, we designed three versions of a camera-based mobile crowdsourcing +application, which varied in the amount of labeling effort requested of the +user and conducted a user study to evaluate the trade-off between the level of +user-contributed information requested and the quantity and quality of labeled +images collected. The results suggest that higher levels of user labeling do +not lead to reduced contribution. Users collected and annotated the most images +using the application version with the highest requested level of labeling with +no decrease in user satisfaction. In preliminary experiments, the additional +labeled data supported increased performance on an image retrieval task. + +
+
+
+
+
+ + ☆ Vec2Face: Scaling Face Dataset Generation with Loosely Constrained + Vectors + + +
+ This paper studies how to synthesize face images of non-existent persons, to +create a dataset that allows effective training of face recognition (FR) +models. Two important goals are (1) the ability to generate a large number of +distinct identities (inter-class separation) with (2) a wide variation in +appearance of each identity (intra-class variation). However, existing works 1) +are typically limited in how many well-separated identities can be generated +and 2) either neglect or use a separate editing model for attribute +augmentation. We propose Vec2Face, a holistic model that uses only a sampled +vector as input and can flexibly generate and control face images and their +attributes. Composed of a feature masked autoencoder and a decoder, Vec2Face is +supervised by face image reconstruction and can be conveniently used in +inference. Using vectors with low similarity among themselves as inputs, +Vec2Face generates well-separated identities. Randomly perturbing an input +identity vector within a small range allows Vec2Face to generate faces of the +same identity with robust variation in face attributes. It is also possible to +generate images with designated attributes by adjusting vector values with a +gradient descent method. Vec2Face has efficiently synthesized as many as 300K +identities with 15 million total images, whereas 60K is the largest number of +identities created in the previous works. FR models trained with the generated +HSFace datasets, from 10k to 300k identities, achieve state-of-the-art +accuracy, from 92% to 93.52%, on five real-world test sets. For the first time, +our model created using a synthetic training set achieves higher accuracy than +the model created using a same-scale training set of real face images (on the +CALFW test set). + +
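+ A loose sketch of the two sampling ideas described above, assuming NumPy:
+mutually dissimilar identity vectors for inter-class separation and small
+perturbations of one vector for intra-class variation. Dimensions, thresholds,
+and noise scale are hypothetical.
+
+    import numpy as np
+
+    rng = np.random.default_rng(42)
+
+    def sample_identity_vectors(n_ids: int, dim: int = 512, max_cos: float = 0.3) -> np.ndarray:
+        """Rejection-sample unit vectors whose pairwise cosine similarity stays below max_cos."""
+        ids = []
+        while len(ids) < n_ids:
+            v = rng.normal(size=dim)
+            v /= np.linalg.norm(v)
+            if all(abs(v @ u) < max_cos for u in ids):
+                ids.append(v)
+        return np.stack(ids)
+
+    identities = sample_identity_vectors(10)                      # well-separated identity vectors
+    variants = identities[0] + 0.05 * rng.normal(size=(4, 512))   # small perturbations of one identity
+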
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors. +Existing methods discover error slices by clustering the error-prone samples +with similar patterns or assigning discrete attributes to each sample for +post-hoc analysis. While these methods aim for interpretability and easier +mitigation through reweighting or rebalancing, they may not capture the full +complexity of error patterns due to incomplete or missing attributes. Contrary +to the existing approach, this paper utilizes the reasoning capabilities of the +Large Language Model (LLM) to analyze complex error patterns and generate +testable hypotheses. This paper proposes LADDER: Language Driven slice +Discovery and Error Rectification. It first projects the model's representation +into a language-aligned feature space (eg CLIP) to preserve semantics in the +original model feature space. This ensures the accurate retrieval of sentences +that highlight the model's errors. Next, the LLM utilizes the sentences and +generates hypotheses to discover error slices. Finally, we mitigate the error +by fine-tuning the classification head by creating a group-balanced dataset +using the hypotheses. Our entire method does not require any attribute +annotation, either explicitly or through external tagging models. We validate +our method with \textbf{five} image classification datasets. The code is +available (https://github.com/batmanlab/Ladder). + +
+
+
+
+
+ + ♻ ☆ Quantifying uncertainty in lung cancer segmentation with foundation + models applied to mixed-domain datasets + + +
+ Medical image foundation models have shown the ability to segment organs
+and tumors with minimal fine-tuning. These models are typically evaluated on
+task-specific in-distribution (ID) datasets. However, reliable performance on
+ID datasets does not guarantee robust generalization on out-of-distribution
+(OOD) datasets. Importantly, once deployed for clinical use, it is
+impractical to have `ground truth' delineations to assess ongoing performance
+drifts, especially when images fall into the OOD category due to different
+imaging protocols. Hence, we introduced a comprehensive set of
+computationally fast metrics to evaluate the performance of multiple
+foundation models (Swin UNETR, SimMIM, iBOT, SMIT) trained with
+self-supervised learning (SSL). SSL pretraining was selected as this approach
+is applicable for large, diverse, and unlabeled image sets. All models were
+fine-tuned on identical datasets for lung tumor segmentation from computed
+tomography (CT) scans. SimMIM, iBOT, and SMIT used identical architecture,
+pretraining, and fine-tuning datasets to assess performance variations with
+the choice of pretext tasks used in SSL. Evaluation was performed on two
+public lung cancer datasets (LRAD: n = 140, 5Rater: n = 21) with different
+image acquisitions and tumor stages compared to the training data (n = 317,
+public resource with stage III-IV lung cancers) and a public non-cancer
+dataset containing volumetric CT scans of patients with pulmonary embolism (n
+= 120). All models produced similarly accurate tumor segmentation on the lung
+cancer testing datasets. SMIT produced the highest F1-score (LRAD: 0.60,
+5Rater: 0.64) and the lowest entropy (LRAD: 0.06, 5Rater: 0.12), indicating a
+higher tumor detection rate and more confident segmentations. In the OOD
+dataset, SMIT misdetected the fewest tumors, as indicated by a median volume
+occupancy of 5.67 cc compared to 9.97 cc for the second-best method, SimMIM.
+
+
+
+
+
+
+ + ♻ ☆ Multi-task Learning Approach for Intracranial Hemorrhage Prognosis MICCAI 2024 + + +
+ Prognosis after intracranial hemorrhage (ICH) is influenced by a complex +interplay between imaging and tabular data. Rapid and reliable prognosis are +crucial for effective patient stratification and informed treatment +decision-making. In this study, we aim to enhance image-based prognosis by +learning a robust feature representation shared between prognosis and the +clinical and demographic variables most highly correlated with it. Our approach +mimics clinical decision-making by reinforcing the model to learn valuable +prognostic data embedded in the image. We propose a 3D multi-task image model +to predict prognosis, Glasgow Coma Scale and age, improving accuracy and +interpretability. Our method outperforms current state-of-the-art baseline +image models, and demonstrates superior performance in ICH prognosis compared +to four board-certified neuroradiologists using only CT scans as input. We +further validate our model with interpretability saliency maps. Code is +available at https://github.com/MiriamCobo/MultitaskLearning_ICH_Prognosis.git. + +
+
+ comment: 16 pages. Accepted at Machine Learning in Medical Imaging Workshop @ + MICCAI 2024 (MLMI2024). This is the submitted manuscript with added link to + github repo, funding acknowledgements and authors' names and affiliations. No + further post submission improvements or corrections were integrated. Final + version not published yet +
+
+
+
+
+ + ♻ ☆ SDE-based Multiplicative Noise Removal + + +
+ Multiplicative noise, also known as speckle or pepper noise, commonly affects
+images produced by synthetic aperture radar (SAR), lasers, or optical lenses.
+Unlike additive noise, which typically arises from thermal processes or
+external factors, multiplicative noise is inherent to the system, originating
+from fluctuations in diffuse reflections. These fluctuations result in
+multiple copies of the same signal with varying magnitudes being combined.
+Consequently, despeckling, or removing multiplicative noise, necessitates
+different techniques compared to those used for additive noise removal.
+ In this paper, we propose a novel approach using Stochastic Differential
+Equation based diffusion models to address multiplicative noise. We
+demonstrate that multiplicative noise can be effectively modeled as a
+Geometric Brownian Motion process in the logarithmic domain. Utilizing the
+Fokker-Planck equation, we derive the corresponding reverse process for image
+denoising. To validate our method, we conduct extensive experiments on two
+different datasets, comparing our approach to both classical signal
+processing techniques and contemporary CNN-based noise removal models. Our
+results indicate that the proposed method significantly outperforms existing
+methods on perception-based metrics such as FID and LPIPS, while maintaining
+competitive performance on traditional metrics like PSNR and SSIM.
+
+
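+ The key observation that multiplicative noise becomes additive in the
+logarithmic domain can be seen in a few lines of NumPy; the lognormal speckle
+and the placeholder "denoiser" below are purely illustrative, not the paper's
+model.
+
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    clean = np.full((64, 64), 0.5)
+    speckle = rng.lognormal(mean=0.0, sigma=0.3, size=clean.shape)
+    noisy = clean * speckle                  # multiplicative (speckle-like) corruption
+
+    log_noisy = np.log(noisy)                # log(x * n) = log(x) + log(n): additive in log domain
+    # An additive-noise remover (e.g., a learned SDE/diffusion reverse process) would act here.
+    denoised = np.exp(log_noisy)             # exponentiate back after denoising (identity placeholder)
+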
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Learning Local Pattern Modularization for Point Cloud Reconstruction + from Unseen Classes ECCV 2024 + + +
+ It is challenging to reconstruct 3D point clouds in unseen classes from
+single 2D images. Instead of an object-centered coordinate system, current
+methods generalize global priors learned on seen classes to reconstruct 3D
+shapes from unseen classes in a viewer-centered coordinate system. However,
+the reconstruction accuracy and interpretability still leave much room for
+improvement. To resolve this issue, we propose learning local pattern
+modularization for reconstructing 3D shapes in unseen classes, which achieves
+both good generalization ability and high reconstruction accuracy. Our
+insight is to learn a local prior which is class-agnostic and easy to
+generalize in an object-centered coordinate system. Specifically, the local
+prior is learned via a process of learning and customizing local pattern
+modularization in seen classes. During this process, we first learn a set of
+patterns in local regions, which is the basis in the object-centered
+coordinate system to represent an arbitrary region on shapes across different
+classes. Then, we modularize each region on an initially reconstructed shape
+using the learned local patterns. Based on that, we customize the local
+pattern modularization using the input image by refining the reconstruction
+with more details. Our method can reconstruct high-fidelity point clouds from
+unseen classes in an object-centered coordinate system without requiring a
+large number of patterns or any additional information, such as segmentation
+supervision or camera poses. Our experimental results on widely used
+benchmarks show that our method achieves state-of-the-art reconstruction
+accuracy for shapes from unseen classes. The code is available at
+https://github.com/chenchao15/Unseen.
+
+
+
+ comment: 14pages, 11figures, accepted by ECCV 2024 +
+
+
+
+
+ + ♻ ☆ CONDA: Condensed Deep Association Learning for Co-Salient Object + Detection + + +
+ Inter-image association modeling is crucial for co-salient object detection.
+Despite satisfactory performance, previous methods still fall short of
+sufficient inter-image association modeling, because most of them focus on
+image feature optimization under the guidance of heuristically calculated raw
+inter-image associations. They directly rely on raw associations, which are
+not reliable in complex scenarios, and their image feature optimization
+approach is not explicit for inter-image association modeling. To alleviate
+these limitations, this paper proposes a deep association learning strategy
+that deploys deep networks on raw associations to explicitly transform them
+into deep association features. Specifically, we first create
+hyperassociations to collect dense pixel-pair-wise raw associations and then
+deploy deep aggregation networks on them. We design a progressive association
+generation module for this purpose, with additional enhancement of the
+hyperassociation calculation. More importantly, we propose a
+correspondence-induced association condensation module that introduces a
+pretext task, i.e., semantic correspondence estimation, to condense the
+hyperassociations for computational burden reduction and noise elimination.
+We also design an object-aware cycle consistency loss for high-quality
+correspondence estimation. Experimental results on three benchmark datasets
+demonstrate the remarkable effectiveness of our proposed method with various
+training settings.
+
+
+
+ comment: There is an error. In Sec 4.1, the number of images in some dataset + is incorrect and needs to be revised +
+
+
+
+
+ + ♻ ☆ Open Gaze: Open Source eye tracker for smartphone devices using Deep + Learning + + +
+ Eye tracking has been a pivotal tool in diverse fields such as vision +research, language analysis, and usability assessment. The majority of prior +investigations, however, have concentrated on expansive desktop displays +employing specialized, costly eye tracking hardware that lacks scalability. +Remarkably little insight exists into ocular movement patterns on smartphones, +despite their widespread adoption and significant usage. In this manuscript, we +present an open-source implementation of a smartphone-based gaze tracker that +emulates the methodology proposed by a GooglePaper (whose source code remains +proprietary). Our focus is on attaining accuracy comparable to that attained +through the GooglePaper's methodology, without the necessity for supplementary +hardware. Through the integration of machine learning techniques, we unveil an +accurate eye tracking solution that is native to smartphones. Our approach +demonstrates precision akin to the state-of-the-art mobile eye trackers, which +are characterized by a cost that is two orders of magnitude higher. Leveraging +the vast MIT GazeCapture dataset, which is available through registration on +the dataset's website, we successfully replicate crucial findings from previous +studies concerning ocular motion behavior in oculomotor tasks and saliency +analyses during natural image observation. Furthermore, we emphasize the +applicability of smartphone-based gaze tracking in discerning reading +comprehension challenges. Our findings exhibit the inherent potential to +amplify eye movement research by significant proportions, accommodating +participation from thousands of subjects with explicit consent. This +scalability not only fosters advancements in vision research, but also extends +its benefits to domains such as accessibility enhancement and healthcare +applications. + +
+
+ comment: This paper results are incorrectly reported. The paper is not + authentic and conclusions are not correct +
+
+
+
+
+ + ♻ ☆ Q-Seg: Quantum Annealing-Based Unsupervised Image Segmentation + + +
+ We present Q-Seg, a novel unsupervised image segmentation method based on +quantum annealing, tailored for existing quantum hardware. We formulate the +pixel-wise segmentation problem, which assimilates spectral and spatial +information of the image, as a graph-cut optimization task. Our method +efficiently leverages the interconnected qubit topology of the D-Wave Advantage +device, offering superior scalability over existing quantum approaches and +outperforming several tested state-of-the-art classical methods. Empirical +evaluations on synthetic datasets have shown that Q-Seg has better runtime +performance than the state-of-the-art classical optimizer Gurobi. The method +has also been tested on earth observation image segmentation, a critical area +with noisy and unreliable annotations. In the era of noisy intermediate-scale +quantum, Q-Seg emerges as a reliable contender for real-world applications in +comparison to advanced techniques like Segment Anything. Consequently, Q-Seg +offers a promising solution using available quantum hardware, especially in +situations constrained by limited labeled data and the need for efficient +computational runtime. + +
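+ A rough sketch of how a binary pixel-labeling graph cut can be written as a
+QUBO, the form annealers consume; the edge weights and the pairwise-only
+energy below are illustrative (real formulations such as Q-Seg's add further
+terms), and no D-Wave-specific API is used.
+
+    def graph_cut_qubo(edge_weights):
+        """QUBO coefficients for E(x) = sum_{(i,j)} w_ij * (x_i - x_j)^2 with binary x."""
+        Q = {}
+        for (i, j), w in edge_weights.items():
+            Q[(i, i)] = Q.get((i, i), 0.0) + w        # x_i^2 = x_i for binary variables
+            Q[(j, j)] = Q.get((j, j), 0.0) + w
+            Q[(i, j)] = Q.get((i, j), 0.0) - 2.0 * w  # cross term -2 * w * x_i * x_j
+        return Q
+
+    # Tiny 1x3 "image": similar neighbouring pixels get a high weight, dissimilar ones a low weight.
+    Q = graph_cut_qubo({(0, 1): 1.0, (1, 2): 0.1})
+    # Q can be handed to any QUBO solver (simulated annealing, a D-Wave sampler, ...).
+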
+
+ comment: 12 pages, 9 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Enhancing the vision-language foundation model with key semantic + knowledge-emphasized report refinement + + +
+ Recently, vision-language representation learning has made remarkable
+advancements in building up medical foundation models, holding immense
+potential for transforming the landscape of clinical research and medical
+care. The underlying hypothesis is that the rich knowledge embedded in
+radiology reports can effectively assist and guide the learning process,
+reducing the need for additional labels. However, these reports tend to be
+complex and sometimes even contain redundant descriptions, making it
+challenging for representation learning to capture the key semantic
+information. This paper develops a novel iterative vision-language
+representation learning framework by proposing a key semantic
+knowledge-emphasized report refinement method. Particularly, raw radiology
+reports are refined to highlight the key information according to a
+constructed clinical dictionary and two model-optimized knowledge-enhancement
+metrics. The iterative framework is designed to learn progressively, starting
+from a general understanding of the patient's condition based on the raw
+reports and gradually refining and extracting the critical information
+essential to fine-grained analysis tasks. The effectiveness of the proposed
+framework is validated on various downstream medical image analysis tasks,
+including disease classification, region-of-interest segmentation, and phrase
+grounding. Our framework surpasses seven state-of-the-art methods in both
+fine-tuning and zero-shot settings, demonstrating its encouraging potential
+for different clinical applications.
+
+
+
+
+
+
+ + ♻ ☆ Pre-processing and Compression: Understanding Hidden Representation + Refinement Across Imaging Domains via Intrinsic Dimension + + +
+ In recent years, there has been interest in how geometric properties such as +intrinsic dimension (ID) of a neural network's hidden representations change +through its layers, and how such properties are predictive of important model +behavior such as generalization ability. However, evidence has begun to emerge +that such behavior can change significantly depending on the domain of the +network's training data, such as natural versus medical images. Here, we +further this inquiry by exploring how the ID of a network's learned +representations changes through its layers, in essence, characterizing how the +network successively refines the information content of input data to be used +for predictions. Analyzing eleven natural and medical image datasets across six +network architectures, we find that how ID changes through the network differs +noticeably between natural and medical image models. Specifically, medical +image models peak in representation ID earlier in the network, implying a +difference in the image features and their abstractness that are typically used +for downstream tasks in these domains. Additionally, we discover a strong +correlation of this peak representation ID with the ID of the data in its input +space, implying that the intrinsic information content of a model's learned +representations is guided by that of the data it was trained on. Overall, our +findings emphasize notable discrepancies in network behavior between natural +and non-natural imaging domains regarding hidden representation information +content, and provide further insights into how a network's learned features are +shaped by its training data. + +
+
+
+
+
+ + ♻ ☆ CHOTA: A Higher Order Accuracy Metric for Cell Tracking + + +
+ The evaluation of cell tracking results steers the development of tracking +methods, significantly impacting biomedical research. This is quantitatively +achieved by means of evaluation metrics. Unfortunately, current metrics favor +local correctness and weakly reward global coherence, impeding high-level +biological analysis. To also foster global coherence, we propose the CHOTA +metric (Cell-specific Higher Order Tracking Accuracy) which unifies the +evaluation of all relevant aspects of cell tracking: cell detections and local +associations, global coherence, and lineage tracking. We achieve this by +introducing a new definition of the term 'trajectory' that includes the entire +cell lineage and by including this into the well-established HOTA metric from +general multiple object tracking. Furthermore, we provide a detailed survey of +contemporary cell tracking metrics to compare our novel CHOTA metric and to +show its advantages. All metrics are extensively evaluated on state-of-the-art +real-data cell tracking results and synthetic results that simulate specific +tracking errors. We show that CHOTA is sensitive to all tracking errors and +gives a good indication of the biologically relevant capability of a method to +reconstruct the full lineage of cells. It introduces a robust and comprehensive +alternative to the currently used metrics in cell tracking. Python code is +available at https://github.com/CellTrackingChallenge/py-ctcmetrics . + +
+
+ comment: Accepted at BIC Workshop at European Conference on Computer Vision + 2024, 14 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Large Scale Unsupervised Brain MRI Image Registration Solution for + Learn2Reg 2024 MICCAI + + +
+ In this paper, we summarize the methods and experimental results we proposed
+for Task 2 of the Learn2Reg 2024 Challenge. This task focuses on unsupervised
+registration of anatomical structures in brain MRI images between different
+patients. The difficulty lies in (1) the absence of segmentation labels and
+(2) the large amount of data. To address these challenges, we built an
+efficient backbone network and explored several schemes to further enhance
+registration accuracy. Under the guidance of the NCC loss function and a
+smoothness regularization loss function, we obtained a smooth and reasonable
+deformation field. According to the leaderboard, our method achieved a Dice
+coefficient of 77.34%, which is 1.4% higher than TransMorph. Overall, we won
+second place on the leaderboard for Task 2.
+
+
+
+ comment: MICCAI Learn2Reg 2024 Challenge & WBIR 2024 Workshop on Biomedical + Imaging Registration +
+
+
+
+
+ + ♻ ☆ Nickel and Diming Your GAN: A Dual-Method Approach to Enhancing GAN + Efficiency via Knowledge Distillation + + +
+ In this paper, we address the challenge of compressing generative adversarial +networks (GANs) for deployment in resource-constrained environments by +proposing two novel methodologies: Distribution Matching for Efficient +compression (DiME) and Network Interactive Compression via Knowledge Exchange +and Learning (NICKEL). DiME employs foundation models as embedding kernels for +efficient distribution matching, leveraging maximum mean discrepancy to +facilitate effective knowledge distillation. Simultaneously, NICKEL employs an +interactive compression method that enhances the communication between the +student generator and discriminator, achieving a balanced and stable +compression process. Our comprehensive evaluation on the StyleGAN2 architecture +with the FFHQ dataset shows the effectiveness of our approach, with NICKEL & +DiME achieving FID scores of 10.45 and 15.93 at compression rates of 95.73% and +98.92%, respectively. Remarkably, our methods sustain generative quality even +at an extreme compression rate of 99.69%, surpassing the previous +state-of-the-art performance by a large margin. These findings not only +demonstrate our methodologies' capacity to significantly lower GANs' +computational demands but also pave the way for deploying high-quality GAN +models in settings with limited resources. Our code will be released soon. + +
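+ A minimal PyTorch sketch of a squared maximum mean discrepancy with an RBF
+kernel, the kind of distribution-matching objective described above; the
+embedding shapes and bandwidth are hypothetical, and this is a simple biased
+estimate rather than the paper's exact loss.
+
+    import torch
+
+    def mmd_rbf(x: torch.Tensor, y: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
+        """Simple (biased) estimate of squared MMD between two batches of embeddings."""
+        def kernel(a, b):
+            d2 = torch.cdist(a, b).pow(2)
+            return torch.exp(-d2 / (2 * sigma ** 2))
+        return kernel(x, x).mean() + kernel(y, y).mean() - 2 * kernel(x, y).mean()
+
+    teacher_emb = torch.randn(128, 512)   # e.g., foundation-model embeddings of teacher outputs
+    student_emb = torch.randn(128, 512)   # embeddings of compressed-generator outputs
+    loss = mmd_rbf(teacher_emb, student_emb)
+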
+
+
+
+
+ + ♻ ☆ When Does Visual Prompting Outperform Linear Probing for Vision-Language + Models? A Likelihood Perspective + + +
+ Adapting pre-trained models to new tasks can exhibit varying effectiveness +across datasets. Visual prompting, a state-of-the-art parameter-efficient +transfer learning method, can significantly improve the performance of +out-of-distribution tasks. On the other hand, linear probing, a standard +transfer learning method, can sometimes become the best approach. We propose a +log-likelihood ratio (LLR) approach to analyze the comparative benefits of +visual prompting and linear probing. By employing the LLR score alongside +resource-efficient visual prompts approximations, our cost-effective measure +attains up to a 100-fold reduction in run time compared to full training, while +achieving prediction accuracies up to 91%. The source code is available at +https://github.com/IBM/VP-LLR. + +
+
+
+
+
+ + ♻ ☆ MMA-MRNNet: Harnessing Multiple Models of Affect and Dynamic Masked RNN + for Precise Facial Expression Intensity Estimation + + +
+ This paper presents MMA-MRNNet, a novel deep learning architecture for +dynamic multi-output Facial Expression Intensity Estimation (FEIE) from video +data. Traditional approaches to this task often rely on complex 3-D CNNs, which +require extensive pre-training and assume that facial expressions are uniformly +distributed across all frames of a video. These methods struggle to handle +videos of varying lengths, often resorting to ad-hoc strategies that either +discard valuable information or introduce bias. MMA-MRNNet addresses these +challenges through a two-stage process. First, the Multiple Models of Affect +(MMA) extractor component is a Multi-Task Learning CNN that concurrently +estimates valence-arousal, recognizes basic facial expressions, and detects +action units in each frame. These representations are then processed by a +Masked RNN component, which captures temporal dependencies and dynamically +updates weights according to the true length of the input video, ensuring that +only the most relevant features are used for the final prediction. The proposed +unimodal non-ensemble learning MMA-MRNNet was evaluated on the Hume-Reaction +dataset and demonstrated significantly superior performance, surpassing +state-of-the-art methods by a wide margin, regardless of whether they were +unimodal, multimodal, or ensemble approaches. Finally, we demonstrated the +effectiveness of the MMA component of our proposed method across multiple +in-the-wild datasets, where it consistently outperformed all state-of-the-art +methods across various metrics. + +
+
+
+
+
+ + ♻ ☆ In the Search for Optimal Multi-view Learning Models for Crop + Classification with Global Remote Sensing Data + + +
+ Studying and analyzing cropland is a difficult task due to its dynamic and +heterogeneous growth behavior. Usually, diverse data sources can be collected +for its estimation. Although deep learning models have proven to excel in the +crop classification task, they face substantial challenges when dealing with +multiple inputs, named Multi-View Learning (MVL). The methods used in the MVL +scenario can be structured based on the encoder architecture, the fusion +strategy, and the optimization technique. The literature has primarily focused +on using specific encoder architectures for local regions, lacking a deeper +exploration of other components in the MVL methodology. In contrast, we +investigate the simultaneous selection of the fusion strategy and encoder +architecture, assessing global-scale cropland and crop-type classifications. We +use a range of five fusion strategies (Input, Feature, Decision, Ensemble, +Hybrid) and five temporal encoders (LSTM, GRU, TempCNN, TAE, L-TAE) as possible +configurations in the MVL method. We use the CropHarvest dataset for +validation, which provides optical, radar, weather time series, and topographic +information as input data. We found that in scenarios with a limited number of +labeled samples, a unique configuration is insufficient for all the cases. +Instead, a specialized combination should be meticulously sought, including an +encoder and fusion strategy. To streamline this search process, we suggest +identifying the optimal encoder architecture tailored for a particular fusion +strategy, and then determining the most suitable fusion strategy for the +classification task. We provide a methodological framework for researchers +exploring crop classification through an MVL methodology. + +
+
+ comment: submitted to journal +
+
+
+
+
+ + ♻ ☆ Increasing the Robustness of Model Predictions to Missing Sensors in + Earth Observation ACL + + +
+ Multi-sensor ML models for EO aim to enhance prediction accuracy by +integrating data from various sources. However, the presence of missing data +poses a significant challenge, particularly in non-persistent sensors that can +be affected by external factors. Existing literature has explored strategies +like temporal dropout and sensor-invariant models to address the generalization +to missing data issues. Inspired by these works, we study two novel methods +tailored for multi-sensor scenarios, namely Input Sensor Dropout (ISensD) and +Ensemble Sensor Invariant (ESensI). Through experimentation on three +multi-sensor temporal EO datasets, we demonstrate that these methods +effectively increase the robustness of model predictions to missing sensors. +Particularly, we focus on how the predictive performance of models drops when +sensors are missing at different levels. We observe that ensemble multi-sensor +models are the most robust to the lack of sensors. In addition, the sensor +dropout component in ISensD shows promising robustness results. + +
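+ A plausible rendering of the input sensor dropout idea in PyTorch: entire
+sensor streams are zeroed at random during training so the model learns to
+cope with missing sources. Shapes, rate, and the zero-filling choice are
+assumptions, not the authors' code.
+
+    import torch
+
+    def input_sensor_dropout(sensors, p: float = 0.3):
+        """Randomly drop (zero out) whole sensor streams from a list of input tensors."""
+        out = []
+        for x in sensors:
+            drop = torch.rand(()).item() < p
+            out.append(torch.zeros_like(x) if drop else x)
+        return out
+
+    optical = torch.randn(8, 12, 11)   # batch x time x bands (illustrative shapes)
+    radar = torch.randn(8, 12, 2)
+    weather = torch.randn(8, 12, 5)
+    augmented = input_sensor_dropout([optical, radar, weather])
+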
+
+ comment: Accepted at the MACLEAN workshop in the ECML/PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Scalable Glacier Mapping using Deep Learning and Open Earth Observation + Data Matches the Accuracy of Manual Delineation + + +
+ Accurate global glacier mapping is critical for understanding climate change +impacts. Despite its importance, automated glacier mapping at a global scale +remains largely unexplored. Here we address this gap and propose +Glacier-VisionTransformer-U-Net (GlaViTU), a convolutional-transformer deep +learning model, and five strategies for multitemporal global-scale glacier +mapping using open satellite imagery. Assessing the spatial, temporal and +cross-sensor generalisation shows that our best strategy achieves intersection +over union >0.85 on previously unobserved images in most cases, which drops to +>0.75 for debris-rich areas such as High-Mountain Asia and increases to >0.90 +for regions dominated by clean ice. A comparative validation against human +expert uncertainties in terms of area and distance deviations underscores +GlaViTU performance, approaching or matching expert-level delineation. Adding +synthetic aperture radar data, namely, backscatter and interferometric +coherence, increases the accuracy in all regions where available. The +calibrated confidence for glacier extents is reported making the predictions +more reliable and interpretable. We also release a benchmark dataset that +covers 9% of glaciers worldwide. Our results support efforts towards automated +multitemporal and global glacier mapping. + +
+
+ comment: after major revision, expanded validation +
+
+
+
+
+ + ♻ ☆ CSGO: Content-Style Composition in Text-to-Image Generation + + +
+ The diffusion model has shown exceptional capabilities in controlled image generation, which has further fueled interest in image style transfer. Existing works mainly focus on training-free methods (e.g., image inversion) due to the scarcity of specific data. In this study, we present a data construction pipeline for content-style-stylized image triplets that generates and automatically cleanses stylized data triplets. Based on this pipeline, we construct IMAGStyle, the first large-scale style transfer dataset, containing 210k image triplets, available for the community to explore and research. Equipped with IMAGStyle, we propose CSGO, a style transfer model based on end-to-end training, which explicitly decouples content and style features by employing independent feature injection. The unified CSGO implements image-driven style transfer, text-driven stylized synthesis, and text editing-driven stylized synthesis. Extensive experiments demonstrate the effectiveness of our approach in enhancing style control capabilities in image generation. Additional visualizations and access to the source code are available on the project page: \url{https://csgo-gen.github.io/}.
+
+
+
+
+ + ♻ ☆ Object-Size-Driven Design of Convolutional Neural Networks: Virtual Axle + Detection based on Raw Data + + +
+ As infrastructure ages, the need for efficient monitoring methods becomes increasingly critical. Bridge Weigh-In-Motion (BWIM) systems are crucial for cost-efficient load and thus residual service life determination of road and railway infrastructure. However, conventional BWIM systems require additional sensors for axle detection, which have to be installed in potentially inaccessible locations or in locations that interfere with bridge operation. This study addresses this challenge by replacing dedicated axle detectors with a novel approach to real-time detection of train axles using sensors arbitrarily placed on bridges. The proposed Virtual Axle Detector with Enhanced Receptive Field (VADER) has been validated on a single-track railway bridge, demonstrating that it detects 99.9% of axles with a spatial error of 3.69 cm using only acceleration measurements. Using raw data as input outperforms the state-of-the-art spectrogram-based method in both speed and memory usage by 99%, making real-time application feasible for the first time. Additionally, we introduce the Maximum Receptive Field (MRF) rule, a novel approach to optimising the hyperparameters of Convolutional Neural Networks (CNNs) based on the size of the objects of interest, which in this case relates to the fundamental frequency of a bridge. The MRF rule effectively narrows the hyperparameter search space, potentially replacing the need for extensive hyperparameter tuning. Since the MRF rule is theoretically applicable to all unstructured data, it could have implications for a wide range of deep learning problems, from earthquake prediction to object recognition.
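The MRF rule itself is not specified in the abstract; the sketch below only shows the standard receptive-field computation for a stack of 1-D convolutional layers, plus a hedged check that the receptive field spans one period of an assumed fundamental bridge frequency. The layer configuration, sampling rate, and frequency are made-up values, not taken from the paper.

```python
def receptive_field(layers):
    """Receptive field (in input samples) of a stack of 1-D conv/pool layers.

    layers: list of (kernel_size, stride, dilation) tuples.
    Standard recursion: r += (k - 1) * d * jump; jump *= stride.
    """
    r, jump = 1, 1
    for kernel, stride, dilation in layers:
        r += (kernel - 1) * dilation * jump
        jump *= stride
    return r


# Hypothetical network and sensor settings (illustration only).
layers = [(15, 2, 1), (11, 2, 1), (9, 2, 1), (7, 2, 1), (5, 2, 1)]
fs = 600.0        # assumed sampling rate in Hz
f0 = 5.0          # assumed fundamental bridge frequency in Hz
samples_per_period = fs / f0

rf = receptive_field(layers)
print(f"receptive field: {rf} samples, one period: {samples_per_period:.0f} samples")
print("covers at least one fundamental period:", rf >= samples_per_period)
```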
+
+
+
+
+ + ♻ ☆ Filter & Align: Leveraging Human Knowledge to Curate Image-Text Data + + +
+ The increasing availability of image-text pairs has largely fueled the rapid advancement of vision-language foundation models. However, the vast scale of these datasets inevitably introduces significant variability in data quality, which can adversely affect model performance. This highlights the critical role of data filtering, not only to enhance training efficiency but also to improve overall data quality. Existing methods typically rely on metrics such as CLIP Score and BLIP Score, which are derived from pre-trained models. However, these models are often trained on uncurated, noisy datasets, which can perpetuate errors and misalignments in the filtered dataset. We present a novel algorithm that incorporates human knowledge of image-text alignment to guide the filtering of a vast corpus of web-crawled image-text data into a compact and high-quality form. To systematically capture human preferences on image-text alignment, we collect a diverse image-text dataset where each image is associated with multiple captions from various sources, and establish a comprehensive set of both subjective and objective criteria for critically guiding the alignment assessment by labelers. Additionally, we train a reward model on these human-preference annotations to internalize the nuanced human understanding of image-text alignment. The resulting reward model can thus act as a human-like referee to filter image-text pairs. Extensive experiments demonstrate that we can maintain, and sometimes even improve, model performance while compressing the image-text datasets by up to ~90%. An impressive example is that, by aggressively reducing the total training samples from 130M to only 15.5M, our BLIP-B/16 models consistently show an average improvement of 2.9% on retrieval tasks and 11.5% on captioning tasks compared to full-size-dataset counterparts.
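As a rough sketch of the filtering step described above (not the authors' implementation), the following shows how a trained reward model could be used to keep only the highest-scoring image-text pairs; the reward function here is a random stand-in.

```python
import numpy as np


def filter_by_reward(pairs, reward_fn, keep_ratio=0.1):
    """Keep only the top `keep_ratio` image-text pairs ranked by reward score.

    pairs: list of (image, caption) tuples.
    reward_fn: callable scoring how well a caption matches its image.
    """
    scores = np.array([reward_fn(img, txt) for img, txt in pairs])
    k = max(1, int(len(pairs) * keep_ratio))
    top = np.argsort(scores)[::-1][:k]
    return [pairs[i] for i in top]


# Stand-in reward model: random scores just to exercise the pipeline.
rng = np.random.default_rng(0)
dummy_pairs = [(f"img_{i}.jpg", f"caption {i}") for i in range(100)]
kept = filter_by_reward(dummy_pairs, lambda img, txt: rng.random(), keep_ratio=0.12)
print(len(kept), "pairs kept out of", len(dummy_pairs))
```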
+
+
+
+
+ + ♻ ☆ Multi-Task Multi-Modal Self-Supervised Learning for Facial Expression + Recognition CVPR 2024 + + +
+ Human communication is multi-modal; e.g., face-to-face interaction involves +auditory signals (speech) and visual signals (face movements and hand +gestures). Hence, it is essential to exploit multiple modalities when designing +machine learning-based facial expression recognition systems. In addition, +given the ever-growing quantities of video data that capture human facial +expressions, such systems should utilize raw unlabeled videos without requiring +expensive annotations. Therefore, in this work, we employ a multitask +multi-modal self-supervised learning method for facial expression recognition +from in-the-wild video data. Our model combines three self-supervised objective +functions: First, a multi-modal contrastive loss, that pulls diverse data +modalities of the same video together in the representation space. Second, a +multi-modal clustering loss that preserves the semantic structure of input data +in the representation space. Finally, a multi-modal data reconstruction loss. +We conduct a comprehensive study on this multimodal multi-task self-supervised +learning method on three facial expression recognition benchmarks. To that end, +we examine the performance of learning through different combinations of +self-supervised tasks on the facial expression recognition downstream task. Our +model ConCluGen outperforms several multi-modal self-supervised and fully +supervised baselines on the CMU-MOSEI dataset. Our results generally show that +multi-modal self-supervision tasks offer large performance gains for +challenging tasks such as facial expression recognition, while also reducing +the amount of manual annotations required. We release our pre-trained models as +well as source code publicly + +
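A minimal sketch of the first of the three objectives, a multi-modal contrastive loss that pulls paired modalities of the same clip together, is shown below using a standard symmetric InfoNCE formulation; the embedding dimension and temperature are assumptions, not values from the paper.

```python
import torch
import torch.nn.functional as F


def multimodal_contrastive_loss(z_audio, z_video, temperature=0.1):
    """Symmetric InfoNCE loss pulling paired audio/video clips together.

    z_audio, z_video: (batch, dim) embeddings of the same batch of clips.
    """
    z_a = F.normalize(z_audio, dim=-1)
    z_v = F.normalize(z_video, dim=-1)
    logits = z_a @ z_v.t() / temperature                    # (batch, batch) similarities
    targets = torch.arange(z_a.size(0), device=z_a.device)  # positives on the diagonal
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))


loss = multimodal_contrastive_loss(torch.randn(16, 128), torch.randn(16, 128))
print(float(loss))
```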
+
+ comment: The paper will appear in the CVPR 2024 workshops proceedings +
+
+
+
+
+ + ♻ ☆ UHD-IQA Benchmark Database: Pushing the Boundaries of Blind Photo + Quality Assessment + + +
+ We introduce a novel Image Quality Assessment (IQA) dataset comprising 6073 +UHD-1 (4K) images, annotated at a fixed width of 3840 pixels. Contrary to +existing No-Reference (NR) IQA datasets, ours focuses on highly aesthetic +photos of high technical quality, filling a gap in the literature. The images, +carefully curated to exclude synthetic content, are sufficiently diverse to +train general NR-IQA models. Importantly, the dataset is annotated with +perceptual quality ratings obtained through a crowdsourcing study. Ten expert +raters, comprising photographers and graphics artists, assessed each image at +least twice in multiple sessions spanning several days, resulting in 20 highly +reliable ratings per image. Annotators were rigorously selected based on +several metrics, including self-consistency, to ensure their reliability. The +dataset includes rich metadata with user and machine-generated tags from over +5,000 categories and popularity indicators such as favorites, likes, downloads, +and views. With its unique characteristics, such as its focus on high-quality +images, reliable crowdsourced annotations, and high annotation resolution, our +dataset opens up new opportunities for advancing perceptual image quality +assessment research and developing practical NR-IQA models that apply to modern +photos. Our dataset is available at +https://database.mmsp-kn.de/uhd-iqa-benchmark-database.html + +
+
+
+
+
+ + ♻ ☆ Model-agnostic explainable artificial intelligence for object detection + in image data + + +
+ In recent years, deep neural networks have been widely used for building high-performance Artificial Intelligence (AI) systems for computer vision applications. Object detection is a fundamental task in computer vision, which has progressed greatly through the development of large and intricate AI models. However, the lack of transparency is a major challenge that may hinder the widespread adoption of these models. Explainable artificial intelligence is a field of research in which methods are developed to help users understand the behavior, decision logic, and vulnerabilities of AI systems. Previously, a few explanation methods based on random masking were developed for object detection. However, random masks may raise issues regarding the actual importance of pixels within an image. In this paper, we design and implement a black-box explanation method named Black-box Object Detection Explanation by Masking (BODEM) by adopting a hierarchical random masking approach for object detection systems. We propose a hierarchical random masking framework in which coarse-grained masks are used at lower levels to find salient regions within an image, and fine-grained masks are used at higher levels to refine these salient regions. Experiments on various object detection datasets and models showed that BODEM can effectively explain the behavior of object detectors. Moreover, our method outperformed Detector Randomized Input Sampling for Explanation (D-RISE) and Local Interpretable Model-agnostic Explanations (LIME) with respect to different quantitative measures of explanation effectiveness. The experimental results demonstrate that BODEM can be an effective method for explaining and validating object detection systems in black-box testing scenarios.
+
+
+
+
+ + ♻ ☆ Map-Free Visual Relocalization Enhanced by Instance Knowledge and Depth + Knowledge + + +
+ Map-free relocalization technology is crucial for applications in autonomous +navigation and augmented reality, but relying on pre-built maps is often +impractical. It faces significant challenges due to limitations in matching +methods and the inherent lack of scale in monocular images. These issues lead +to substantial rotational and metric errors and even localization failures in +real-world scenarios. Large matching errors significantly impact the overall +relocalization process, affecting both rotational and translational accuracy. +Due to the inherent limitations of the camera itself, recovering the metric +scale from a single image is crucial, as this significantly impacts the +translation error. To address these challenges, we propose a map-free +relocalization method enhanced by instance knowledge and depth knowledge. By +leveraging instance-based matching information to improve global matching +results, our method significantly reduces the possibility of mismatching across +different objects. The robustness of instance knowledge across the scene helps +the feature point matching model focus on relevant regions and enhance matching +accuracy. Additionally, we use estimated metric depth from a single image to +reduce metric errors and improve scale recovery accuracy. By integrating +methods dedicated to mitigating large translational and rotational errors, our +approach demonstrates superior performance in map-free relocalization +techniques. + +
+
+ comment: 17 pages,6 figures +
+
+
+
+
+ + ♻ ☆ CT-AGRG: Automated Abnormality-Guided Report Generation from 3D Chest CT + Volumes + + +
+ The rapid increase of computed tomography (CT) scans and their time-consuming manual analysis have created an urgent need for robust automated analysis techniques in clinical settings. These techniques aim to assist radiologists and help them manage their growing workload. Existing methods typically generate entire reports directly from 3D CT images, without explicitly focusing on observed abnormalities. This unguided approach often results in repetitive content or incomplete reports, failing to prioritize anomaly-specific descriptions. We propose a new anomaly-guided report generation model, which first predicts abnormalities and then generates targeted descriptions for each. Evaluation on a public dataset demonstrates significant improvements in report quality and clinical relevance. We extend our work by conducting an ablation study to demonstrate its effectiveness.
+
+ comment: 15 pages, 9 figures, submitted to ISBI 2025 +
+
+
+
+
+ + ♻ ☆ Path-SAM2: Transfer SAM2 for digital pathology semantic segmentation + + +
+ The semantic segmentation task in pathology plays an indispensable role in assisting physicians in determining the condition of tissue lesions. Since the proposal of the Segment Anything Model (SAM), more and more foundation models have seen rapid development in the field of image segmentation. Recently, SAM2 has garnered widespread attention in both natural image and medical image segmentation. Compared to SAM, it has significantly improved segmentation accuracy and generalization performance. We compared foundation models based on SAM and found that their performance in semantic segmentation of pathological images was hardly satisfactory. In this paper, we propose Path-SAM2, which, for the first time, adapts the SAM2 model to the task of pathological semantic segmentation. We integrate the largest pretrained vision encoder for histopathology (UNI) with the original SAM2 encoder, adding more pathology-based prior knowledge. Additionally, we introduce a learnable Kolmogorov-Arnold Networks (KAN) classification module to replace the manual prompting process. On three adenoma pathology datasets, Path-SAM2 achieves state-of-the-art performance. This study demonstrates the great potential of adapting SAM2 to pathology image segmentation tasks. We plan to release the code and model weights for this paper at: https://github.com/simzhangbest/SAM2PATH
+
+ comment: 5 pages , 5 figures +
+
+
+
+
+ + ♻ ☆ Bayesian Evidential Learning for Few-Shot Classification + + +
+ Few-Shot Classification (FSC) aims to generalize from base classes to novel classes given very limited labeled samples, which is an important step on the path toward human-like machine learning. State-of-the-art solutions involve learning to find a good metric and representation space to compute the distance between samples. Despite promising accuracy, how to effectively model uncertainty for metric-based FSC methods is still a challenge. To model uncertainty, we place a distribution over class probabilities based on the theory of evidence. As a result, uncertainty modeling and metric learning can be decoupled. To reduce the uncertainty of classification, we propose a Bayesian evidence fusion theorem. Given observed samples, the network learns to produce posterior distribution parameters from the prior parameters produced by the pre-trained network. A detailed gradient analysis shows that our method provides a smooth optimization target and can capture the uncertainty. The proposed method is agnostic to metric learning strategies and can be implemented as a plug-and-play module. We integrate our method into several of the newest FSC methods and demonstrate improved accuracy and uncertainty quantification on standard FSC benchmarks.
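A minimal sketch of the evidential part of this idea, assuming the common Dirichlet-based formulation (evidence from a softplus, uncertainty as the number of classes divided by the Dirichlet strength), is shown below; it is not the authors' exact parameterization.

```python
import torch
import torch.nn.functional as F


def evidential_outputs(logits):
    """Convert classifier logits into Dirichlet evidence, probabilities, and uncertainty.

    Common evidential formulation: evidence e >= 0, alpha = e + 1,
    expected probability = alpha / S, uncertainty = K / S with S = sum(alpha).
    """
    evidence = F.softplus(logits)               # non-negative evidence per class
    alpha = evidence + 1.0                      # Dirichlet concentration parameters
    strength = alpha.sum(dim=-1, keepdim=True)  # Dirichlet strength S
    prob = alpha / strength                     # expected class probabilities
    uncertainty = logits.size(-1) / strength    # vacuity: high when evidence is low
    return prob, uncertainty


logits = torch.tensor([[4.0, 0.1, -2.0],        # confident sample
                       [-9.0, -9.0, -9.0]])     # near-zero evidence -> uncertainty near 1
prob, u = evidential_outputs(logits)
print(prob, u, sep="\n")
```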
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ AI-Assisted Cervical Cancer Screening + + +
+ Visual Inspection with Acetic Acid (VIA) remains the most feasible cervical cancer screening test in resource-constrained settings of low- and middle-income countries (LMICs), where it is often performed in screening camps or primary/community health centers by nurses instead of the preferred but unavailable expert gynecologists. To address the highly subjective nature of the test, various handheld devices integrating cameras or smartphones have recently been explored to capture cervical images during VIA and aid decision-making via telemedicine or AI models. Most studies proposing AI models retrospectively use a relatively small number of already collected images from specific devices, digital cameras, or smartphones; the challenges of and protocols for quality image acquisition during VIA in resource-constrained camp settings, difficulties in obtaining a gold standard, data imbalance, etc., are often overlooked. We present a novel approach and describe the end-to-end design process for building a robust smartphone-based AI-assisted system that does not require buying a separate integrated device: the proposed protocol for quality image acquisition in resource-constrained settings, a dataset collected from 1,430 women during VIA performed by nurses in screening camps, the preprocessing pipeline, and the training and evaluation of a deep-learning-based classification model aimed at identifying (pre)cancerous lesions. Our work shows that readily available smartphones and a suitable protocol can capture cervix images with the details required for the VIA test; that the deep-learning-based classification model provides promising results to assist nurses in VIA screening; and that our approach provides a direction for large-scale data collection and validation in resource-constrained settings.
+
+
+
+
+ + ♻ ☆ Style-NeRF2NeRF: 3D Style Transfer From Style-Aligned Multi-View Images + + +
+ We propose a simple yet effective pipeline for stylizing a 3D scene, +harnessing the power of 2D image diffusion models. Given a NeRF model +reconstructed from a set of multi-view images, we perform 3D style transfer by +refining the source NeRF model using stylized images generated by a +style-aligned image-to-image diffusion model. Given a target style prompt, we +first generate perceptually similar multi-view images by leveraging a +depth-conditioned diffusion model with an attention-sharing mechanism. Next, +based on the stylized multi-view images, we propose to guide the style transfer +process with the sliced Wasserstein loss based on the feature maps extracted +from a pre-trained CNN model. Our pipeline consists of decoupled steps, +allowing users to test various prompt ideas and preview the stylized 3D result +before proceeding to the NeRF fine-tuning stage. We demonstrate that our method +can transfer diverse artistic styles to real-world 3D scenes with competitive +quality. Result videos are also available on our project page: +https://haruolabs.github.io/style-n2n/ + +
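The sliced Wasserstein loss used for style guidance can be sketched as follows: project both feature sets onto random unit directions, sort the projections, and penalize their difference. Feature shapes and the number of projections below are assumptions for illustration.

```python
import torch


def sliced_wasserstein_loss(feat_a, feat_b, n_projections=64):
    """Sliced Wasserstein distance between two sets of feature vectors.

    feat_a, feat_b: (N, C) feature maps flattened over spatial positions.
    Random 1-D projections are sorted and compared with an L2 penalty.
    """
    c = feat_a.size(1)
    proj = torch.randn(c, n_projections, device=feat_a.device)
    proj = proj / proj.norm(dim=0, keepdim=True)          # unit-norm directions
    a_proj, _ = torch.sort(feat_a @ proj, dim=0)          # (N, P) sorted projections
    b_proj, _ = torch.sort(feat_b @ proj, dim=0)
    return ((a_proj - b_proj) ** 2).mean()


a = torch.randn(1024, 256)   # e.g. CNN features of the rendered view
b = torch.randn(1024, 256)   # features of the stylized target view
print(float(sliced_wasserstein_loss(a, b)))
```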
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Rethinking Barely-Supervised Volumetric Medical Image Segmentation from + an Unsupervised Domain Adaptation Perspective + + +
+ This paper investigates an extremely challenging problem: barely-supervised +volumetric medical image segmentation (BSS). A BSS training dataset consists of +two parts: 1) a barely-annotated labeled set, where each labeled image contains +only a single-slice annotation, and 2) an unlabeled set comprising numerous +unlabeled volumetric images. State-of-the-art BSS methods employ a +registration-based paradigm, which uses inter-slice image registration to +propagate single-slice annotations into volumetric pseudo labels, constructing +a completely annotated labeled set, to which a semi-supervised segmentation +scheme can be applied. However, the paradigm has a critical limitation: the +pseudo-labels generated by image registration are unreliable and noisy. +Motivated by this, we propose a new perspective: instead of solving BSS within +a semi-supervised learning scheme, this work formulates BSS as an unsupervised +domain adaptation problem. To this end, we propose a novel BSS framework, +\textbf{B}arely-supervised learning \textbf{via} unsupervised domain +\textbf{A}daptation (BvA), as an alternative to the dominant registration +paradigm. Specifically, we first design a novel noise-free labeled data +construction algorithm (NFC) for slice-to-volume labeled data synthesis. Then, +we introduce a frequency and spatial Mix-Up strategy (FSX) to mitigate the +domain shifts. Extensive experiments demonstrate that our method provides a +promising alternative for BSS. Remarkably, the proposed method, trained on the +left atrial segmentation dataset with \textbf{only one} barely-labeled image, +achieves a Dice score of 81.20%, outperforming the state-of-the-art by 61.71%. +The code is available at https://github.com/Senyh/BvA. + +
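The exact form of the frequency and spatial Mix-Up (FSX) strategy is not given in the abstract; the sketch below shows a generic frequency-domain Mix-Up in the spirit of Fourier-based domain adaptation, mixing the low-frequency amplitude spectra of two image slices while keeping the source phase. The mixing ratio and radius are arbitrary placeholder values.

```python
import numpy as np


def frequency_mixup(src, tgt, alpha=0.5, radius=0.1):
    """Mix the low-frequency amplitude spectrum of `tgt` into `src`.

    src, tgt: 2-D image slices of the same shape.
    radius: fraction of the spectrum (around DC) whose amplitude is mixed.
    """
    fft_src, fft_tgt = np.fft.fft2(src), np.fft.fft2(tgt)
    amp_src, pha_src = np.abs(fft_src), np.angle(fft_src)
    amp_tgt = np.abs(fft_tgt)

    h, w = src.shape
    bh, bw = int(h * radius), int(w * radius)
    amp_src_shift = np.fft.fftshift(amp_src)
    amp_tgt_shift = np.fft.fftshift(amp_tgt)
    ch, cw = h // 2, w // 2
    region = (slice(ch - bh, ch + bh + 1), slice(cw - bw, cw + bw + 1))
    amp_src_shift[region] = ((1 - alpha) * amp_src_shift[region]
                             + alpha * amp_tgt_shift[region])

    mixed_amp = np.fft.ifftshift(amp_src_shift)
    mixed = np.fft.ifft2(mixed_amp * np.exp(1j * pha_src))
    return np.real(mixed)


src, tgt = np.random.rand(64, 64), np.random.rand(64, 64)
print(frequency_mixup(src, tgt).shape)  # (64, 64)
```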
+
+
+
+
+ + ♻ ☆ Zero-shot 3D Segmentation of Abdominal Organs in CT Scans Using Segment + Anything Model 2: Adapting Video Tracking Capabilities for 3D Medical Imaging + + +
+ Purpose: + To evaluate the zero-shot performance of Segment Anything Model 2 (SAM 2) in +3D segmentation of abdominal organs in CT scans, and to investigate the effects +of prompt settings on segmentation results. + Materials and Methods: + Using a subset of the TotalSegmentator CT dataset (n = 123) from eight +institutions, we assessed SAM 2's ability to segment eight abdominal organs. +Segmentation was initiated from three different z-coordinate levels (caudal, +mid, and cranial levels) of each organ. Performance was measured using the Dice +similarity coefficient (DSC). We also analyzed the impact of "negative +prompts," which explicitly exclude certain regions from the segmentation +process, on accuracy. Additionally, we analyzed organ volumes to contextualize +the segmentation performance. + Results: + As a zero-shot approach, larger organs with clear boundaries demonstrated +high segmentation performance, with mean(median) DSCs as follows: liver +0.821(0.898), left kidney 0.870(0.921), right kidney 0.862(0.935), and spleen +0.891(0.932). Smaller organs showed lower performance: gallbladder +0.531(0.590), pancreas 0.361(0.359), and adrenal glands, right 0.203(0.109), +left 0.308(0.231). The initial slice for segmentation and the use of negative +prompts significantly influenced the results. By removing negative prompts from +the input, the DSCs significantly decreased for six organs. Moderate positive +correlations were observed between volume sizes and DSCs. + Conclusion: + SAM 2 demonstrated promising zero-shot performance in segmenting certain +abdominal organs in CT scans, particularly larger organs with clear boundaries. +Performance was significantly influenced by input negative prompts and initial +slice selection, highlighting the importance of optimizing these factors for +effective segmentation. + +
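For reference, the Dice similarity coefficient used throughout this evaluation can be computed for binary 3-D masks as in the small sketch below; the example masks are synthetic.

```python
import numpy as np


def dice_coefficient(pred, gt, eps=1e-7):
    """Dice similarity coefficient between two binary 3-D masks."""
    pred = pred.astype(bool)
    gt = gt.astype(bool)
    intersection = np.logical_and(pred, gt).sum()
    return (2.0 * intersection + eps) / (pred.sum() + gt.sum() + eps)


pred = np.zeros((8, 64, 64), dtype=np.uint8)
pred[2:6, 10:40, 10:40] = 1
gt = np.zeros_like(pred)
gt[3:6, 12:42, 12:42] = 1
print(f"DSC = {dice_coefficient(pred, gt):.3f}")
```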
+
+ comment: 20 pages, 7 figures (including 2 supplemental figure), 4 tables +
+
+
+
+
+ + ♻ ☆ RMT-BVQA: Recurrent Memory Transformer-based Blind Video Quality + Assessment for Enhanced Video Content ECCV 2024 + + +
+ With recent advances in deep learning, numerous algorithms have been +developed to enhance video quality, reduce visual artifacts, and improve +perceptual quality. However, little research has been reported on the quality +assessment of enhanced content - the evaluation of enhancement methods is often +based on quality metrics that were designed for compression applications. In +this paper, we propose a novel blind deep video quality assessment (VQA) method +specifically for enhanced video content. It employs a new Recurrent Memory +Transformer (RMT) based network architecture to obtain video quality +representations, which is optimized through a novel content-quality-aware +contrastive learning strategy based on a new database containing 13K training +patches with enhanced content. The extracted quality representations are then +combined through linear regression to generate video-level quality indices. The +proposed method, RMT-BVQA, has been evaluated on the VDPVE (VQA Dataset for +Perceptual Video Enhancement) database through a five-fold cross validation. +The results show its superior correlation performance when compared to ten +existing no-reference quality metrics. + +
+
+ comment: This paper has been accepted by the ECCV 2024 AIM Advances in Image + Manipulation workshop +
+
+
+
+
+ + ♻ ☆ Group-aware Parameter-efficient Updating for Content-Adaptive Neural + Video Compression ACM MM 2024 + + +
+ Content-adaptive compression is crucial for enhancing the adaptability of the +pre-trained neural codec for various contents. Although these methods have been +very practical in neural image compression (NIC), their application in neural +video compression (NVC) is still limited due to two main aspects: 1), video +compression relies heavily on temporal redundancy, therefore updating just one +or a few frames can lead to significant errors accumulating over time; 2), NVC +frameworks are generally more complex, with many large components that are not +easy to update quickly during encoding. To address the previously mentioned +challenges, we have developed a content-adaptive NVC technique called +Group-aware Parameter-Efficient Updating (GPU). Initially, to minimize error +accumulation, we adopt a group-aware approach for updating encoder parameters. +This involves adopting a patch-based Group of Pictures (GoP) training strategy +to segment a video into patch-based GoPs, which will be updated to facilitate a +globally optimized domain-transferable solution. Subsequently, we introduce a +parameter-efficient delta-tuning strategy, which is achieved by integrating +several light-weight adapters into each coding component of the encoding +process by both serial and parallel configuration. Such architecture-agnostic +modules stimulate the components with large parameters, thereby reducing both +the update cost and the encoding time. We incorporate our GPU into the latest +NVC framework and conduct comprehensive experiments, whose results showcase +outstanding video compression efficiency across four video benchmarks and +adaptability of one medical image benchmark. + +
+
+ comment: Accepted by ACM MM 2024, Melbourne, Australia +
+
+
+
+
+ + ♻ ☆ CrossDF: Improving Cross-Domain Deepfake Detection with Deep Information + Decomposition + + +
+ Deepfake technology poses a significant threat to security and social trust. +Although existing detection methods have shown high performance in identifying +forgeries within datasets that use the same deepfake techniques for both +training and testing, they suffer from sharp performance degradation when faced +with cross-dataset scenarios where unseen deepfake techniques are tested. To +address this challenge, we propose a Deep Information Decomposition (DID) +framework to enhance the performance of Cross-dataset Deepfake Detection +(CrossDF). Unlike most existing deepfake detection methods, our framework +prioritizes high-level semantic features over specific visual artifacts. +Specifically, it adaptively decomposes facial features into deepfake-related +and irrelevant information, only using the intrinsic deepfake-related +information for real/fake discrimination. Moreover, it optimizes these two +kinds of information to be independent with a de-correlation learning module, +thereby enhancing the model's robustness against various irrelevant information +changes and generalization ability to unseen forgery methods. Our extensive +experimental evaluation and comparison with existing state-of-the-art detection +methods validate the effectiveness and superiority of the DID framework on +cross-dataset deepfake detection. + +
+
+
+
+
+ + ♻ ☆ Towards Extreme Image Compression with Latent Feature Guidance and + Diffusion Prior + + +
+ Image compression at extremely low bitrates (below 0.1 bits per pixel (bpp)) +is a significant challenge due to substantial information loss. In this work, +we propose a novel two-stage extreme image compression framework that exploits +the powerful generative capability of pre-trained diffusion models to achieve +realistic image reconstruction at extremely low bitrates. In the first stage, +we treat the latent representation of images in the diffusion space as +guidance, employing a VAE-based compression approach to compress images and +initially decode the compressed information into content variables. The second +stage leverages pre-trained stable diffusion to reconstruct images under the +guidance of content variables. Specifically, we introduce a small control +module to inject content information while keeping the stable diffusion model +fixed to maintain its generative capability. Furthermore, we design a space +alignment loss to force the content variables to align with the diffusion space +and provide the necessary constraints for optimization. Extensive experiments +demonstrate that our method significantly outperforms state-of-the-art +approaches in terms of visual performance at extremely low bitrates. The source +code and trained models are available at https://github.com/huai-chang/DiffEIC. + +
+
+ comment: Accepted by IEEE TCSVT +
+
+
+
+
+ + ♻ ☆ Robust Semi-supervised Multimodal Medical Image Segmentation via Cross + Modality Collaboration + + +
+ Multimodal learning leverages complementary information derived from +different modalities, thereby enhancing performance in medical image +segmentation. However, prevailing multimodal learning methods heavily rely on +extensive well-annotated data from various modalities to achieve accurate +segmentation performance. This dependence often poses a challenge in clinical +settings due to limited availability of such data. Moreover, the inherent +anatomical misalignment between different imaging modalities further +complicates the endeavor to enhance segmentation performance. To address this +problem, we propose a novel semi-supervised multimodal segmentation framework +that is robust to scarce labeled data and misaligned modalities. Our framework +employs a novel cross modality collaboration strategy to distill +modality-independent knowledge, which is inherently associated with each +modality, and integrates this information into a unified fusion layer for +feature amalgamation. With a channel-wise semantic consistency loss, our +framework ensures alignment of modality-independent information from a +feature-wise perspective across modalities, thereby fortifying it against +misalignments in multimodal scenarios. Furthermore, our framework effectively +integrates contrastive consistent learning to regulate anatomical structures, +facilitating anatomical-wise prediction alignment on unlabeled data in +semi-supervised segmentation tasks. Our method achieves competitive performance +compared to other multimodal methods across three tasks: cardiac, abdominal +multi-organ, and thyroid-associated orbitopathy segmentations. It also +demonstrates outstanding robustness in scenarios involving scarce labeled data +and misaligned modalities. + +
+
+
+
+
+ + ♻ ☆ Weakly Supervised Intracranial Hemorrhage Segmentation with YOLO and an + Uncertainty Rectified Segment Anything Model + + +
+ Intracranial hemorrhage (ICH) is a life-threatening condition that requires +rapid and accurate diagnosis to improve treatment outcomes and patient survival +rates. Recent advancements in supervised deep learning have greatly improved +the analysis of medical images, but often rely on extensive datasets with +high-quality annotations, which are costly, time-consuming, and require medical +expertise to prepare. To mitigate the need for large amounts of expert-prepared +segmentation data, we have developed a novel weakly supervised ICH segmentation +method that utilizes the YOLO object detection model and an +uncertainty-rectified Segment Anything Model (SAM). In addition, we have +proposed a novel point prompt generator for this model to further improve +segmentation results with YOLO-predicted bounding box prompts. Our approach +achieved a high accuracy of 0.933 and an AUC of 0.796 in ICH detection, along +with a mean Dice score of 0.629 for ICH segmentation, outperforming existing +weakly supervised and popular supervised (UNet and Swin-UNETR) approaches. +Overall, the proposed method provides a robust and accurate alternative to the +more commonly used supervised techniques for ICH quantification without +requiring refined segmentation ground truths during model training. + +
+
+ comment: Manuscript was accepted at SWITCH2024. 10 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Learned Image Transmission with Hierarchical Variational Autoencoder + + +
+ In this paper, we introduce an innovative hierarchical joint source-channel +coding (HJSCC) framework for image transmission, utilizing a hierarchical +variational autoencoder (VAE). Our approach leverages a combination of +bottom-up and top-down paths at the transmitter to autoregressively generate +multiple hierarchical representations of the original image. These +representations are then directly mapped to channel symbols for transmission by +the JSCC encoder. We extend this framework to scenarios with a feedback link, +modeling transmission over a noisy channel as a probabilistic sampling process +and deriving a novel generative formulation for JSCC with feedback. Compared +with existing approaches, our proposed HJSCC provides enhanced adaptability by +dynamically adjusting transmission bandwidth, encoding these representations +into varying amounts of channel symbols. Additionally, we introduce a rate +attention module to guide the JSCC encoder in optimizing its encoding strategy +based on prior information. Extensive experiments on images of varying +resolutions demonstrate that our proposed model outperforms existing baselines +in rate-distortion performance and maintains robustness against channel noise. + +
+
+
+
+
+ + ♻ ☆ A Novel Approach to Classify Power Quality Signals Using Vision + Transformers + + +
+ With the rapid integration of electronically interfaced renewable energy +resources and loads into smart grids, there is increasing interest in power +quality disturbances (PQD) classification to enhance the security and +efficiency of these grids. This paper introduces a new approach to PQD +classification based on the Vision Transformer (ViT) model. When a PQD occurs, +the proposed approach first converts the power quality signal into an image and +then utilizes a pre-trained ViT to accurately determine the class of the PQD. +Unlike most previous works, which were limited to a few disturbance classes or +small datasets, the proposed method is trained and tested on a large dataset +with 17 disturbance classes. Our experimental results show that the proposed +ViT-based approach achieves PQD classification precision and recall of 98.28% +and 97.98%, respectively, outperforming recently proposed techniques applied to +the same dataset. + +
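The abstract does not describe the exact signal-to-image conversion; one simple possibility, sketched below under that assumption, is to resample and min-max normalize the waveform into a square grayscale matrix that a pre-trained ViT could then classify. The sampling rate and the synthetic voltage-sag signal are made up for the demo.

```python
import numpy as np


def signal_to_image(signal, size=224):
    """Convert a 1-D power-quality waveform into a square grayscale image.

    The signal is resampled to size*size points, min-max normalised to [0, 255]
    and reshaped row by row; the resulting matrix can be fed to an image model
    such as a pre-trained Vision Transformer.
    """
    x_old = np.linspace(0.0, 1.0, num=len(signal))
    x_new = np.linspace(0.0, 1.0, num=size * size)
    resampled = np.interp(x_new, x_old, signal)
    norm = (resampled - resampled.min()) / (np.ptp(resampled) + 1e-12)
    return (norm * 255).astype(np.uint8).reshape(size, size)


t = np.linspace(0, 0.2, 3200)                       # 10 cycles at 50 Hz (assumed)
sag = np.sin(2 * np.pi * 50 * t) * np.where((t > 0.06) & (t < 0.12), 0.5, 1.0)
img = signal_to_image(sag)
print(img.shape, img.dtype)                          # (224, 224) uint8
```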
+
+ comment: IECON 2024-50th Annual Conference of the IEEE Industrial Electronics + Society, Chicago, U.S.A, 2024, pp. 1-6 +
+
+
+
+
+ + ♻ ☆ Hand1000: Generating Realistic Hands from Text with Only 1,000 Images + + +
+ Text-to-image generation models have achieved remarkable advancements in +recent years, aiming to produce realistic images from textual descriptions. +However, these models often struggle with generating anatomically accurate +representations of human hands. The resulting images frequently exhibit issues +such as incorrect numbers of fingers, unnatural twisting or interlacing of +fingers, or blurred and indistinct hands. These issues stem from the inherent +complexity of hand structures and the difficulty in aligning textual +descriptions with precise visual depictions of hands. To address these +challenges, we propose a novel approach named Hand1000 that enables the +generation of realistic hand images with target gesture using only 1,000 +training samples. The training of Hand1000 is divided into three stages with +the first stage aiming to enhance the model's understanding of hand anatomy by +using a pre-trained hand gesture recognition model to extract gesture +representation. The second stage further optimizes text embedding by +incorporating the extracted hand gesture representation, to improve alignment +between the textual descriptions and the generated hand images. The third stage +utilizes the optimized embedding to fine-tune the Stable Diffusion model to +generate realistic hand images. In addition, we construct the first publicly +available dataset specifically designed for text-to-hand image generation. +Based on the existing hand gesture recognition dataset, we adopt advanced image +captioning models and LLaMA3 to generate high-quality textual descriptions +enriched with detailed gesture information. Extensive experiments demonstrate +that Hand1000 significantly outperforms existing models in producing +anatomically correct hand images while faithfully representing other details in +the text, such as faces, clothing, and colors. + +
+
+ comment: Project page https://haozhuo-zhang.github.io/Hand1000-project-page/ +
+
+
+
+
+ + ♻ ☆ MV-VTON: Multi-View Virtual Try-On with Diffusion Models + + +
+ The goal of image-based virtual try-on is to generate an image of the target +person naturally wearing the given clothing. However, existing methods solely +focus on the frontal try-on using the frontal clothing. When the views of the +clothing and person are significantly inconsistent, particularly when the +person's view is non-frontal, the results are unsatisfactory. To address this +challenge, we introduce Multi-View Virtual Try-ON (MV-VTON), which aims to +reconstruct the dressing results from multiple views using the given clothes. +Given that single-view clothes provide insufficient information for MV-VTON, we +instead employ two images, i.e., the frontal and back views of the clothing, to +encompass the complete view as much as possible. Moreover, we adopt diffusion +models that have demonstrated superior abilities to perform our MV-VTON. In +particular, we propose a view-adaptive selection method where hard-selection +and soft-selection are applied to the global and local clothing feature +extraction, respectively. This ensures that the clothing features are roughly +fit to the person's view. Subsequently, we suggest joint attention blocks to +align and fuse clothing features with person features. Additionally, we collect +a MV-VTON dataset MVG, in which each person has multiple photos with diverse +views and poses. Experiments show that the proposed method not only achieves +state-of-the-art results on MV-VTON task using our MVG dataset, but also has +superiority on frontal-view virtual try-on task using VITON-HD and DressCode +datasets. Codes and datasets are publicly released at +https://github.com/hywang2002/MV-VTON . + +
+
+ comment: Project url: https://hywang2002.github.io/MV-VTON/ +
+
+
+
+
+ + ♻ ☆ Diffusion-Driven Data Replay: A Novel Approach to Combat Forgetting in + Federated Class Continual Learning ECCV 2024 + + +
+ Federated Class Continual Learning (FCCL) merges the challenges of +distributed client learning with the need for seamless adaptation to new +classes without forgetting old ones. The key challenge in FCCL is catastrophic +forgetting, an issue that has been explored to some extent in Continual +Learning (CL). However, due to privacy preservation requirements, some +conventional methods, such as experience replay, are not directly applicable to +FCCL. Existing FCCL methods mitigate forgetting by generating historical data +through federated training of GANs or data-free knowledge distillation. +However, these approaches often suffer from unstable training of generators or +low-quality generated data, limiting their guidance for the model. To address +this challenge, we propose a novel method of data replay based on diffusion +models. Instead of training a diffusion model, we employ a pre-trained +conditional diffusion model to reverse-engineer each class, searching the +corresponding input conditions for each class within the model's input space, +significantly reducing computational resources and time consumption while +ensuring effective generation. Furthermore, we enhance the classifier's domain +generalization ability on generated and real data through contrastive learning, +indirectly improving the representational capability of generated data for real +data. Comprehensive experiments demonstrate that our method significantly +outperforms existing baselines. Code is available at +https://github.com/jinglin-liang/DDDR. + +
+
+ comment: Accepted by ECCV 2024 Oral +
+
+
+
+
+ + ♻ ☆ ORMNet: Object-centric Relationship Modeling for Egocentric Hand-object + Segmentation + + +
+ Egocentric hand-object segmentation (EgoHOS) is a promising new task aiming +at segmenting hands and interacting objects in egocentric images. Although +EgoHOS has the potential to enable various applications, current methods +struggle to achieve both high performance and end-to-end optimization +simultaneously. Moreover, existing approaches fail to fully leverage hand cues +to assist the interacting-object segmentation and overlook the coupled +relationships between diverse interacting-object categories, resulting in +performance deficiencies. To address these limitations, this paper proposes a +novel Object-centric Relationship Modeling Network (ORMNet) to fulfill +end-to-end and effective EgoHOS by modeling relationships between hands and +objects as well as objects and objects. Specifically, a Hand-Object Relation +(HOR) module is introduced to capture the correlation between hands and +objects, which uses hand features to guide the network to extract more +distinguishing interacting-object features. Besides, we find the coupling +relations between diverse interacting-object categories and design the Object +Relation Decoupling (ORD) strategy to disentangle them, emphasizing learning of +the interaction between hands and objects and reducing the confusion of +interacting-object classification. In-domain experiments show that ORMNet has +notably exceptional segmentation performance compared with state-of-the-art +methods, while out-of-domain experiments further exhibit its robust +generalization capability. The project is available at +https://github.com/yuggiehk/ORMNet/ + +
+
+
+
+
+ + ♻ ☆ MADE-for-ASD: A Multi-Atlas Deep Ensemble Network for Diagnosing Autism + Spectrum Disorder + + +
+ In response to the global need for efficient early diagnosis of Autism +Spectrum Disorder (ASD), this paper bridges the gap between traditional, +time-consuming diagnostic methods and potential automated solutions. We propose +a multi-atlas deep ensemble network, MADE-for-ASD, that integrates multiple +atlases of the brain's functional magnetic resonance imaging (fMRI) data +through a weighted deep ensemble network. Our approach integrates demographic +information into the prediction workflow, which enhances ASD diagnosis +performance and offers a more holistic perspective on patient profiling. We +experiment with the well-known publicly available ABIDE (Autism Brain Imaging +Data Exchange) I dataset, consisting of resting state fMRI data from 17 +different laboratories around the globe. Our proposed system achieves 75.20% +accuracy on the entire dataset and 96.40% on a specific subset $-$ both +surpassing reported ASD diagnosis accuracy in ABIDE I fMRI studies. +Specifically, our model improves by 4.4 percentage points over prior works on +the same amount of data. The model exhibits a sensitivity of 82.90% and a +specificity of 69.70% on the entire dataset, and 91.00% and 99.50%, +respectively, on the specific subset. We leverage the F-score to pinpoint the +top 10 ROI in ASD diagnosis, such as precuneus and anterior +cingulate/ventromedial. The proposed system can potentially pave the way for +more cost-effective, efficient and scalable strategies in ASD diagnosis. Codes +and evaluations are publicly available at +https://github.com/hasan-rakibul/MADE-for-ASD. + +
+
+ comment: Xuehan Liu and Md Rakibul Hasan contributed equally to this work +
+
+
+
+
+ + ♻ ☆ MCDubber: Multimodal Context-Aware Expressive Video Dubbing SC2024 + + +
+ Automatic Video Dubbing (AVD) aims to take the given script and generate +speech that aligns with lip motion and prosody expressiveness. Current AVD +models mainly utilize visual information of the current sentence to enhance the +prosody of synthesized speech. However, it is crucial to consider whether the +prosody of the generated dubbing aligns with the multimodal context, as the +dubbing will be combined with the original context in the final video. This +aspect has been overlooked in previous studies. To address this issue, we +propose a Multimodal Context-aware video Dubbing model, termed +\textbf{MCDubber}, to convert the modeling object from a single sentence to a +longer sequence with context information to ensure the consistency of the +global context prosody. MCDubber comprises three main components: (1) A context +duration aligner aims to learn the context-aware alignment between the text and +lip frames; (2) A context prosody predictor seeks to read the global context +visual sequence and predict the context-aware global energy and pitch; (3) A +context acoustic decoder ultimately predicts the global context mel-spectrogram +with the assistance of adjacent ground-truth mel-spectrograms of the target +sentence. Through this process, MCDubber fully considers the influence of +multimodal context on the prosody expressiveness of the current sentence when +dubbing. The extracted mel-spectrogram belonging to the target sentence from +the output context mel-spectrograms is the final required dubbing audio. +Extensive experiments on the Chem benchmark dataset demonstrate that our +MCDubber significantly improves dubbing expressiveness compared to all advanced +baselines. The code and demos are available at +https://github.com/XiaoYuanJun-zy/MCDubber. + +
+
+ comment: Accepted by NCMMSC2024 +
+
+
+
+
+ + ♻ ☆ Asynchronous Blob Tracker for Event Cameras + + +
+ Event-based cameras are popular for tracking fast-moving objects due to their high temporal resolution, low latency, and high dynamic range. In this paper, we propose a novel algorithm for tracking event blobs using raw events asynchronously in real time. We introduce the concept of an event blob as a spatio-temporal likelihood of event occurrence where the conditional spatial likelihood is blob-like. Many real-world objects, such as car headlights or any quickly moving foreground objects, generate event blob data. The proposed algorithm uses a nearest neighbour classifier with a dynamic threshold criterion for data association, coupled with an extended Kalman filter to track the event blob state. Our algorithm achieves highly accurate blob tracking, velocity estimation, and shape estimation even under challenging lighting conditions and high-speed motions (> 11000 pixels/s). The microsecond time resolution achieved means that the filter output can be used to derive secondary information such as time-to-contact or range estimates, enabling applications to real-world problems such as collision avoidance in autonomous driving.
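The tracker described above combines nearest-neighbour data association with an extended Kalman filter over a richer blob state; as a simplified, hedged illustration of only the filtering part, the sketch below runs a plain linear Kalman filter with a constant-velocity model on 2-D blob positions. All noise parameters and the simulated trajectory are arbitrary.

```python
import numpy as np


class ConstantVelocityKF:
    """Minimal Kalman filter tracking blob position and velocity in the image plane."""

    def __init__(self, q=1e-2, r=1.0):
        self.x = np.zeros(4)                 # state: [u, v, du, dv]
        self.P = np.eye(4) * 10.0            # state covariance
        self.Q = np.eye(4) * q               # process noise
        self.R = np.eye(2) * r               # measurement noise
        self.H = np.array([[1, 0, 0, 0],
                           [0, 1, 0, 0]], dtype=float)

    def step(self, z, dt):
        F = np.eye(4)
        F[0, 2] = F[1, 3] = dt                       # constant-velocity transition
        self.x = F @ self.x                          # predict
        self.P = F @ self.P @ F.T + self.Q
        y = np.asarray(z, dtype=float) - self.H @ self.x   # innovation
        S = self.H @ self.P @ self.H.T + self.R
        K = self.P @ self.H.T @ np.linalg.inv(S)     # Kalman gain
        self.x = self.x + K @ y                      # update
        self.P = (np.eye(4) - K @ self.H) @ self.P
        return self.x


kf = ConstantVelocityKF()
for k in range(6):                                   # simulated blob moving at 11000 px/s
    u = 100.0 + 11000.0 * (1e-3 * k)
    state = kf.step([u, 50.0], dt=1e-3)
print("estimated [u, v, du, dv]:", np.round(state, 1))
```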
+
+ comment: 18 pages, 16 figures. The manuscript was accepted on August 7, 2024, + by IEEE Transactions on Robotics +
+
+
+
+
+ + ♻ ☆ From Lab to Field: Real-World Evaluation of an AI-Driven Smart Video + Solution to Enhance Community Safety + + +
+ This article adopts and evaluates an AI-enabled Smart Video Solution (SVS) +designed to enhance safety in the real world. The system integrates with +existing infrastructure camera networks, leveraging recent advancements in AI +for easy adoption. Prioritizing privacy and ethical standards, pose based data +is used for downstream AI tasks such as anomaly detection. Cloud-based +infrastructure and mobile app are deployed, enabling real-time alerts within +communities. The SVS employs innovative data representation and visualization +techniques, such as the Occupancy Indicator, Statistical Anomaly Detection, +Bird's Eye View, and Heatmaps, to understand pedestrian behaviors and enhance +public safety. Evaluation of the SVS demonstrates its capacity to convert +complex computer vision outputs into actionable insights for stakeholders, +community partners, law enforcement, urban planners, and social scientists. +This article presents a comprehensive real-world deployment and evaluation of +the SVS, implemented in a community college environment across 16 cameras. The +system integrates AI-driven visual processing, supported by statistical +analysis, database management, cloud communication, and user notifications. +Additionally, the article evaluates the end-to-end latency from the moment an +AI algorithm detects anomalous behavior in real-time at the camera level to the +time stakeholders receive a notification. The results demonstrate the system's +robustness, effectively managing 16 CCTV cameras with a consistent throughput +of 16.5 frames per second (FPS) over a 21-hour period and an average end-to-end +latency of 26.76 seconds between anomaly detection and alert issuance. + +
+
+
+
+
+ + ♻ ☆ Depth-guided NeRF Training via Earth Mover's Distance ECCV 2024 + + +
+ Neural Radiance Fields (NeRFs) are trained to minimize the rendering loss of +predicted viewpoints. However, the photometric loss often does not provide +enough information to disambiguate between different possible geometries +yielding the same image. Previous work has thus incorporated depth supervision +during NeRF training, leveraging dense predictions from pre-trained depth +networks as pseudo-ground truth. While these depth priors are assumed to be +perfect once filtered for noise, in practice, their accuracy is more +challenging to capture. This work proposes a novel approach to uncertainty in +depth priors for NeRF supervision. Instead of using custom-trained depth or +uncertainty priors, we use off-the-shelf pretrained diffusion models to predict +depth and capture uncertainty during the denoising process. Because we know +that depth priors are prone to errors, we propose to supervise the ray +termination distance distribution with Earth Mover's Distance instead of +enforcing the rendered depth to replicate the depth prior exactly through +L2-loss. Our depth-guided NeRF outperforms all baselines on standard depth +metrics by a large margin while maintaining performance on photometric +measures. + +
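On a 1-D support such as the samples along a ray, the Earth Mover's Distance between the rendered termination distribution and a distribution placed around the depth prior reduces to the integral of the absolute difference of their CDFs. The small sketch below uses made-up Gaussian weights; it is an illustration of the distance, not the authors' training code.

```python
import numpy as np


def emd_1d(weights_a, weights_b, t_vals):
    """Earth Mover's Distance between two discrete distributions on the same ray.

    weights_a, weights_b: non-negative weights over samples (normalised inside).
    t_vals: sorted sample depths along the ray.
    On a 1-D support, EMD equals the integral of |CDF_a - CDF_b|.
    """
    a = weights_a / weights_a.sum()
    b = weights_b / weights_b.sum()
    cdf_diff = np.abs(np.cumsum(a) - np.cumsum(b))
    spacing = np.diff(t_vals, append=t_vals[-1])     # width of each bin
    return float((cdf_diff * spacing).sum())


t = np.linspace(2.0, 6.0, 128)                       # depths sampled along one ray
render_w = np.exp(-0.5 * ((t - 4.2) / 0.30) ** 2)    # rendered termination weights
prior_w = np.exp(-0.5 * ((t - 4.0) / 0.15) ** 2)     # distribution around the depth prior
print(f"EMD = {emd_1d(render_w, prior_w, t):.4f}")
```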
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ♻ ☆ VANP: Learning Where to See for Navigation with Self-Supervised + Vision-Action Pre-Training IROS 2024 + + +
+ Humans excel at efficiently navigating through crowds without collision by +focusing on specific visual regions relevant to navigation. However, most +robotic visual navigation methods rely on deep learning models pre-trained on +vision tasks, which prioritize salient objects -- not necessarily relevant to +navigation and potentially misleading. Alternative approaches train specialized +navigation models from scratch, requiring significant computation. On the other +hand, self-supervised learning has revolutionized computer vision and natural +language processing, but its application to robotic navigation remains +underexplored due to the difficulty of defining effective self-supervision +signals. Motivated by these observations, in this work, we propose a +Self-Supervised Vision-Action Model for Visual Navigation Pre-Training (VANP). +Instead of detecting salient objects that are beneficial for tasks such as +classification or detection, VANP learns to focus only on specific visual +regions that are relevant to the navigation task. To achieve this, VANP uses a +history of visual observations, future actions, and a goal image for +self-supervision, and embeds them using two small Transformer Encoders. Then, +VANP maximizes the information between the embeddings by using a mutual +information maximization objective function. We demonstrate that most +VANP-extracted features match with human navigation intuition. VANP achieves +comparable performance as models learned end-to-end with half the training time +and models trained on a large-scale, fully supervised dataset, i.e., ImageNet, +with only 0.08% data. + +
+
+ comment: Extended version of the paper accepted at IROS 2024. Code: + https://github.com/mhnazeri/VANP +
+
+
+
+
+ + ♻ ☆ Contrastive Learning with Consistent Representations + + +
+ Contrastive learning demonstrates great promise for representation learning. +Data augmentations play a critical role in contrastive learning by providing +informative views of the data without necessitating explicit labels. +Nonetheless, the efficacy of current methodologies heavily hinges on the +quality of employed data augmentation (DA) functions, often chosen manually +from a limited set of options. While exploiting diverse data augmentations is +appealing, the complexities inherent in both DAs and representation learning +can lead to performance deterioration. Addressing this challenge and +facilitating the systematic incorporation of diverse data augmentations, this +paper proposes Contrastive Learning with Consistent Representations CoCor. At +the heart of CoCor is a novel consistency metric termed DA consistency. This +metric governs the mapping of augmented input data to the representation space, +ensuring that these instances are positioned optimally in a manner consistent +with the applied intensity of the DA. Moreover, we propose to learn the optimal +mapping locations as a function of DA, all while preserving a desired monotonic +property relative to DA intensity. Experimental results demonstrate that CoCor +notably enhances the generalizability and transferability of learned +representations in comparison to baseline methods. + +
+
+ comment: Accepted by TMLR +
+
+
+
+
+ + ♻ ☆ ConGeo: Robust Cross-view Geo-localization across Ground View Variations ECCV2024 + + +
+ Cross-view geo-localization aims at localizing a ground-level query image by +matching it to its corresponding geo-referenced aerial view. In real-world +scenarios, the task requires accommodating diverse ground images captured by +users with varying orientations and reduced field of views (FoVs). However, +existing learning pipelines are orientation-specific or FoV-specific, demanding +separate model training for different ground view variations. Such models +heavily depend on the North-aligned spatial correspondence and predefined FoVs +in the training data, compromising their robustness across different settings. +To tackle this challenge, we propose ConGeo, a single- and cross-view +Contrastive method for Geo-localization: it enhances robustness and consistency +in feature representations to improve a model's invariance to orientation and +its resilience to FoV variations, by enforcing proximity between ground view +variations of the same location. As a generic learning objective for cross-view +geo-localization, when integrated into state-of-the-art pipelines, ConGeo +significantly boosts the performance of three base models on four +geo-localization benchmarks for diverse ground view variations and outperforms +competing methods that train separate models for each ground view variation. + +
+
+ comment: ECCV2024. Project page at https://eceo-epfl.github.io/ConGeo/ +
+
+
+
+
+
+
+
+ + Information Retrieval 21 + +
+
+
+ + ☆ Bioinformatics Retrieval Augmentation Data (BRAD) Digital Assistant + + +
+ We present a prototype for a Bioinformatics Retrieval Augmentation Data +(BRAD) digital assistant. BRAD integrates a suite of tools to handle a wide +range of bioinformatics tasks, from code execution to online search. We +demonstrate BRAD's capabilities through (1) improved question answering +with retrieval augmented generation (RAG), (2) BRAD's ability to run and write +complex software pipelines, and (3) BRAD's ability to organize and distribute +tasks across individual agents and teams of agents. We use BRAD to automate +bioinformatics workflows, performing tasks ranging from gene enrichment and +archive search to automatic code generation and running biomarker +identification pipelines. BRAD is a step toward the ultimate goal of developing +a digital twin of laboratories driven by self-contained loops for hypothesis +generation and testing of digital biology experiments. + +
+
+
+
+
+ + ☆ Building a Scalable, Effective, and Steerable Search and Ranking + Platform + + +
+ Modern e-commerce platforms offer vast product selections, making it +difficult for customers to find items that they like and that are relevant to +their current session intent. This is why it is key for e-commerce platforms to +have near real-time, scalable, and adaptable personalized ranking and search +systems. While numerous methods exist in the scientific literature for building +such systems, many are unsuitable for large-scale industrial use due to +complexity and performance limitations. Consequently, industrial ranking +systems often resort to computationally efficient yet simplistic retrieval or +candidate generation approaches, which overlook near real-time and +heterogeneous customer signals and thus result in a less personalized and +relevant experience. Moreover, related customer experiences are served by +completely different systems, which increases complexity and maintenance effort +and leads to inconsistent experiences. + In this paper, we present a personalized, adaptable near real-time ranking +platform that is reusable across various use cases, such as browsing and +search, and that is able to cater to millions of items and customers under +heavy load (thousands of requests per second). We employ transformer-based +models across different ranking layers that can learn complex behavior +patterns directly from customer action sequences while being able to +incorporate temporal (e.g. in-session) and contextual information. We validate +our system through a series of comprehensive offline and online real-world +experiments at a large online e-commerce platform, and we demonstrate its +superiority when compared to existing systems, in terms of both customer +experience and net revenue. Finally, we share the lessons learned +from building a comprehensive, modern ranking platform for use in a large-scale +e-commerce environment. + +
+
+
+
+
+ + ☆ Pooling And Attention: What Are Effective Designs For LLM-Based + Embedding Models? + + +
+ The significant advancements of Large Language Models (LLMs) in generative +tasks have led to a growing body of work exploring LLM-based embedding models. +While these models, employing different pooling and attention strategies, have +achieved state-of-the-art performance on public embedding benchmarks, questions +still arise about what constitutes an effective design for LLM-based embedding +models. However, these models are often trained on different datasets, using +different LLM base models or training settings. Moreover, evaluations on public +embedding benchmarks often fail to report statistical significance, making it +difficult to determine which designs truly contribute to final performance. +This complicates the process for practitioners seeking optimal training recipes +for LLM-based embedding models. In this study, we conduct a large-scale +experiment by training a series of LLM-based embedding models using the same +training data and base model but differing in their pooling and attention +strategies. The results show that there is no one-size-fits-all solution: while +bidirectional attention and an additional trainable pooling layer outperform in +text similarity and information retrieval tasks, they do not significantly +surpass simpler designs like EOS-last token pooling and default causal +attention in clustering and classification tasks. Furthermore, we propose a new +pooling strategy, Multi-Layers Trainable Pooling, which transforms the outputs +of all hidden layers, rather than just the last layer, using a cross-attention +network. This method proves to be statistically superior in text similarity and +retrieval tasks compared to existing pooling methods. Overall, this paper sheds +light on effective training strategies for LLM-based embedding models. + +
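For reference, the two simpler baselines named in the abstract can be written as follows; this is a generic sketch over last-layer hidden states, assuming right padding, and is not tied to the paper's exact implementation:

import torch

def eos_last_token_pool(hidden, attention_mask):
    # hidden: (B, T, D) last-layer states; pick the final non-padded token
    last = attention_mask.sum(dim=1).long() - 1
    return hidden[torch.arange(hidden.size(0)), last]

def mean_pool(hidden, attention_mask):
    # attention-mask-weighted mean over tokens
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-6)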
+
+ comment: https://github.com/yixuantt/PoolingAndAttn +
+
+
+
+
+ + ☆ RouterRetriever: Exploring the Benefits of Routing over Multiple Expert + Embedding Models + + +
+ Information retrieval methods often rely on a single embedding model trained +on large, general-domain datasets like MSMARCO. While this approach can produce +a retriever with reasonable overall performance, models trained on +domain-specific data often yield better results within their respective +domains. While prior work in information retrieval has tackled this through +multi-task training, the topic of combining multiple domain-specific expert +retrievers remains unexplored, despite its popularity in language model +generation. In this work, we introduce RouterRetriever, a retrieval model that +leverages multiple domain-specific experts along with a routing mechanism to +select the most appropriate expert for each query. It is lightweight and allows +easy addition or removal of experts without additional training. Evaluation on +the BEIR benchmark demonstrates that RouterRetriever outperforms both +MSMARCO-trained (+2.1 absolute nDCG@10) and multi-task trained (+3.2) models. +This is achieved by employing our routing mechanism, which surpasses other +routing techniques (+1.8 on average) commonly used in language modeling. +Furthermore, the benefit generalizes well to other datasets, even in the +absence of a specific expert on the dataset. To our knowledge, RouterRetriever +is the first work to demonstrate the advantages of using multiple +domain-specific expert embedding models with effective routing over a single, +general-purpose embedding model in retrieval tasks. + +
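A hedged sketch of the routing idea: score the query against a small set of representative ("pilot") embeddings per expert and dispatch to the best-scoring expert. The pilot-embedding construction and the max-similarity rule below are assumptions for illustration, not necessarily the paper's exact mechanism:

import numpy as np

def route_query(query_emb, expert_pilots):
    # expert_pilots: dict mapping expert name -> (n_pilots, d) array of
    # embeddings that represent the domain the expert was trained on.
    q = query_emb / (np.linalg.norm(query_emb) + 1e-9)

    def score(name):
        pilots = expert_pilots[name]
        pilots = pilots / (np.linalg.norm(pilots, axis=1, keepdims=True) + 1e-9)
        return float((pilots @ q).max())   # best cosine similarity to any pilot

    return max(expert_pilots, key=score)   # name of the expert to embed with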
+
+
+
+
+ + ☆ A Fashion Item Recommendation Model in Hyperbolic Space CVPR 2024 + + +
+ In this work, we propose a fashion item recommendation model that +incorporates hyperbolic geometry into user and item representations. Using +hyperbolic space, our model aims to capture implicit hierarchies among items +based on their visual data and users' purchase history. During training, we +apply a multi-task learning framework that considers both hyperbolic and +Euclidean distances in the loss function. Our experiments on three data sets +show that our model performs better than previous models trained in Euclidean +space only, confirming the effectiveness of our model. Our ablation studies +show that multi-task learning plays a key role, and removing the Euclidean loss +substantially deteriorates the model performance. + +
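The geometric ingredient is the Poincare-ball distance; a minimal sketch of how it could be combined with a Euclidean term in a multi-task loss follows. The hinge formulation, margin, and weighting are illustrative assumptions, not the paper's exact loss:

import numpy as np

def poincare_distance(u, v, eps=1e-9):
    # Geodesic distance in the Poincare ball; assumes ||u||, ||v|| < 1.
    diff = np.sum((u - v) ** 2)
    denom = (1.0 - np.sum(u ** 2)) * (1.0 - np.sum(v ** 2))
    return np.arccosh(1.0 + 2.0 * diff / (denom + eps))

def multitask_ranking_loss(user, pos_item, neg_item, alpha=0.5, margin=0.1):
    # Weighted sum of hyperbolic and Euclidean hinge losses over a triplet.
    def hinge(d_pos, d_neg):
        return max(0.0, margin + d_pos - d_neg)
    hyp = hinge(poincare_distance(user, pos_item), poincare_distance(user, neg_item))
    euc = hinge(np.linalg.norm(user - pos_item), np.linalg.norm(user - neg_item))
    return alpha * hyp + (1.0 - alpha) * euc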
+
+ comment: This work was presented at the CVFAD Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ AlignGroup: Learning and Aligning Group Consensus with Member + Preferences for Group Recommendation CIKM 2024 + + +
+ Group activities are important behaviors in human society; providing +personalized recommendations for groups is referred to as the group +recommendation task. Existing methods can usually be categorized into two +strategies to infer group preferences: 1) determining group preferences by +aggregating members' personalized preferences, and 2) inferring group consensus +by capturing group members' coherent decisions after common compromises. +However, the former suffers from a lack of group-level considerations, +and the latter overlooks the fine-grained preferences of individual users. To +this end, we propose a novel group recommendation method, AlignGroup, which +focuses on both group consensus and individual preferences of group members to +infer group decision-making. Specifically, AlignGroup explores group +consensus through a well-designed hypergraph neural network that efficiently +learns intra- and inter-group relationships. Moreover, AlignGroup innovatively +utilizes a self-supervised alignment task to capture fine-grained group +decision-making by aligning the group consensus with members' common +preferences. Extensive experiments on two real-world datasets validate that our +AlignGroup outperforms the state-of-the-art on both the group recommendation +task and the user recommendation task, and is also more efficient than most +baselines. + +
+
+ comment: 10 pages, accepted by CIKM 2024 +
+
+
+
+
+ + ☆ iRangeGraph: Improvising Range-dedicated Graphs for Range-filtering + Nearest Neighbor Search SIGMOD 2025 + + +
+ Range-filtering approximate nearest neighbor (RFANN) search is attracting +increasing attention in academia and industry. Given a set of data objects, +each being a pair of a high-dimensional vector and a numeric value, an RFANN +query with a vector and a numeric range as parameters returns the data object +whose numeric value is in the query range and whose vector is nearest to the +query vector. To process this query, a recent study proposes to build $O(n^2)$ +dedicated graph-based indexes for all possible query ranges to enable efficient +processing on a database of $n$ objects. As storing all these indexes is +prohibitively expensive, the study constructs compressed indexes instead, which +reduces the memory consumption considerably. However, this incurs suboptimal +performance because the compression is lossy. In this study, instead of +materializing a compressed index for every possible query range in preparation +for querying, we materialize graph-based indexes, called elemental graphs, for +a moderate number of ranges. We then provide an effective and efficient +algorithm that during querying can construct an index for any query range using +the elemental graphs. We prove that the time needed to construct such an index +is low. We also cover an experimental study on real-world datasets that +provides evidence that the materialized elemental graphs only consume moderate +space and that the proposed method is capable of superior and stable query +performance across different query workloads. + +
+
+ comment: The paper has been accepted by SIGMOD 2025 +
+
+
+
+
+ + ☆ An Effective Tag Assignment Approach for Billboard Advertisement + + +
+ Billboard Advertisement has gained popularity due to its significant return +on investment. To make this advertisement approach more effective, +the relevant information about the product needs to reach the relevant +set of people. This can be achieved if the relevant set of tags can be mapped +to the correct slots. Formally, we call this problem the Tag Assignment Problem +in Billboard Advertisement. Given a trajectory database, a billboard database, and a set of +selected billboard slots and tags, this problem asks for a mapping of +selected tags to the selected slots so that the influence is maximized. We +model this as a variant of traditional bipartite matching called One-To-Many +Bipartite Matching (OMBM). In traditional bipartite matching, a tag can be +assigned to only one slot; in OMBM, a tag can be assigned to multiple slots, +while the reverse cannot happen. We propose an iterative solution approach +that incrementally allocates the tags to the slots. The proposed methodology +is explained with an illustrative example, and a complexity analysis of the +proposed solution approach is also provided. The experimental results on +real-world trajectory and billboard datasets demonstrate the +effectiveness and efficiency of the proposed solution. + +
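One way to picture such an iterative allocation is a greedy loop that repeatedly gives an unfilled slot the tag with the largest marginal influence gain; the marginal_gain oracle and the greedy rule are illustrative assumptions, not the paper's actual algorithm:

def greedy_tag_assignment(slots, tags, marginal_gain):
    # marginal_gain(tag, slot, assignment): incremental influence of showing
    # `tag` on `slot` given the partial assignment built so far (hypothetical oracle).
    assignment = {}
    unfilled = set(slots)
    while unfilled:
        tag, slot, gain = max(
            ((t, s, marginal_gain(t, s, assignment)) for s in unfilled for t in tags),
            key=lambda x: x[2])
        if gain <= 0:
            break
        assignment[slot] = tag      # each slot receives exactly one tag ...
        unfilled.discard(slot)      # ... while the same tag may be reused elsewhere
    return assignment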
+
+ comment: This Paper has been accepted at The 25th International Web + Information Systems Engineering Conference (WISE-2024) +
+
+
+
+
+ + ☆ Deep Adaptive Interest Network: Personalized Recommendation with + Context-Aware Learning + + +
+ In personalized recommendation systems, accurately capturing users' evolving +interests and combining them with contextual information is a critical research +area. This paper proposes a novel model called the Deep Adaptive Interest +Network (DAIN), which dynamically models users' interests while incorporating +context-aware learning mechanisms to achieve precise and adaptive personalized +recommendations. DAIN leverages deep learning techniques to build an adaptive +interest network structure that can capture users' interest changes in +real-time while further optimizing recommendation results by integrating +contextual information. Experiments conducted on several public datasets +demonstrate that DAIN excels in both recommendation performance and +computational efficiency. This research not only provides a new solution for +personalized recommendation systems but also offers fresh insights into the +application of context-aware learning in recommendation systems. + +
+
+
+
+
+ + ☆ NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for + Retrieval + + +
+ $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval) +from pre-trained embedding models is the predominant retrieval method for text +and images, as well as Retrieval-Augmented Generation (RAG) pipelines. In +practice, application developers often fine-tune the embeddings to improve +their accuracy on the dataset and query workload in hand. Existing approaches +either fine-tune the pre-trained model itself or, more efficiently, but at the +cost of accuracy, train adaptor models to transform the output of the +pre-trained model. We present NUDGE, a family of novel non-parametric embedding +fine-tuning approaches that are significantly more accurate and efficient than +both sets of existing approaches. NUDGE directly modifies the embeddings of +data records to maximize the accuracy of $k$-NN retrieval. We present a +thorough theoretical and experimental study of NUDGE's non-parametric approach. +We show that even though the underlying problem is NP-Hard, constrained +variations can be solved efficiently. These constraints additionally ensure +that the changes to the embeddings are modest, avoiding large distortions to +the semantics learned during pre-training. In experiments across five +pre-trained models and nine standard text and image retrieval datasets, NUDGE +runs in minutes and often improves NDCG@10 by more than 10% over existing +fine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase +in accuracy and runs 200x and 3x faster, respectively, over fine-tuning the +pre-trained model and training adaptors. + +
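A loose sketch of the non-parametric idea, under simplifying assumptions (dot-product retrieval, one relevant record per training query, a hard cap on how far any record embedding may move); the paper's constrained, provably solvable formulation is more involved:

import numpy as np

def nudge_data_embeddings(data_emb, query_emb, relevant_idx,
                          lr=0.05, steps=10, max_shift=0.1):
    # data_emb: (N, d) record embeddings; query_emb: (Q, d);
    # relevant_idx[q]: index of the ground-truth record for training query q.
    emb = data_emb.copy()
    for _ in range(steps):
        for q, r in enumerate(relevant_idx):
            emb[r] += lr * (query_emb[q] - emb[r])   # pull the record toward its query
    delta = emb - data_emb
    norms = np.linalg.norm(delta, axis=1, keepdims=True)
    delta *= np.minimum(1.0, max_shift / np.maximum(norms, 1e-9))  # keep changes modest
    return data_emb + delta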
+
+
+
+
+ + ☆ Do We Trust What They Say or What They Do? A Multimodal User Embedding + Provides Personalized Explanations + + +
+ With the rapid development of social media, the importance of analyzing +social network user data has also been put on the agenda. User representation +learning in social media is a critical area of research, based on which we can +conduct personalized content delivery, or detect malicious actors. Being more +complicated than many other types of data, social network user data has +inherent multimodal nature. Various multimodal approaches have been proposed to +harness both text (i.e. post content) and relation (i.e. inter-user +interaction) information to learn user embeddings of higher quality. The advent +of Graph Neural Network models enables more end-to-end integration of user text +embeddings and user interaction graphs in social networks. However, most of +those approaches do not adequately elucidate which aspects of the data - text +or graph structure information - are more helpful for predicting each specific +user under a particular task, putting some burden on personalized downstream +analysis and untrustworthy information filtering. We propose a simple yet +effective framework called Contribution-Aware Multimodal User Embedding (CAMUE) +for social networks. We have demonstrated with empirical evidence, that our +approach can provide personalized explainable predictions, automatically +mitigating the impact of unreliable information. We also conducted case studies +to show how reasonable our results are. We observe that for most users, graph +structure information is more trustworthy than text information, but there are +some reasonable cases where text helps more. Our work paves the way for more +explainable, reliable, and effective social media user embedding which allows +for better personalized content delivery. + +
+
+
+
+
+ + ♻ ☆ The Design of an LLM-powered Unstructured Analytics System + + +
+ LLMs demonstrate an uncanny ability to process unstructured data, and as +such, have the potential to go beyond search and run complex, semantic analyses +at scale. We describe the design of an unstructured analytics system, Aryn, and +the tenets and use cases that motivate its design. With Aryn, users can specify +queries in natural language and the system automatically determines a semantic +plan and executes it to compute an answer from a large collection of +unstructured documents using LLMs. At the core of Aryn is Sycamore, a +declarative document processing engine, built using Ray, that provides a +reliable distributed abstraction called DocSets. Sycamore allows users to +analyze, enrich, and transform complex documents at scale. Aryn also comprises +Luna, a query planner that translates natural language queries to Sycamore +scripts, and the Aryn Partitioner, which takes raw PDFs and document images, +and converts them to DocSets for downstream processing. Using Aryn, we +demonstrate a real world use case for analyzing accident reports from the +National Transportation Safety Board (NTSB), and discuss some of the major +challenges we encountered in deploying Aryn in the wild. + +
+
+ comment: 6 pages, 3 figures, fixed typos +
+
+
+
+
+ + ♻ ☆ Multimodal Recommender Systems: A Survey + + +
+ Recommender systems (RS) have become an integral part of online services. +They are equipped with various deep learning techniques to model user +preference based on identifier and attribute information. With the emergence of +multimedia services, such as short videos and news, understanding this +content while recommending becomes critical. Besides, multimodal features are +also helpful in alleviating the problem of data sparsity in RS. Thus, +Multimodal Recommender Systems (MRS) have attracted much attention from both +academia and industry recently. In this paper, we give a comprehensive +survey of MRS models, mainly from a technical point of view. First, we summarize the +general procedures and major challenges for MRS. Then, we introduce the +existing MRS models according to four categories, i.e., Modality Encoder, +Feature Interaction, Feature Enhancement and Model Optimization. Besides, to +make it convenient for those who want to research this field, we also summarize +the dataset and code resources. Finally, we discuss some promising future +directions of MRS and conclude this paper. To provide access to more details of the +surveyed papers, such as implementation code, we open-source a repository. + +
+
+ comment: accepted by CSUR +
+
+
+
+
+ + ♻ ☆ MARS: Matching Attribute-aware Representations for Text-based Sequential + Recommendation CIKM 2024 + + +
+ Sequential recommendation aims to predict the next item a user is likely to +prefer based on their sequential interaction history. Recently, text-based +sequential recommendation has emerged as a promising paradigm that uses +pre-trained language models to exploit textual item features to enhance +performance and facilitate knowledge transfer to unseen datasets. However, +existing text-based recommender models still struggle with two key challenges: +(i) representing users and items with multiple attributes, and (ii) matching +items with complex user interests. To address these challenges, we propose a +novel model, Matching Attribute-aware Representations for Text-based Sequential +Recommendation (MARS). MARS extracts detailed user and item representations +through attribute-aware text encoding, capturing diverse user intents with +multiple attribute-aware representations. It then computes user-item scores via +attribute-wise interaction matching, effectively capturing attribute-level user +preferences. Our extensive experiments demonstrate that MARS significantly +outperforms existing sequential models, achieving improvements of up to 24.43% +and 29.26% in Recall@10 and NDCG@10 across five benchmark datasets. Code is +available at https://github.com/junieberry/MARS + +
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ♻ ☆ HIRO: Hierarchical Information Retrieval Optimization + + +
+ Retrieval-Augmented Generation (RAG) has revolutionized natural language +processing by dynamically integrating external knowledge into Large Language +Models (LLMs), addressing their limitation of static training datasets. Recent +implementations of RAG leverage hierarchical data structures, which organize +documents at various levels of summarization and information density. This +complexity, however, can cause LLMs to "choke" on information overload, +necessitating more sophisticated querying mechanisms. In this context, we +introduce Hierarchical Information Retrieval Optimization (HIRO), a novel +querying approach that employs a Depth-First Search (DFS)-based recursive +similarity score calculation and branch pruning. This method uniquely minimizes +the context delivered to the LLM without informational loss, effectively +managing the challenge of excessive data. HIRO's refined approach is validated +by a 10.85% improvement in performance on the NarrativeQA dataset. + +
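A compact sketch of DFS-style retrieval with branch pruning over a hierarchical summary tree; the node layout and the fixed similarity threshold are assumptions for illustration rather than HIRO's exact scoring rule:

import numpy as np

def dfs_retrieve(node, query_emb, threshold=0.4):
    # node: {"emb": summary embedding, "text": chunk text, "children": [...]}
    sim = float(node["emb"] @ query_emb /
                (np.linalg.norm(node["emb"]) * np.linalg.norm(query_emb) + 1e-9))
    if sim < threshold:
        return []                       # prune this branch entirely
    if not node["children"]:
        return [node["text"]]           # leaf chunk goes into the LLM context
    hits = []
    for child in node["children"]:
        hits.extend(dfs_retrieve(child, query_emb, threshold))
    return hits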
+
+
+
+
+ + ♻ ☆ Large Language Models for Information Retrieval: A Survey + + +
+ As a primary means of information acquisition, information retrieval (IR) +systems, such as search engines, have integrated themselves into our daily +lives. These systems also serve as components of dialogue, question-answering, +and recommender systems. The trajectory of IR has evolved dynamically from its +origins in term-based methods to its integration with advanced neural models. +While the neural models excel at capturing complex contextual signals and +semantic nuances, thereby reshaping the IR landscape, they still face +challenges such as data scarcity, interpretability, and the generation of +contextually plausible yet potentially inaccurate responses. This evolution +requires a combination of both traditional methods (such as term-based sparse +retrieval methods with rapid response) and modern neural architectures (such as +language models with powerful language understanding capacity). Meanwhile, the +emergence of large language models (LLMs), typified by ChatGPT and GPT-4, has +revolutionized natural language processing due to their remarkable language +understanding, generation, generalization, and reasoning abilities. +Consequently, recent research has sought to leverage LLMs to improve IR +systems. Given the rapid evolution of this research trajectory, it is necessary +to consolidate existing methodologies and provide nuanced insights through a +comprehensive overview. In this survey, we delve into the confluence of LLMs +and IR systems, including crucial aspects such as query rewriters, retrievers, +rerankers, and readers. Additionally, we explore promising directions, such as +search agents, within this expanding field. + +
+
+ comment: updated to version 3 +
+
+
+
+
+ + ♻ ☆ Smart E-commerce Recommendations with Semantic AI + + +
+ In e-commerce, web mining for page recommendations is widely used but often +fails to meet user needs. To address this, we propose a novel solution +combining semantic web mining with BP neural networks. We process user search +logs to extract five key features: content priority, time spent, user feedback, +recommendation semantics, and input deviation. These features are then fed into +a BP neural network to classify and prioritize web pages. The prioritized pages +are recommended to users. Using book sales pages for testing, our results +demonstrate that this solution can quickly and accurately identify the pages +users need. Our approach ensures that recommendations are more relevant and +tailored to individual preferences, enhancing the online shopping experience. +By leveraging advanced semantic analysis and neural network techniques, we +bridge the gap between user expectations and actual recommendations. This +innovative method not only improves accuracy but also speeds up the +recommendation process, making it a valuable tool for e-commerce platforms +aiming to boost user satisfaction and engagement. Additionally, our system +ability to handle large datasets and provide real-time recommendations makes it +a scalable and efficient solution for modern e-commerce challenges. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ NFARec: A Negative Feedback-Aware Recommender Model SIGIR 2024 + + +
+ Graph neural network (GNN)-based models have been extensively studied for +recommendations, as they can extract high-order collaborative signals +accurately which is required for high-quality recommender systems. However, +they neglect the valuable information gained through negative feedback in two +aspects: (1) different users might hold opposite feedback on the same item, +which hampers optimal information propagation in GNNs, and (2) even when an +item vastly deviates from users' preferences, they might still choose it and +provide a negative rating. In this paper, we propose a negative feedback-aware +recommender model (NFARec) that maximizes the leverage of negative feedback. To +transfer information to multi-hop neighbors along an optimal path effectively, +NFARec adopts a feedback-aware correlation that guides hypergraph convolutions +(HGCs) to learn users' structural representations. Moreover, NFARec +incorporates an auxiliary task - predicting the feedback sentiment polarity +(i.e., positive or negative) of the next interaction - based on the Transformer +Hawkes Process. The task is beneficial for understanding users by learning the +sentiment expressed in their previous sequential feedback patterns and +predicting future interactions. Extensive experiments demonstrate that NFARec +outperforms competitive baselines. Our source code and data are released at +https://github.com/WangXFng/NFARec. + +
+
+ comment: Accepted to SIGIR 2024 +
+
+
+
+
+ + ♻ ☆ CaDRec: Contextualized and Debiased Recommender Model SIGIR 2024 + + +
+ Recommender models aimed at mining users' behavioral patterns have raised +great attention as one of the essential applications in daily life. Recent work +on graph neural networks (GNNs) or debiasing methods has attained remarkable +gains. However, they still suffer from (1) over-smoothing node embeddings +caused by recursive convolutions with GNNs, and (2) the skewed distribution of +interactions due to popularity and user-individual biases. This paper proposes +a contextualized and debiased recommender model (CaDRec). To overcome the +over-smoothing issue, we explore a novel hypergraph convolution operator that +can select effective neighbors during convolution by introducing both +structural context and sequential context. To tackle the skewed distribution, +we propose two strategies for disentangling interactions: (1) modeling +individual biases to learn unbiased item embeddings, and (2) incorporating item +popularity with positional encoding. Moreover, we mathematically show that the +imbalance of the gradients to update item embeddings exacerbates the popularity +bias, thus adopting regularization and weighting schemes as solutions. +Extensive experiments on four datasets demonstrate the superiority of the +CaDRec against state-of-the-art (SOTA) methods. Our source code and data are +released at https://github.com/WangXFng/CaDRec. + +
+
+ comment: Accepted to SIGIR 2024 +
+
+
+
+
+ + ♻ ☆ Evaluating Named Entity Recognition Using Few-Shot Prompting with Large + Language Models + + +
+ This paper evaluates Few-Shot Prompting with Large Language Models for Named +Entity Recognition (NER). Traditional NER systems rely on extensive labeled +datasets, which are costly and time-consuming to obtain. Few-Shot Prompting or +in-context learning enables models to recognize entities with minimal examples. +We assess state-of-the-art models like GPT-4 in NER tasks, comparing their +few-shot performance to fully supervised benchmarks. Results show that while +there is a performance gap, large models excel in adapting to new entity types +and domains with very limited data. We also explore the effects of prompt +engineering, guided output format and context length on performance. This study +underscores Few-Shot Learning's potential to reduce the need for large labeled +datasets, enhancing NER scalability and accessibility. + +
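A minimal illustration of what a few-shot NER prompt can look like; the template, entity types, and output format below are assumptions for illustration and not the paper's exact prompt:

def build_ner_prompt(demonstrations, sentence,
                     entity_types=("PERSON", "LOCATION", "ORGANIZATION")):
    # demonstrations: list of (text, [(span, entity_type), ...]) in-context examples
    lines = [f"Extract entities of types {', '.join(entity_types)}. "
             "Answer with one 'span -> type' pair per line."]
    for text, spans in demonstrations:
        lines.append(f"Text: {text}")
        lines.append("Entities:")
        lines.extend(f"{span} -> {etype}" for span, etype in spans)
    lines.append(f"Text: {sentence}")
    lines.append("Entities:")
    return "\n".join(lines)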
+
+ comment: Github repo: https://github.com/GEODE-project/ner-llm +
+
+
+
+
+ + ♻ ☆ Jina-ColBERT-v2: A General-Purpose Multilingual Late Interaction + Retriever EMNLP + + +
+ Multi-vector dense models, such as ColBERT, have proven highly effective in +information retrieval. ColBERT's late interaction scoring approximates the +joint query-document attention seen in cross-encoders while maintaining +inference efficiency closer to traditional dense retrieval models, thanks to +its bi-encoder architecture and recent optimizations in indexing and search. In +this paper, we introduce a novel architecture and a training framework to +support long context windows and multilingual retrieval. Our new model, +Jina-ColBERT-v2, demonstrates strong performance across a range of English and +multilingual retrieval tasks. + +
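Late interaction itself is simple to state: each query token is matched to its best document token and the per-token maxima are summed. A MaxSim sketch over L2-normalised token embeddings:

import numpy as np

def late_interaction_score(query_tokens, doc_tokens):
    # query_tokens: (Lq, d), doc_tokens: (Ld, d), rows L2-normalised
    sims = query_tokens @ doc_tokens.T       # (Lq, Ld) token-level similarities
    return float(sims.max(axis=1).sum())     # MaxSim: best doc token per query token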
+
+ comment: 8 pages, references at pp7,8; EMNLP workshop submission +
+
+
+
+
+
+
+
+ + Machine Learning 149 + +
+
+
+ + ☆ Masked Diffusion Models are Secretly Time-Agnostic Masked Models and + Exploit Inaccurate Categorical Sampling + + +
+ Masked diffusion models (MDMs) have emerged as a popular research topic for +generative modeling of discrete data, thanks to their superior performance over +other discrete diffusion models, and are rivaling the auto-regressive models +(ARMs) for language modeling tasks. The recent effort in simplifying the masked +diffusion framework further leads to alignment with continuous-space diffusion +models and more principled training and sampling recipes. In this paper, +however, we reveal that both training and sampling of MDMs are theoretically +free from the time variable, arguably the key signature of diffusion models, +and are instead equivalent to masked models. The connection on the sampling +aspect is drawn by our proposed first-hitting sampler (FHS). Specifically, we +show that the FHS is theoretically equivalent to MDMs' original generation +process while significantly alleviating the time-consuming categorical sampling +and achieving a 20$\times$ speedup. In addition, our investigation challenges +previous claims that MDMs can surpass ARMs in generative perplexity. We +identify, for the first time, an underlying numerical issue, even with the +32-bit floating-point precision, which results in inaccurate categorical +sampling. We show that the numerical issue lowers the effective temperature +both theoretically and empirically, leading to unfair assessments of MDMs' +generation results in the previous literature. + +
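For context, the categorical sampling step under scrutiny is usually implemented with the Gumbel-max trick; the tiny demo below only illustrates where finite floating-point precision enters that computation (the distribution and sample counts are arbitrary), not the paper's specific analysis or its first-hitting sampler:

import numpy as np

def gumbel_categorical(logits, n, dtype):
    # Gumbel-max sampling: argmax(logits + g), g = -log(-log(u)); the log/exp
    # chain on u is where reduced precision can distort the tails and hence
    # the effective sampling temperature.
    u = np.random.default_rng(0).random((n, logits.shape[0])).astype(dtype)
    g = -np.log(-np.log(u))
    return np.argmax(logits.astype(dtype) + g, axis=1)

logits = np.log(np.array([0.999, 0.0009, 0.0001]))
for dtype in (np.float32, np.float64):
    counts = np.bincount(gumbel_categorical(logits, 1_000_000, dtype), minlength=3)
    print(dtype.__name__, counts / counts.sum())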
+
+ comment: 40 pages +
+
+
+
+
+ + ☆ Topological Methods in Machine Learning: A Tutorial for Practitioners + + +
+ Topological Machine Learning (TML) is an emerging field that leverages +techniques from algebraic topology to analyze complex data structures in ways +that traditional machine learning methods may not capture. This tutorial +provides a comprehensive introduction to two key TML techniques, persistent +homology and the Mapper algorithm, with an emphasis on practical applications. +Persistent homology captures multi-scale topological features such as clusters, +loops, and voids, while the Mapper algorithm creates an interpretable graph +summarizing high-dimensional data. To enhance accessibility, we adopt a +data-centric approach, enabling readers to gain hands-on experience applying +these techniques to relevant tasks. We provide step-by-step explanations, +implementations, hands-on examples, and case studies to demonstrate how these +tools can be applied to real-world problems. The goal is to equip researchers +and practitioners with the knowledge and resources to incorporate TML into +their work, revealing insights often hidden from conventional machine learning +methods. The tutorial code is available at +https://github.com/cakcora/TopologyForML + +
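As a taste of the first technique, 0-dimensional persistent homology of a Vietoris-Rips filtration can be computed with nothing more than a union-find over edges sorted by length (equivalent to single-linkage clustering); this bare-bones sketch ignores the optimized libraries a practitioner would normally use:

import numpy as np

def zero_dim_persistence(points):
    # points: (n, d) array. Every connected component is born at scale 0 and
    # dies at the edge length that merges it into another component.
    n = len(points)
    dists = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)
    parent = list(range(n))

    def find(i):
        while parent[i] != i:
            parent[i] = parent[parent[i]]   # path compression
            i = parent[i]
        return i

    edges = sorted((dists[i, j], i, j) for i in range(n) for j in range(i + 1, n))
    diagram = []
    for length, i, j in edges:
        ri, rj = find(i), find(j)
        if ri != rj:
            parent[rj] = ri
            diagram.append((0.0, float(length)))   # one component dies at this scale
    diagram.append((0.0, float("inf")))            # the last component never dies
    return diagram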
+
+ comment: 54 pages, 35 figures +
+
+
+
+
+ + ☆ Regional data-driven weather modeling with a global stretched-grid + + +
+ A data-driven model (DDM) suitable for regional weather forecasting +applications is presented. The model extends the Artificial Intelligence +Forecasting System by introducing a stretched-grid architecture that dedicates +higher resolution over a regional area of interest and maintains a lower +resolution elsewhere on the globe. The model is based on graph neural networks, +which naturally affords arbitrary multi-resolution grid configurations. + The model is applied to short-range weather prediction for the Nordics, +producing forecasts at 2.5 km spatial and 6 h temporal resolution. The model is +pre-trained on 43 years of global ERA5 data at 31 km resolution and is further +refined using 3.3 years of 2.5 km resolution operational analyses from the +MetCoOp Ensemble Prediction System (MEPS). The performance of the model is +evaluated using surface observations from measurement stations across Norway +and is compared to short-range weather forecasts from MEPS. The DDM outperforms +both the control run and the ensemble mean of MEPS for 2 m temperature. The +model also produces competitive precipitation and wind speed forecasts, but is +shown to underestimate extreme events. + +
+
+
+
+
+ + ☆ Benchmarking Spurious Bias in Few-Shot Image Classifiers ECCV 2024 + + +
+ Few-shot image classifiers are designed to recognize and classify new data +with minimal supervision and limited data, but they often show reliance on spurious +correlations between classes and spurious attributes, known as spurious bias. +Spurious correlations commonly hold in certain samples, and few-shot classifiers +can suffer from the spurious bias induced by them. There is an absence of an +automatic benchmarking system to assess the robustness of few-shot classifiers +against spurious bias. In this paper, we propose a systematic and rigorous +benchmark framework, termed FewSTAB, to fairly demonstrate and quantify varied +degrees of robustness of few-shot classifiers to spurious bias. FewSTAB creates +few-shot evaluation tasks with biased attributes so that relying on them for +prediction leads to poor performance. To construct these tasks, we +propose attribute-based sample selection strategies based on a pre-trained +vision-language model, eliminating the need for manual dataset curation. This +allows FewSTAB to automatically benchmark spurious bias using any existing test +data. FewSTAB offers evaluation results in a new dimension along with a new +design guideline for building robust classifiers. Moreover, it can benchmark +spurious bias at varied degrees and enable designs for varied degrees of +robustness. Its effectiveness is demonstrated through experiments on ten +few-shot learning methods across three datasets. We hope our framework can +inspire new designs of robust few-shot classifiers. Our code is available at +https://github.com/gtzheng/FewSTAB. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Configurable Foundation Models: Building LLMs from a Modular Perspective + + +
+ Advancements in LLMs have recently unveiled challenges tied to computational +efficiency and continual scalability due to their requirements of huge +parameters, making the applications and evolution of these models on devices +with limited computation resources and scenarios requiring various abilities +increasingly cumbersome. Inspired by modularity within the human brain, there +is a growing tendency to decompose LLMs into numerous functional modules, +allowing for inference with part of modules and dynamic assembly of modules to +tackle complex tasks, such as mixture-of-experts. To highlight the inherent +efficiency and composability of the modular approach, we coin the term brick to +represent each functional module, designating the modularized structure as +configurable foundation models. In this paper, we offer a comprehensive +overview and investigation of the construction, utilization, and limitation of +configurable foundation models. We first formalize modules into emergent bricks +- functional neuron partitions that emerge during the pre-training phase, and +customized bricks - bricks constructed via additional post-training to improve +the capabilities and knowledge of LLMs. Based on diverse functional bricks, we +further present four brick-oriented operations: retrieval and routing, merging, +updating, and growing. These operations allow for dynamic configuration of LLMs +based on instructions to handle complex tasks. To verify our perspective, we +conduct an empirical analysis on widely-used LLMs. We find that the FFN layers +follow modular patterns with functional specialization of neurons and +functional neuron partitions. Finally, we highlight several open issues and +directions for future research. Overall, this paper aims to offer a fresh +modular perspective on existing LLM research and inspire the future creation of +more efficient and scalable foundational models. + +
+
+
+
+
+ + ☆ Hybrid Imitation-Learning Motion Planner for Urban Driving + + +
+ With the release of open source datasets such as nuPlan and Argoverse, +research on learning-based planners has grown considerably in recent years. +Existing systems have shown excellent capabilities in imitating human +driver behaviour, but they struggle to guarantee safe closed-loop driving. +Conversely, optimization-based planners offer greater safety in short-term +planning scenarios. To address this challenge, in this paper we propose a +novel hybrid motion planner that integrates both learning-based and +optimization-based techniques. Initially, a multilayer perceptron (MLP) +generates a human-like trajectory, which is then refined by an +optimization-based component. This component not only minimizes tracking errors +but also computes a trajectory that is kinematically feasible and +free of collisions with obstacles and road boundaries. Our model effectively +balances safety and human-likeness, mitigating the trade-off inherent in these +objectives. We validate our approach through simulation experiments and further +demonstrate its efficacy by deploying it in real-world self-driving vehicles. + +
+
+
+
+
+ + ☆ Look Into the LITE in Deep Learning for Time Series Classification + + +
+ Deep learning models have been shown to be a powerful solution for Time +Series Classification (TSC). State-of-the-art architectures, while producing +promising results on the UCR and UEA archives, present a high number of +trainable parameters. This can lead to long training with high CO2 emission, +power consumption and a possible increase in the number of FLoating-point +Operations Per Second (FLOPS). In this paper, we present a new architecture for +TSC, the Light Inception with boosTing tEchnique (LITE), with only 2.34% of the +number of parameters of the state-of-the-art InceptionTime model, while +preserving performance. This architecture, with only 9,814 trainable +parameters due to the usage of DepthWise Separable Convolutions (DWSC), is +boosted by three techniques: multiplexing, custom filters, and dilated +convolution. The LITE architecture, trained on the UCR, is 2.78 times faster +than InceptionTime and consumes 2.79 times less CO2 and power. To evaluate the +performance of the proposed architecture on multivariate time series data, we +adapt LITE to handle multivariate time series; we call this version LITEMV. To +bring theory into application, we also conducted experiments using LITEMV on +multivariate time series representing human rehabilitation movements, showing +that LITEMV is not only the most efficient model but also the best performing +for this application on the Kimore dataset, a skeleton-based human +rehabilitation exercise dataset. Moreover, to address the interpretability of +LITEMV, we present a study using Class Activation Maps to understand the +classification decisions taken by the model during evaluation. + +
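The parameter savings come mainly from the DepthWise Separable Convolutions; a generic PyTorch sketch of such a block (not LITE's exact configuration) makes the factorisation explicit:

import torch.nn as nn

class DWSeparableConv1d(nn.Module):
    # Depthwise (per-channel) convolution followed by a pointwise 1x1 convolution:
    # roughly k*c_in + c_in*c_out weights instead of k*c_in*c_out for a standard conv.
    def __init__(self, c_in, c_out, kernel_size, dilation=1):
        super().__init__()
        self.depthwise = nn.Conv1d(c_in, c_in, kernel_size, padding="same",
                                   dilation=dilation, groups=c_in, bias=False)
        self.pointwise = nn.Conv1d(c_in, c_out, 1, bias=False)

    def forward(self, x):        # x: (batch, c_in, length)
        return self.pointwise(self.depthwise(x))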
+
+
+
+
+ + ☆ Building a Scalable, Effective, and Steerable Search and Ranking + Platform + + +
+ Modern e-commerce platforms offer vast product selections, making it +difficult for customers to find items that they like and that are relevant to +their current session intent. This is why it is key for e-commerce platforms to +have near real-time, scalable, and adaptable personalized ranking and search +systems. While numerous methods exist in the scientific literature for building +such systems, many are unsuitable for large-scale industrial use due to +complexity and performance limitations. Consequently, industrial ranking +systems often resort to computationally efficient yet simplistic retrieval or +candidate generation approaches, which overlook near real-time and +heterogeneous customer signals and thus result in a less personalized and +relevant experience. Moreover, related customer experiences are served by +completely different systems, which increases complexity and maintenance effort +and leads to inconsistent experiences. + In this paper, we present a personalized, adaptable near real-time ranking +platform that is reusable across various use cases, such as browsing and +search, and that is able to cater to millions of items and customers under +heavy load (thousands of requests per second). We employ transformer-based +models across different ranking layers that can learn complex behavior +patterns directly from customer action sequences while being able to +incorporate temporal (e.g. in-session) and contextual information. We validate +our system through a series of comprehensive offline and online real-world +experiments at a large online e-commerce platform, and we demonstrate its +superiority when compared to existing systems, in terms of both customer +experience and net revenue. Finally, we share the lessons learned +from building a comprehensive, modern ranking platform for use in a large-scale +e-commerce environment. + +
+
+
+
+
+ + ☆ Oops, I Sampled it Again: Reinterpreting Confidence Intervals in + Few-Shot Learning + + +
+ The predominant method for computing confidence intervals (CI) in few-shot +learning (FSL) is based on sampling the tasks with replacement, i.e., allowing +the same samples to appear in multiple tasks. This makes the CI misleading in +that it takes into account the randomness of the sampler but not the data +itself. To quantify the extent of this problem, we conduct a comparative +analysis between CIs computed with and without replacement. This analysis reveals a +notable underestimation by the predominant method. This observation calls for a +reevaluation of how we interpret confidence intervals and the resulting +conclusions in FSL comparative studies. Our research demonstrates that the use +of paired tests can partially address this issue. Additionally, we explore +methods to further reduce the (size of the) CI by strategically sampling tasks +of a specific size. We also introduce a new optimized benchmark, which can be +accessed at https://github.com/RafLaf/FSL-benchmark-again + +
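The core issue can be reproduced in a few lines: when tasks are resampled with replacement from the same test pool, the usual CI shrinks with the number of sampled tasks and mostly reflects sampler noise, whereas disjoint tasks give a wider, more honest interval. The pool size, accuracy, and task size below are arbitrary illustrative choices:

import numpy as np

def ci_half_width(task_acc):
    return 1.96 * np.std(task_acc, ddof=1) / np.sqrt(len(task_acc))

rng = np.random.default_rng(0)
pool = rng.random(10_000) < 0.7          # per-sample correctness of some classifier
task_size = 75                           # query samples per few-shot task

# (a) common protocol: many tasks sampled with replacement from the same pool
with_repl = np.array([pool[rng.integers(0, len(pool), task_size)].mean()
                      for _ in range(10_000)])
# (b) disjoint tasks: every test sample is used at most once
perm = rng.permutation(len(pool))
without_repl = np.array([pool[perm[i:i + task_size]].mean()
                         for i in range(0, len(pool) - task_size + 1, task_size)])

print("CI half-width, with replacement:", ci_half_width(with_repl))
print("CI half-width, disjoint tasks:  ", ci_half_width(without_repl))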
+
+
+
+
+ + ☆ SNNAX -- Spiking Neural Networks in JAX + + +
+ Spiking Neural Networks (SNNs) simulators are essential tools to prototype +biologically inspired models and neuromorphic hardware architectures and +predict their performance. For such a tool, ease of use and flexibility are +critical, but so is simulation speed especially given the complexity inherent +to simulating SNN. Here, we present SNNAX, a JAX-based framework for simulating +and training such models with PyTorch-like intuitiveness and JAX-like execution +speed. SNNAX models are easily extended and customized to fit the desired model +specifications and target neuromorphic hardware. Additionally, SNNAX offers key +features for optimizing the training and deployment of SNNs such as flexible +automatic differentiation and just-in-time compilation. We evaluate and compare +SNNAX to other commonly used machine learning (ML) frameworks used for +programming SNNs. We provide key performance metrics, best practices, +documented examples for simulating SNNs in SNNAX, and implement several +benchmarks used in the literature. + +
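To give a flavour of the JAX style such a framework builds on, here is a bare leaky integrate-and-fire layer rolled out with jax.lax.scan; this is generic JAX, not SNNAX's actual API:

import jax
import jax.numpy as jnp

def lif_step(v, x, beta=0.9, threshold=1.0):
    # One timestep of a leaky integrate-and-fire neuron population.
    v = beta * v + x                            # leaky integration of input current
    spikes = (v > threshold).astype(jnp.float32)
    v = v - spikes * threshold                  # soft reset after spiking
    return v, spikes

@jax.jit
def run_lif(inputs):                            # inputs: (timesteps, n_neurons)
    v0 = jnp.zeros(inputs.shape[1])
    _, spikes = jax.lax.scan(lif_step, v0, inputs)
    return spikes                               # (timesteps, n_neurons) spike trains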
+
+
+
+
+ + ☆ Exploring Sentiment Dynamics and Predictive Behaviors in Cryptocurrency + Discussions by Few-Shot Learning with Large Language Models + + +
+ This study performs analysis of Predictive statements, Hope speech, and +Regret Detection behaviors within cryptocurrency-related discussions, +leveraging advanced natural language processing techniques. We introduce a +novel classification scheme named "Prediction statements," categorizing +comments into Predictive Incremental, Predictive Decremental, Predictive +Neutral, or Non-Predictive categories. Employing GPT-4o, a cutting-edge large +language model, we explore sentiment dynamics across five prominent +cryptocurrencies: Cardano, Binance, Matic, Fantom, and Ripple. Our analysis +reveals distinct patterns in predictive sentiments, with Matic demonstrating a +notably higher propensity for optimistic predictions. Additionally, we +investigate hope and regret sentiments, uncovering nuanced interplay between +these emotions and predictive behaviors. Despite encountering limitations +related to data volume and resource availability, our study reports valuable +discoveries concerning investor behavior and sentiment trends within the +cryptocurrency market, informing strategic decision-making and future research +endeavors. + +
+
+
+
+
+ + ☆ Obsidian: Cooperative State-Space Exploration for Performant Inference + on Secure ML Accelerators + + +
+ Trusted execution environments (TEEs) for machine learning accelerators are +indispensable in secure and efficient ML inference. Optimizing workloads +through state-space exploration for the accelerator architectures improves +performance and energy consumption. However, such explorations are expensive +and slow due to the large search space. Current research has to use fast +analytical models that forego critical hardware details and cross-layer +opportunities unique to the hardware security primitives. While cycle-accurate +models can theoretically reach better designs, their high runtime cost +restricts them to a smaller state space. + We present Obsidian, an optimization framework for finding the optimal +mapping from ML kernels to a secure ML accelerator. Obsidian addresses the +above challenge by exploring the state space using analytical and +cycle-accurate models cooperatively. The two main exploration components +include: (1) A secure accelerator analytical model, that includes the effect of +secure hardware while traversing the large mapping state space and produce the +best m model mappings; (2) A compiler profiling step on a cycle-accurate model, +that captures runtime bottlenecks to further improve execution runtime, energy +and resource utilization and find the optimal model mapping. + We compare our results to a baseline secure accelerator, comprising of the +state-of-the-art security schemes obtained from guardnn [ 33 ] and sesame [11]. +The analytical model reduces the inference latency by 20.5% for a cloud and +8.4% for an edge deployment with an energy improvement of 24% and 19% +respectively. The cycle-accurate model, further reduces the latency by 9.1% for +a cloud and 12.2% for an edge with an energy improvement of 13.8% and 13.1%. + +
+
+
+
+
+ + ☆ Boosting Certificate Robustness for Time Series Classification with + Efficient Self-Ensemble + + +
+ Recently, the issue of adversarial robustness in the time series domain has +garnered significant attention. However, the available defense mechanisms +remain limited, with adversarial training being the predominant approach, +though it does not provide theoretical guarantees. Randomized Smoothing has +emerged as a standout method due to its ability to certify a provable lower +bound on the robustness radius under $\ell_p$-ball attacks. Recognizing its +success, research in the time series domain has started focusing on these +aspects. However, existing research predominantly focuses on time series +forecasting, or on non-$\ell_p$ robustness through statistical feature +augmentation for time series classification (TSC). Our review found that +Randomized Smoothing performs modestly in TSC, struggling to provide effective +assurances on datasets with poor robustness. Therefore, we propose a +self-ensemble method to enhance the lower bound of the probability confidence +of predicted labels by reducing the variance of classification margins, thereby +certifying a larger radius. This approach also addresses the computational +overhead issue of Deep Ensembles (DE) while remaining competitive and, in some +cases, outperforming them in terms of robustness. Both theoretical analysis and +experimental results validate the effectiveness of our method, demonstrating +superior performance in robustness testing compared to baseline approaches. + +
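For orientation, the standard Randomized Smoothing certificate the abstract builds on looks roughly as follows (Cohen et al.-style, plug-in version); a rigorous certificate would use confidence bounds on the estimated probabilities, and a self-ensemble would average several base classifiers inside f to reduce the variance of the margin:

import numpy as np
from scipy.stats import norm

def smoothed_prediction_and_radius(f, x, sigma=0.25, n=1000, num_classes=10, seed=0):
    # f(x) -> predicted class of the base classifier; Monte-Carlo estimate of the
    # smoothed classifier's class probabilities under Gaussian input noise.
    rng = np.random.default_rng(seed)
    counts = np.zeros(num_classes)
    for _ in range(n):
        counts[f(x + sigma * rng.standard_normal(x.shape))] += 1
    probs = np.clip(counts / n, 1e-6, 1.0 - 1e-6)   # keep the quantiles finite
    second, top = np.argsort(probs)[-2:]
    # Certified l2 radius from the top-two smoothed probabilities.
    radius = 0.5 * sigma * (norm.ppf(probs[top]) - norm.ppf(probs[second]))
    return int(top), float(radius)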
+
+ comment: 6 figures, 4 tables, 10 pages +
+
+
+
+
+ + ☆ UnLearning from Experience to Avoid Spurious Correlations + + +
+ While deep neural networks can achieve state-of-the-art performance in many +tasks, these models are more fragile than they appear. They are prone to +learning spurious correlations in their training data, leading to surprising +failure cases. In this paper, we propose a new approach that addresses the +issue of spurious correlations: UnLearning from Experience (ULE). Our method is +based on using two classification models trained in parallel: student and +teacher models. Both models receive the same batches of training data. The +student model is trained with no constraints and pursues the spurious +correlations in the data. The teacher model is trained to solve the same +classification problem while avoiding the mistakes of the student model. As +training is done in parallel, the better the student model learns the spurious +correlations, the more robust the teacher model becomes. The teacher model uses +the gradient of the student's output with respect to its input to unlearn +mistakes made by the student. We show that our method is effective on the +Waterbirds, CelebA, Spawrious and UrbanCars datasets. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Regularized Multi-output Gaussian Convolution Process with Domain + Adaptation + + +
+ Multi-output Gaussian process (MGP) has been attracting increasing attention +as a transfer learning method to model multiple outputs. Despite its high +flexibility and generality, MGP still faces two critical challenges when +applied to transfer learning. The first one is negative transfer, which occurs +when there exists no shared information among the outputs. The second challenge +is the input domain inconsistency, which is commonly studied in transfer +learning yet not explored in MGP. In this paper, we propose a regularized MGP +modeling framework with domain adaptation to overcome these challenges. More +specifically, a sparse covariance matrix of MGP is proposed by using +convolution process, where penalization terms are added to adaptively select +the most informative outputs for knowledge transfer. To deal with the domain +inconsistency, a domain adaptation method is proposed by marginalizing +inconsistent features and expanding missing features to align the input domains +among different outputs. Statistical properties of the proposed method are +provided to guarantee the performance practically and asymptotically. The +proposed framework outperforms state-of-the-art benchmarks in comprehensive +simulation studies and one real case study of a ceramic manufacturing process. +The results demonstrate the effectiveness of our method in dealing with both +the negative transfer and the domain inconsistency. + +
+
+
+
+
+ + ☆ Unifying Causal Representation Learning with the Invariance Principle + + +
+ Causal representation learning aims at recovering latent causal variables +from high-dimensional observations to solve causal downstream tasks, such as +predicting the effect of new interventions or more robust classification. A +plethora of methods have been developed, each tackling carefully crafted +problem settings that lead to different types of identifiability. The folklore +is that these different settings are important, as they are often linked to +different rungs of Pearl's causal hierarchy, although not all neatly fit. Our +main contribution is to show that many existing causal representation learning +approaches methodologically align the representation to known data symmetries. +Identification of the variables is guided by equivalence classes across +different data pockets that are not necessarily causal. This result suggests +important implications, allowing us to unify many existing approaches in a +single method that can mix and match different assumptions, including +non-causal ones, based on the invariances relevant to our application. It also +significantly benefits applicability, which we demonstrate by improving +treatment effect estimation on real-world high-dimensional ecological data. +Overall, this paper clarifies the role of causality assumptions in the +discovery of causal variables and shifts the focus to preserving data +symmetries. + +
+
+ comment: 36 pages +
+
+
+
+
+ + ☆ Tractable Offline Learning of Regular Decision Processes + + +
+ This work studies offline Reinforcement Learning (RL) in a class of +non-Markovian environments called Regular Decision Processes (RDPs). In RDPs, +the unknown dependency of future observations and rewards from the past +interactions can be captured by some hidden finite-state automaton. For this +reason, many RDP algorithms first reconstruct this unknown dependency using +automata learning techniques. In this paper, we show that it is possible to +overcome two strong limitations of previous offline RL algorithms for RDPs, +notably RegORL. This can be accomplished via the introduction of two original +techniques: the development of a new pseudometric based on formal languages, +which removes a problematic dependency on +$L_\infty^\mathsf{p}$-distinguishability parameters, and the adoption of +Count-Min-Sketch (CMS), instead of naive counting. The former reduces the +number of samples required in environments that are characterized by a low +complexity in language-theoretic terms. The latter alleviates the memory +requirements for long planning horizons. We derive the PAC sample complexity +bounds associated to each of these techniques, and we validate the approach +experimentally. + +
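The Count-Min-Sketch component is standard and easy to sketch on its own: a small 2-D table of counters indexed by independent hashes, answering frequency queries with a one-sided (over-counting) error so that memory stays fixed regardless of how many distinct histories are observed:

import numpy as np

class CountMinSketch:
    def __init__(self, width=2048, depth=4, seed=0):
        self.width = width
        self.table = np.zeros((depth, width), dtype=np.int64)
        self.salts = np.random.default_rng(seed).integers(1, 2**31 - 1, size=depth)

    def _columns(self, item):
        # One hash per row, derived from a per-row salt.
        return [hash((int(salt), item)) % self.width for salt in self.salts]

    def add(self, item, count=1):
        for row, col in enumerate(self._columns(item)):
            self.table[row, col] += count

    def query(self, item):
        # Never undercounts; the overcount is bounded with high probability.
        return int(min(self.table[row, col]
                       for row, col in enumerate(self._columns(item))))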
+
+ comment: To appear in EWRL 2024 +
+
+
+
+
+ + ☆ Convolutional Neural Networks for Automated Cellular Automaton + Classification + + +
+ The emergent dynamics in spacetime diagrams of cellular automata (CAs) is +often organised by means of a number of behavioural classes. Whilst +classification of elementary CAs is feasible and well-studied, non-elementary +CAs are generally too diverse and numerous to exhaustively classify manually. +In this chapter we treat the spacetime diagram as a digital image, and +implement simple computer vision techniques to perform an automated +classification of elementary cellular automata into the five Li-Packard +classes. In particular, we present a supervised learning task to a +convolutional neural network, in such a way that it may be generalised to +non-elementary CAs. If we want to do so, we must divert the algorithm's focus +away from the underlying 'microscopic' local updates. We first show that +previously developed deep learning approaches have in fact been trained to +identify the local update rule, rather than directly focus on the mesoscopic +patterns that are associated with the particular behavioural classes. By means +of a well-argued neural network design, as well as a number of data +augmentation techniques, we then present a convolutional neural network that +performs nearly perfectly at identifying the behavioural class, without +necessarily first identifying the underlying microscopic dynamics. + +
+
+ comment: 19 pages, 12 figures, book chapter +
+
+
+
+
+ + ☆ Complete and Efficient Covariants for 3D Point Configurations with + Application to Learning Molecular Quantum Properties + + +
+ When modeling physical properties of molecules with machine learning, it is +desirable to incorporate $SO(3)$-covariance. While such models based on low +body order features are not complete, we formulate and prove general +completeness properties for higher order methods, and show that $6k-5$ of these +features are enough for up to $k$ atoms. We also find that the Clebsch--Gordan +operations commonly used in these methods can be replaced by matrix +multiplications without sacrificing completeness, lowering the scaling from +$O(l^6)$ to $O(l^3)$ in the degree of the features. We apply this to quantum +chemistry, but the proposed methods are generally applicable for problems +involving 3D point configurations. + +
+
+
+
+
+ + ☆ Task-Oriented Communication for Graph Data: A Graph Information + Bottleneck Approach + + +
+ Graph data, essential in fields like knowledge representation and social +networks, often involves large networks with many nodes and edges. Transmitting +these graphs can be highly inefficient due to their size and redundancy for +specific tasks. This paper introduces a method to extract a smaller, +task-focused subgraph that maintains key information while reducing +communication overhead. Our approach utilizes graph neural networks (GNNs) and +the graph information bottleneck (GIB) principle to create a compact, +informative, and robust graph representation suitable for transmission. The +challenge lies in the irregular structure of graph data, making GIB +optimization complex. We address this by deriving a tractable variational upper +bound for the objective function. Additionally, we propose the VQ-GIB +mechanism, integrating vector quantization (VQ) to convert subgraph +representations into a discrete codebook sequence, compatible with existing +digital communication systems. Our experiments show that this GIB-based method +significantly lowers communication costs while preserving essential +task-related information. The approach demonstrates robust performance across +various communication channels, suitable for both continuous and discrete +systems. + +
+
+
+
+
+ + ☆ A Data Selection Approach for Enhancing Low Resource Machine Translation + Using Cross-Lingual Sentence Representations + + +
+ Machine translation in low-resource language pairs faces significant +challenges due to the scarcity of parallel corpora and linguistic resources. +This study focuses on the case of English-Marathi language pairs, where +existing datasets are notably noisy, impeding the performance of machine +translation models. To mitigate the impact of data quality issues, we propose a +data filtering approach based on cross-lingual sentence representations. Our +methodology leverages a multilingual SBERT model to filter out problematic +translations in the training data. Specifically, we employ an IndicSBERT +similarity model to assess the semantic equivalence between original and +translated sentences, allowing us to retain linguistically correct translations +while discarding instances with substantial deviations. The results demonstrate +a significant improvement in translation quality over the baseline +post-filtering with IndicSBERT. This illustrates how cross-lingual sentence +representations can reduce errors in machine translation scenarios with limited +resources. By integrating multilingual sentence BERT models into the +translation pipeline, this research contributes to advancing machine +translation techniques in low-resource environments. The proposed method not +only addresses the challenges in English-Marathi language pairs but also +provides a valuable framework for enhancing translation quality in other +low-resource language translation tasks. + +
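+
+ Illustrative sketch (not the authors' code) of the filtering step described
+above: score each English-Marathi pair with a multilingual SBERT model and keep
+pairs above a similarity threshold. The checkpoint name and the 0.7 threshold
+are placeholders, not values reported by the paper.
+
+    from sentence_transformers import SentenceTransformer
+
+    # Placeholder id; substitute an IndicSBERT-style multilingual checkpoint.
+    model = SentenceTransformer("indic-sbert-model")
+
+    def filter_parallel(pairs, threshold=0.7):
+        """Keep (english, marathi) pairs whose cosine similarity exceeds the threshold."""
+        en = model.encode([p[0] for p in pairs], convert_to_tensor=True,
+                          normalize_embeddings=True)
+        mr = model.encode([p[1] for p in pairs], convert_to_tensor=True,
+                          normalize_embeddings=True)
+        sims = (en * mr).sum(dim=1)          # row-wise cosine similarity of unit vectors
+        return [p for p, s in zip(pairs, sims.tolist()) if s >= threshold]
+
+    # usage: clean_pairs = filter_parallel(noisy_parallel_pairs)
+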
+
+ comment: Accepted at I2CT 2024 +
+
+
+
+
+ + ☆ Few-shot Multi-Task Learning of Linear Invariant Features with Meta + Subspace Pursuit + + +
+ Data scarcity poses a serious threat to modern machine learning and +artificial intelligence, as their practical success typically relies on the +availability of big datasets. One effective strategy to mitigate the issue of +insufficient data is to first harness information from other data sources +possessing certain similarities in the study design stage, and then employ the +multi-task or meta learning framework in the analysis stage. In this paper, we +focus on multi-task (or multi-source) linear models whose coefficients across +tasks share an invariant low-rank component, a popular structural assumption +considered in the recent multi-task or meta learning literature. Under this +assumption, we propose a new algorithm, called Meta Subspace Pursuit +(abbreviated as Meta-SP), that provably learns this invariant subspace shared +by different tasks. Under this stylized setup for multi-task or meta learning, +we establish both the algorithmic and statistical guarantees of the proposed +method. Extensive numerical experiments are conducted, comparing Meta-SP +against several competing methods, including popular, off-the-shelf +model-agnostic meta learning algorithms such as ANIL. These experiments +demonstrate that Meta-SP achieves superior performance over the competing +methods in various aspects. + +
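+
+ For intuition about the shared-subspace assumption, a simple NumPy baseline
+(deliberately not the paper's Meta-SP algorithm): fit each task by least
+squares, stack the coefficient vectors, and take the top singular directions as
+an estimate of the invariant low-rank subspace. The rank r is an assumption.
+
+    import numpy as np
+
+    def shared_subspace(task_data, r):
+        """Estimate a rank-r subspace shared by per-task linear coefficients."""
+        betas = []
+        for X, y in task_data:                       # one (X, y) design/response pair per task
+            beta, *_ = np.linalg.lstsq(X, y, rcond=None)
+            betas.append(beta)
+        B = np.stack(betas, axis=1)                  # d x T matrix of task coefficients
+        U, _, _ = np.linalg.svd(B, full_matrices=False)
+        return U[:, :r]                              # orthonormal basis of the estimate
+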
+
+
+
+
+ + ☆ Decision Transformer for Enhancing Neural Local Search on the Job Shop + Scheduling Problem + + +
+ The job shop scheduling problem (JSSP) and its solution algorithms have been +of enduring interest in both academia and industry for decades. In recent +years, machine learning (ML) is playing an increasingly important role in +advancing existing and building new heuristic solutions for the JSSP, aiming to +find better solutions in shorter computation times. In this paper we build on +top of a state-of-the-art deep reinforcement learning (DRL) agent, called +Neural Local Search (NLS), which can efficiently and effectively control a +large local neighborhood search on the JSSP. In particular, we develop a method +for training the decision transformer (DT) algorithm on search trajectories +taken by a trained NLS agent to further improve upon the learned +decision-making sequences. Our experiments show that the DT successfully learns +local search strategies that are different and, in many cases, more effective +than those of the NLS agent itself. In terms of the tradeoff between solution +quality and acceptable computational time needed for the search, the DT is +particularly superior in application scenarios where longer computational times +are acceptable. In this case, it makes up for the longer inference times +required per search step, which are caused by the larger neural network +architecture, through better quality decisions per step. Thereby, the DT +achieves state-of-the-art results for solving the JSSP with ML-enhanced search. + +
+
+ comment: currently under review for IEEE Transactions on Cybernetics +
+
+
+
+
+ + ☆ Deconfounded Causality-aware Parameter-Efficient Fine-Tuning for + Problem-Solving Improvement of LLMs + + +
+ Large Language Models (LLMs) have demonstrated remarkable efficiency in +tackling various tasks based on human instructions, but recent studies reveal +that these models often fail to achieve satisfactory results on questions +involving reasoning, such as mathematics or physics questions. This phenomenon +is usually attributed to the uncertainty regarding whether these models could +genuinely comprehend the knowledge embedded in the text or merely learn to +replicate the token distribution without a true understanding of the content. +In this paper, we delve into this problem and aim to enhance the reasoning +capabilities of LLMs. First, we investigate if the model has genuine reasoning +capabilities by visualizing the text generation process at the attention and +representation level. Then, we formulate the reasoning process of LLMs into a +causal framework, which provides a formal explanation of the problems we +observe in the visualization. Finally, building upon this causal framework, we +propose Deconfounded Causal Adaptation (DCA), a novel parameter-efficient +fine-tuning (PEFT) method to enhance the model's reasoning capabilities by +encouraging the model to extract the general problem-solving skills and apply +these skills to different questions. Experiments show that our method +outperforms the baseline consistently across multiple benchmarks, and with only +1.2M tunable parameters, we achieve better or comparable results to other +fine-tuning methods. This demonstrates the effectiveness and efficiency of our +method in improving the overall accuracy and reliability of LLMs. + +
+
+
+
+
+ + ☆ Neural timescales from a computational perspective + + +
+ Timescales of neural activity are diverse across and within brain areas, and +experimental observations suggest that neural timescales reflect information in +dynamic environments. However, these observations do not specify how neural +timescales are shaped, nor whether particular timescales are necessary for +neural computations and brain function. Here, we take a complementary +perspective and synthesize three directions where computational methods can +distill the broad set of empirical observations into quantitative and testable +theories: We review (i) how data analysis methods allow us to capture different +timescales of neural dynamics across different recording modalities, (ii) how +computational models provide a mechanistic explanation for the emergence of +diverse timescales, and (iii) how task-optimized models in machine learning +uncover the functional relevance of neural timescales. This integrative +computational approach, combined with empirical findings, would provide a more +holistic understanding of how neural timescales capture the relationship +between brain structure, dynamics, and behavior. + +
+
+ comment: 18 pages, 4 figures, 2 boxes +
+
+
+
+
+ + ☆ Neural Networks with LSTM and GRU in Modeling Active Fires in the Amazon + + +
+ This study presents a comprehensive methodology for modeling and forecasting +the historical time series of fire spots detected by the AQUA_M-T satellite in +the Amazon, Brazil. The approach utilizes a mixed Recurrent Neural Network +(RNN) model, combining Long Short-Term Memory (LSTM) and Gated Recurrent Unit +(GRU) architectures to predict monthly accumulations of daily detected fire +spots. A summary of the data revealed a consistent seasonality over time, with +annual maximum and minimum fire spot values tending to repeat at the same +periods each year. The primary objective is to verify whether the forecasts +capture this inherent seasonality through rigorous statistical analysis. The +methodology involved careful data preparation, model configuration, and +training using cross-validation with two seeds, ensuring that the data +generalizes well to the test and validation sets, and confirming the +convergence of the model parameters. The results indicate that the mixed LSTM +and GRU model offers improved accuracy in forecasting 12 months ahead, +demonstrating its effectiveness in capturing complex temporal patterns and +modeling the observed time series. This research significantly contributes to +the application of deep learning techniques in environmental monitoring, +specifically in fire spot forecasting. In addition to improving forecast +accuracy, the proposed approach highlights the potential for adaptation to +other time series forecasting challenges, opening new avenues for research and +development in machine learning and natural phenomenon prediction. Keywords: +Time Series Forecasting, Recurrent Neural Networks, Deep Learning. + +
+
+ comment: 16 pages, in Portuguese language, 24 figures +
+
+
+
+
+ + ☆ Independence Constrained Disentangled Representation Learning from + Epistemological Perspective + + +
+ Disentangled Representation Learning aims to improve the explainability of
+deep learning methods by training a data encoder that identifies semantically
+meaningful latent variables in the data generation process. Nevertheless, there
+is no consensus regarding a universally accepted definition for the objective
+of disentangled representation learning. In particular, there is considerable
+debate regarding whether the latent variables should be mutually independent.
+In this paper, we first investigate these arguments on the interrelationships
+between latent variables by establishing a conceptual bridge between
+Epistemology and Disentangled Representation Learning. Then, inspired by these
+interdisciplinary concepts, we introduce a two-level latent space framework to
+provide a general solution to the prior arguments on this issue. Finally, we
+propose a novel method for disentangled representation learning by employing an
+integration of a mutual information constraint and an independence constraint
+within the Generative Adversarial Network (GAN) framework. Experimental results
+demonstrate that our proposed method consistently outperforms baseline
+approaches in both quantitative and qualitative evaluations. The method
+exhibits strong performance across multiple commonly used metrics and
+demonstrates a great capability in disentangling various semantic factors,
+leading to an improved quality of controllable generation, which consequently
+benefits the explainability of the algorithm.
+
+&#13;
+
+
+
+
+ + ☆ Causality-Aware Transformer Networks for Robotic Navigation + + +
+ Recent advances in machine learning algorithms have garnered growing interest
+in developing versatile Embodied AI systems. However, current research in this
+domain reveals opportunities for improvement. First, the direct adoption of
+RNNs and Transformers often overlooks the specific differences between Embodied
+AI and traditional sequential data modelling, potentially limiting their
+performance in Embodied AI tasks. Second, the reliance on task-specific
+configurations, such as pre-trained modules and dataset-specific logic,
+compromises the generalizability of these methods. We address these constraints
+by initially exploring the unique differences between Embodied AI tasks and
+other sequential data tasks through the lens of Causality, presenting a causal
+framework to elucidate the inadequacies of conventional sequential methods for
+Embodied AI. By leveraging this causal perspective, we propose Causality-Aware
+Transformer (CAT) Networks for Navigation, featuring a Causal Understanding
+Module to enhance the model's Environmental Understanding capability.
+Meanwhile, our method is devoid of task-specific inductive biases and can be
+trained in an End-to-End manner, which enhances the method's generalizability
+across various contexts. Empirical evaluations demonstrate that our methodology
+consistently surpasses benchmark performances across a spectrum of settings,
+tasks and simulation environments. Extensive ablation studies reveal that the
+performance gains can be attributed to the Causal Understanding Module, which
+demonstrates effectiveness and efficiency in both Reinforcement Learning and
+Supervised Learning settings.
+
+&#13;
+
+
+
+
+ + ☆ Introduction to Machine Learning + + +
+ This book introduces the mathematical foundations and techniques that lead to
+the development and analysis of many of the algorithms that are used in machine
+learning. It starts with an introductory chapter that describes the notation
+used throughout the book and serves as a reminder of basic concepts in
+calculus, linear algebra, and probability; it also introduces some measure
+theoretic terminology, which can be used as a reading guide for the sections
+that use these tools. The introductory chapters also provide background
+material on matrix analysis and optimization. The latter chapter provides
+theoretical support for many algorithms that are used in the book, including
+stochastic gradient descent, proximal methods, etc. After discussing basic
+concepts for statistical prediction, the book includes an introduction to
+reproducing kernel theory and Hilbert space techniques, which are used in many
+places, before describing various algorithms for supervised statistical
+learning, including linear methods, support vector machines, decision trees,
+boosting, and neural networks. The subject then switches to generative methods,
+starting with a chapter that presents sampling methods and an introduction to
+the theory of Markov chains. The following chapter describes the theory of
+graphical models, variational methods for models with latent variables, and
+deep-learning-based generative models. The next chapters focus on unsupervised
+learning methods for clustering, factor analysis, and manifold learning. The
+final chapter of the book is theory-oriented and discusses concentration
+inequalities and generalization bounds.
+
+&#13;
+
+ comment: textbook +
+
+
+
+
+ + ☆ Learning-Based Error Detection System for Advanced Vehicle Instrument + Cluster Rendering + + +
+ The automotive industry is currently expanding digital display options with
+every new model that comes onto the market. This entails not just an expansion
+in dimensions, resolution, and customization choices, but also the capability
+to employ novel display effects like overlays while assembling the content of
+the display cluster. Unfortunately, this raises the need for appropriate
+monitoring systems that can detect rendering errors and apply appropriate
+countermeasures when required. Classical solutions such as Cyclic Redundancy
+Checks (CRC) will soon no longer be viable, as any sort of alpha blending,
+warping, or scaling of content can cause unwanted CRC violations. Therefore, we
+propose a novel monitoring approach to verify the correctness of displayed
+content, using telltales (e.g. warning signs) as an example. It uses a
+learning-based approach to separate "good" telltales, i.e. those that a human
+driver will understand correctly, from "corrupted" telltales, i.e. those that
+will not be visible or perceived correctly. As a result, it possesses inherent
+resilience against individual pixel errors and implicitly supports changing
+backgrounds, overlay or scaling effects. This is underlined by our experimental
+study, in which all "corrupted" test patterns were correctly classified while
+no false alarms were triggered.
+
+&#13;
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Conformal Prediction in Dynamic Biological Systems + + +
+ Uncertainty quantification (UQ) is the process of systematically determining
+and characterizing the degree of confidence in computational model predictions.
+In the context of systems biology, especially with dynamic models, UQ is
+crucial because it addresses the challenges posed by nonlinearity and parameter
+sensitivity, allowing us to properly understand and extrapolate the behavior of
+complex biological systems. Here, we focus on dynamic models represented by
+deterministic nonlinear ordinary differential equations. Many current UQ
+approaches in this field rely on Bayesian statistical methods. While powerful,
+these methods often require strong prior specifications and make parametric
+assumptions that may not always hold in biological systems. Additionally, these
+methods face challenges in domains where sample sizes are limited, and
+statistical inference becomes constrained, with computational speed being a
+bottleneck in large models of biological systems. As an alternative, we propose
+the use of conformal inference methods, introducing two novel algorithms that,
+in some instances, offer non-asymptotic guarantees, enhancing robustness and
+scalability across various applications. We demonstrate the efficacy of our
+proposed algorithms through several scenarios, highlighting their advantages
+over traditional Bayesian approaches. The proposed methods show promising
+results for diverse biological data structures and scenarios, offering a
+general framework to quantify uncertainty for dynamic models of biological
+systems. The software for the methodology and the reproduction of the results
+is available at https://zenodo.org/doi/10.5281/zenodo.13644870.
+
+&#13;
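+
+ For orientation, a standard split conformal sketch in NumPy (the textbook
+construction, not either of the paper's two new algorithms): compute absolute
+residuals of the ODE model on a calibration set and widen new point predictions
+by the corresponding quantile.
+
+    import numpy as np
+
+    def split_conformal_interval(y_cal, yhat_cal, yhat_new, alpha=0.1):
+        """(1 - alpha) prediction band around the model's point predictions."""
+        scores = np.abs(np.asarray(y_cal) - np.asarray(yhat_cal))   # calibration residuals
+        n = scores.size
+        level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)        # finite-sample correction
+        q = np.quantile(scores, level)
+        return yhat_new - q, yhat_new + q
+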
+
+
+
+
+ + ☆ AdvSecureNet: A Python Toolkit for Adversarial Machine Learning + + +
+ Machine learning models are vulnerable to adversarial attacks. Several tools
+have been developed to research these vulnerabilities, but they often lack
+comprehensive features and flexibility. We introduce AdvSecureNet, a
+PyTorch-based toolkit for adversarial machine learning that is the first to
+natively support multi-GPU setups for attacks, defenses, and evaluation. It is
+the first toolkit that supports both CLI and API interfaces and external YAML
+configuration files to enhance versatility and reproducibility. The toolkit
+includes multiple attacks, defenses, and evaluation metrics. Rigorous software
+engineering practices are followed to ensure high code quality and
+maintainability. The toolkit is available as an open-source project on GitHub
+at https://github.com/melihcatal/advsecurenet and installable via PyPI.
+
+&#13;
+
+
+
+
+ + ☆ (Implicit) Ensembles of Ensembles: Epistemic Uncertainty Collapse in + Large Models + + +
+ Epistemic uncertainty is crucial for safety-critical applications and +out-of-distribution detection tasks. Yet, we uncover a paradoxical phenomenon +in deep learning models: an epistemic uncertainty collapse as model complexity +increases, challenging the assumption that larger models invariably offer +better uncertainty quantification. We propose that this stems from implicit +ensembling within large models. To support this hypothesis, we demonstrate +epistemic uncertainty collapse empirically across various architectures, from +explicit ensembles of ensembles and simple MLPs to state-of-the-art vision +models, including ResNets and Vision Transformers -- for the latter, we examine +implicit ensemble extraction and decompose larger models into diverse +sub-models, recovering epistemic uncertainty. We provide theoretical +justification for these phenomena and explore their implications for +uncertainty estimation. + +
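+
+ A minimal NumPy illustration of the standard mutual-information decomposition
+behind the collapse phenomenon (the toy numbers below are made up): when
+ensemble members agree, the epistemic term vanishes even though each member may
+still be uncertain.
+
+    import numpy as np
+
+    def epistemic_uncertainty(member_probs):
+        """Mutual information between prediction and ensemble member.
+        member_probs: (members, classes) array of softmax outputs."""
+        p = np.asarray(member_probs)
+        mean_p = p.mean(axis=0)
+        total = -(mean_p * np.log(mean_p + 1e-12)).sum()             # entropy of the mean
+        aleatoric = -(p * np.log(p + 1e-12)).sum(axis=1).mean()      # mean member entropy
+        return total - aleatoric                                     # epistemic part
+
+    print(epistemic_uncertainty([[0.9, 0.1], [0.9, 0.1]]))   # ~0: members agree
+    print(epistemic_uncertainty([[0.9, 0.1], [0.1, 0.9]]))   # >0: members disagree
+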
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Hypothesizing Missing Causal Variables with LLMs + + +
+ Scientific discovery is a catalyst for human intellectual advances, driven by +the cycle of hypothesis generation, experimental design, data evaluation, and +iterative assumption refinement. This process, while crucial, is expensive and +heavily dependent on the domain knowledge of scientists to generate hypotheses +and navigate the scientific cycle. Central to this is causality, the ability to +establish the relationship between the cause and the effect. Motivated by the +scientific discovery process, in this work, we formulate a novel task where the +input is a partial causal graph with missing variables, and the output is a +hypothesis about the missing variables to complete the partial graph. We design +a benchmark with varying difficulty levels and knowledge assumptions about the +causal graph. With the growing interest in using Large Language Models (LLMs) +to assist in scientific discovery, we benchmark open-source and closed models +on our testbed. We show the strong ability of LLMs to hypothesize the mediation +variables between a cause and its effect. In contrast, they underperform in +hypothesizing the cause and effect variables themselves. We also observe +surprising results where some of the open-source models outperform the closed +GPT-4 model. + +
+
+ comment: Code - https://github.com/ivaxi0s/hypothesizing-causal-variable-llm +
+
+
+
+
+ + ☆ A Fashion Item Recommendation Model in Hyperbolic Space CVPR 2024 + + +
+ In this work, we propose a fashion item recommendation model that +incorporates hyperbolic geometry into user and item representations. Using +hyperbolic space, our model aims to capture implicit hierarchies among items +based on their visual data and users' purchase history. During training, we +apply a multi-task learning framework that considers both hyperbolic and +Euclidean distances in the loss function. Our experiments on three data sets +show that our model performs better than previous models trained in Euclidean +space only, confirming the effectiveness of our model. Our ablation studies +show that multi-task learning plays a key role, and removing the Euclidean loss +substantially deteriorates the model performance. + +
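+
+ A small PyTorch sketch of the two distance terms such a multi-task loss could
+combine (the Poincare-ball distance formula is standard; the equal weighting
+and the use of raw distances as the loss are assumptions, not the paper's exact
+objective):
+
+    import torch
+
+    def poincare_distance(u, v, eps=1e-6):
+        """Geodesic distance in the Poincare ball; inputs must have norm < 1."""
+        sq = torch.sum((u - v) ** 2, dim=-1)
+        nu = torch.clamp(1 - torch.sum(u * u, dim=-1), min=eps)
+        nv = torch.clamp(1 - torch.sum(v * v, dim=-1), min=eps)
+        return torch.acosh(1 + 2 * sq / (nu * nv))
+
+    def multi_task_distance_loss(user, item, lam=0.5):
+        """Weighted sum of hyperbolic and Euclidean user-item distances."""
+        d_hyp = poincare_distance(user, item)
+        d_euc = torch.norm(user - item, dim=-1)
+        return (lam * d_hyp + (1 - lam) * d_euc).mean()
+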
+
+ comment: This work was presented at the CVFAD Workshop at CVPR 2024 +
+
+
+
+
+ + ☆ An Analysis of Linear Complexity Attention Substitutes with BEST-RQ + + +
+ Self-Supervised Learning (SSL) has proven to be effective in various domains,
+including speech processing. However, SSL is computationally expensive and
+memory intensive. This is in part due to the quadratic complexity of multi-head
+self-attention (MHSA). Alternatives for MHSA have been proposed and used in the
+speech domain, but have yet to be investigated properly in an SSL setting. In
+this work, we study the effects of replacing MHSA with recent state-of-the-art
+alternatives that have linear complexity, namely, HyperMixing, Fastformer,
+SummaryMixing, and Mamba. We evaluate these methods by looking at the speed,
+the amount of VRAM consumed, and the performance on the SSL MP3S benchmark.
+Results show that these linear alternatives maintain competitive performance
+compared to MHSA while, on average, decreasing VRAM consumption by around 20%
+to 60% and increasing speed by 7% to 65% for input sequences ranging from 20 to
+80 seconds.
+
+&#13;
+
+  comment: Accepted at the IEEE Spoken Language Technology Workshop 2024
+&#13;
+
+
+
+
+ + ☆ Multiview Random Vector Functional Link Network for Predicting + DNA-Binding Proteins + + +
+ The identification of DNA-binding proteins (DBPs) is a critical task due to +their significant impact on various biological activities. Understanding the +mechanisms underlying protein-DNA interactions is essential for elucidating +various life activities. In recent years, machine learning-based models have +been prominently utilized for DBP prediction. In this paper, to predict DBPs, +we propose a novel framework termed a multiview random vector functional link +(MvRVFL) network, which fuses neural network architecture with multiview +learning. The proposed MvRVFL model combines the benefits of late and early +fusion, allowing for distinct regularization parameters across different views +while leveraging a closed-form solution to determine unknown parameters +efficiently. The primal objective function incorporates a coupling term aimed +at minimizing a composite of errors stemming from all views. From each of the +three protein views of the DBP datasets, we extract five features. These +features are then fused together by incorporating a hidden feature during the +model training process. The performance of the proposed MvRVFL model on the DBP +dataset surpasses that of baseline models, demonstrating its superior +effectiveness. Furthermore, we extend our assessment to the UCI, KEEL, AwA, and +Corel5k datasets, to establish the practicality of the proposed models. The +consistency error bound, the generalization error bound, and empirical +findings, coupled with rigorous statistical analyses, confirm the superior +generalization capabilities of the MvRVFL model compared to the baseline +models. + +
+
+
+
+
+ + ☆ BMI Prediction from Handwritten English Characters Using a Convolutional + Neural Network + + +
+ A person's Body Mass Index, or BMI, is the most widely used parameter for
+assessing their health. BMI is a crucial predictor of potential diseases that
+may arise at higher body fat levels because it is correlated with body fat.
+Conversely, a community's or an individual's nutritional status can be
+determined using the BMI. Although deep learning models are used in several
+studies to estimate BMI from face photos and other data, no previous research
+has established a clear connection between deep learning techniques for
+handwriting analysis and BMI prediction. This article addresses this research
+gap with a deep learning approach to estimating BMI from handwritten characters
+by developing a convolutional neural network (CNN). A dataset containing
+samples from 48 people in lowercase English script was collected for the BMI
+prediction task. The proposed CNN-based approach reports a commendable accuracy
+of 99.92%. A performance comparison with other popular CNN architectures
+reveals that AlexNet and InceptionV3 achieve the second- and third-best
+performance, with accuracies of 99.69% and 99.53%, respectively.
+
+&#13;
+
+
+
+
+ + ☆ Advancing Cyber Incident Timeline Analysis Through Rule Based AI and + Large Language Models + + +
+ Timeline Analysis (TA) is a key part of Timeline Forensics (TF) in Digital
+Forensics (DF), focusing primarily on examining and analysing temporal digital
+artefacts, such as timestamps derived from event logs, file metadata, and other
+related data, to correlate events resulting from cyber incidents and
+reconstruct their chronological timeline. Traditional tools often struggle to
+efficiently process the vast volume and variety of data acquired during DF
+investigations and Incident Response (IR) processes. This paper presents a
+novel framework, GenDFIR, that combines Rule-Based Artificial Intelligence
+(R-BAI) algorithms with Large Language Models (LLMs) to advance and automate
+the TA process. Our approach consists of two main stages: (1) we use R-BAI to
+identify and select anomalous digital artefacts based on predefined rules; (2)
+the selected artefacts are then converted into embeddings for processing by an
+LLM with the help of a Retrieval-Augmented Generation (RAG) agent. The LLM
+consequently leverages its capabilities to perform automated TA on the
+artefacts and predict potential incident scenarios. To validate our framework,
+we evaluate GenDFIR's performance, efficiency, and reliability using various
+metrics across synthetic cyber incident simulation scenarios. This paper
+presents a proof of concept, where the findings demonstrate the significant
+potential of integrating R-BAI and LLMs for TA. This novel approach highlights
+the power of Generative AI (GenAI), specifically LLMs, and opens new avenues
+for advanced threat detection and incident reconstruction, representing a
+significant step forward in the field.
+
+&#13;
+
+ comment: 25 pages +
+
+
+
+
+ + ☆ Low-Resolution Object Recognition with Cross-Resolution Relational + Contrastive Distillation + + +
+ Recognizing objects in low-resolution images is a challenging task due to the +lack of informative details. Recent studies have shown that knowledge +distillation approaches can effectively transfer knowledge from a +high-resolution teacher model to a low-resolution student model by aligning +cross-resolution representations. However, these approaches still face +limitations in adapting to the situation where the recognized objects exhibit +significant representation discrepancies between training and testing images. +In this study, we propose a cross-resolution relational contrastive +distillation approach to facilitate low-resolution object recognition. Our +approach enables the student model to mimic the behavior of a well-trained +teacher model which delivers high accuracy in identifying high-resolution +objects. To extract sufficient knowledge, the student learning is supervised +with contrastive relational distillation loss, which preserves the similarities +in various relational structures in contrastive representation space. In this +manner, the capability of recovering missing details of familiar low-resolution +objects can be effectively enhanced, leading to a better knowledge transfer. +Extensive experiments on low-resolution object classification and +low-resolution face recognition clearly demonstrate the effectiveness and +adaptability of our approach. + +
+
+ comment: This paper is accepted by IEEE Transactions on Circuits and Systems + for Video Technology (TCSVT) +
+
+
+
+
+ + ☆ Understanding eGFR Trajectories and Kidney Function Decline via Large + Multimodal Models + + +
+ The estimated Glomerular Filtration Rate (eGFR) is an essential indicator of +kidney function in clinical practice. Although traditional equations and +Machine Learning (ML) models using clinical and laboratory data can estimate +eGFR, accurately predicting future eGFR levels remains a significant challenge +for nephrologists and ML researchers. Recent advances demonstrate that Large +Language Models (LLMs) and Large Multimodal Models (LMMs) can serve as robust +foundation models for diverse applications. This study investigates the +potential of LMMs to predict future eGFR levels with a dataset consisting of +laboratory and clinical values from 50 patients. By integrating various +prompting techniques and ensembles of LMMs, our findings suggest that these +models, when combined with precise prompts and visual representations of eGFR +trajectories, offer predictive performance comparable to existing ML models. +This research extends the application of foundation models and suggests avenues +for future studies to harness these models in addressing complex medical +forecasting challenges. + +
+
+ comment: This preprint version includes corrections of typographical errors + related to numerical values in Table 2, which were present in the version + published at the BDH workshop in MIPR 2024. These corrections do not affect + the overall conclusions of the study +
+
+
+
+
+ + ☆ Sample what you cant compress + + +
+ For learned image representations, basic autoencoders often produce blurry +results. Reconstruction quality can be improved by incorporating additional +penalties such as adversarial (GAN) and perceptual losses. Arguably, these +approaches lack a principled interpretation. Concurrently, in generative +settings diffusion has demonstrated a remarkable ability to create crisp, high +quality results and has solid theoretical underpinnings (from variational +inference to direct study as the Fisher Divergence). Our work combines +autoencoder representation learning with diffusion and is, to our knowledge, +the first to demonstrate the efficacy of jointly learning a continuous encoder +and decoder under a diffusion-based loss. We demonstrate that this approach +yields better reconstruction quality as compared to GAN-based autoencoders +while being easier to tune. We also show that the resulting representation is +easier to model with a latent diffusion model as compared to the representation +obtained from a state-of-the-art GAN-based loss. Since our decoder is +stochastic, it can generate details not encoded in the otherwise deterministic +latent representation; we therefore name our approach "Sample what you can't +compress", or SWYCC for short. + +
+
+
+
+
+ + ☆ Training Universal Vocoders with Feature Smoothing-Based Augmentation + Methods for High-Quality TTS Systems + + +
+ While universal vocoders have achieved proficient waveform generation across +diverse voices, their integration into text-to-speech (TTS) tasks often results +in degraded synthetic quality. To address this challenge, we present a novel +augmentation technique for training universal vocoders. Our training scheme +randomly applies linear smoothing filters to input acoustic features, +facilitating vocoder generalization across a wide range of smoothings. It +significantly mitigates the training-inference mismatch, enhancing the +naturalness of synthetic output even when the acoustic model produces overly +smoothed features. Notably, our method is applicable to any vocoder without +requiring architectural modifications or dependencies on specific acoustic +models. The experimental results validate the superiority of our vocoder over +conventional methods, achieving 11.99% and 12.05% improvements in mean opinion +scores when integrated with Tacotron 2 and FastSpeech 2 TTS acoustic models, +respectively. + +
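+
+ A minimal NumPy sketch of the augmentation idea, assuming a moving-average
+filter applied along the time axis of a mel-spectrogram; the paper only states
+that random linear smoothing filters are applied, so the filter family, width
+range, and axis here are assumptions.
+
+    import numpy as np
+
+    def smooth_features(mel, max_width=5, rng=None):
+        """Randomly smooth (n_mels, n_frames) acoustic features along time."""
+        rng = rng or np.random.default_rng()
+        width = int(rng.integers(1, max_width + 1))
+        if width == 1:
+            return mel                                  # leave some examples unsmoothed
+        kernel = np.ones(width) / width
+        return np.apply_along_axis(
+            lambda track: np.convolve(track, kernel, mode="same"), 1, mel)
+
+    augmented = smooth_features(np.random.rand(80, 200))
+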
+
+ comment: 4 pages, 4 figures, for demo samples, see + https://sytronik.github.io/demos/voc_smth_aug/ +
+
+
+
+
+ + ☆ Continual Diffuser (CoD): Mastering Continual Offline Reinforcement + Learning with Experience Rehearsal + + +
+ Artificial neural networks, especially recent diffusion-based models, have +shown remarkable superiority in gaming, control, and QA systems, where the +training tasks' datasets are usually static. However, in real-world +applications, such as robotic control of reinforcement learning (RL), the tasks +are changing, and new tasks arise in a sequential order. This situation poses +the new challenge of plasticity-stability trade-off for training an agent who +can adapt to task changes and retain acquired knowledge. In view of this, we +propose a rehearsal-based continual diffusion model, called Continual Diffuser +(CoD), to endow the diffuser with the capabilities of quick adaptation +(plasticity) and lasting retention (stability). Specifically, we first +construct an offline benchmark that contains 90 tasks from multiple domains. +Then, we train the CoD on each task with sequential modeling and conditional +generation for making decisions. Next, we preserve a small portion of previous +datasets as the rehearsal buffer and replay it to retain the acquired +knowledge. Extensive experiments on a series of tasks show CoD can achieve a +promising plasticity-stability trade-off and outperform existing +diffusion-based methods and other representative baselines on most tasks. + +
+
+
+
+
+ + ☆ CoAst: Validation-Free Contribution Assessment for Federated Learning + based on Cross-Round Valuation + + +
+ In the federated learning (FL) process, since the data held by each +participant is different, it is necessary to figure out which participant has a +higher contribution to the model performance. Effective contribution assessment +can help motivate data owners to participate in the FL training. Research works +in this field can be divided into two directions based on whether a validation +dataset is required. Validation-based methods need to use representative +validation data to measure the model accuracy, which is difficult to obtain in +practical FL scenarios. Existing validation-free methods assess the +contribution based on the parameters and gradients of local models and the +global model in a single training round, which is easily compromised by the +stochasticity of model training. In this work, we propose CoAst, a practical +method to assess the FL participants' contribution without access to any +validation data. The core idea of CoAst involves two aspects: one is to only +count the most important part of model parameters through a weights +quantization, and the other is a cross-round valuation based on the similarity +between the current local parameters and the global parameter updates in +several subsequent communication rounds. Extensive experiments show that CoAst +has comparable assessment reliability to existing validation-based methods and +outperforms existing validation-free methods. + +
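+
+ A rough NumPy sketch of the cross-round idea only (the top-fraction masking
+below is a crude stand-in for the paper's weight quantization, and the
+cosine-similarity scoring is an assumption about the valuation):
+
+    import numpy as np
+
+    def contribution_scores(local_updates, later_global_updates, keep_ratio=0.1):
+        """Score each participant's flattened update against later global updates."""
+        scores = []
+        for delta in local_updates:
+            k = max(1, int(keep_ratio * delta.size))
+            mask = np.zeros_like(delta)
+            top = np.argpartition(np.abs(delta), -k)[-k:]
+            mask[top] = np.sign(delta[top])             # keep only the most important weights
+            sims = [np.dot(mask, g) / (np.linalg.norm(mask) * np.linalg.norm(g) + 1e-12)
+                    for g in later_global_updates]      # agreement with future global moves
+            scores.append(float(np.mean(sims)))
+        return scores
+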
+
+
+
+
+ + ☆ Reliable Deep Diffusion Tensor Estimation: Rethinking the Power of + Data-Driven Optimization Routine + + +
+ Diffusion tensor imaging (DTI) holds significant importance in clinical
+diagnosis and neuroscience research. However, conventional model-based fitting
+methods often suffer from sensitivity to noise, leading to decreased accuracy
+in estimating DTI parameters. While traditional data-driven deep learning
+methods have shown potential in terms of accuracy and efficiency, their limited
+generalization to out-of-training-distribution data impedes their broader
+application due to the diverse scan protocols used across centers, scanners,
+and studies. This work aims to tackle these challenges and promote the use of
+DTI by introducing a data-driven optimization-based method termed DoDTI. DoDTI
+combines the weighted linear least squares fitting algorithm with the
+regularization-by-denoising technique. The former fits DW images from diverse
+acquisition settings into a diffusion tensor field, while the latter applies a
+deep learning-based denoiser to regularize the diffusion tensor field instead
+of the DW images, freeing the method from the fixed-channel assignment
+limitation of the network. The optimization objective is solved using the
+alternating direction method of multipliers and then unrolled to construct a
+deep neural network, leveraging a data-driven strategy to learn the network
+parameters. Extensive validation experiments are conducted utilizing both
+internally simulated datasets and externally obtained in-vivo datasets. The
+results, encompassing both qualitative and quantitative analyses, show that the
+proposed method attains state-of-the-art performance in DTI parameter
+estimation. Notably, it demonstrates superior generalization, accuracy, and
+efficiency, rendering it highly reliable for widespread application in the
+field.
+
+&#13;
+
+
+
+
+ + ☆ Adversarial Attacks on Machine Learning-Aided Visualizations + + +
+ Research in ML4VIS investigates how to use machine learning (ML) techniques +to generate visualizations, and the field is rapidly growing with high societal +impact. However, as with any computational pipeline that employs ML processes, +ML4VIS approaches are susceptible to a range of ML-specific adversarial +attacks. These attacks can manipulate visualization generations, causing +analysts to be tricked and their judgments to be impaired. Due to a lack of +synthesis from both visualization and ML perspectives, this security aspect is +largely overlooked by the current ML4VIS literature. To bridge this gap, we +investigate the potential vulnerabilities of ML-aided visualizations from +adversarial attacks using a holistic lens of both visualization and ML +perspectives. We first identify the attack surface (i.e., attack entry points) +that is unique in ML-aided visualizations. We then exemplify five different +adversarial attacks. These examples highlight the range of possible attacks +when considering the attack surface and multiple different adversary +capabilities. Our results show that adversaries can induce various attacks, +such as creating arbitrary and deceptive visualizations, by systematically +identifying input attributes that are influential in ML inferences. Based on +our observations of the attack surface characteristics and the attack examples, +we underline the importance of comprehensive studies of security issues and +defense mechanisms as a call of urgency for the ML4VIS community. + +
+
+ comment: This is the author's version of the article that has been accepted by + the Journal of Visualization +
+
+
+
+
+ + ☆ Volumetric Surfaces: Representing Fuzzy Geometries with Multiple Meshes + + +
+ High-quality real-time view synthesis methods are based on volume rendering, +splatting, or surface rendering. While surface-based methods generally are the +fastest, they cannot faithfully model fuzzy geometry like hair. In turn, +alpha-blending techniques excel at representing fuzzy materials but require an +unbounded number of samples per ray (P1). Further overheads are induced by +empty space skipping in volume rendering (P2) and sorting input primitives in +splatting (P3). These problems are exacerbated on low-performance graphics +hardware, e.g. on mobile devices. We present a novel representation for +real-time view synthesis where the (P1) number of sampling locations is small +and bounded, (P2) sampling locations are efficiently found via rasterization, +and (P3) rendering is sorting-free. We achieve this by representing objects as +semi-transparent multi-layer meshes, rendered in fixed layer order from +outermost to innermost. We model mesh layers as SDF shells with optimal spacing +learned during training. After baking, we fit UV textures to the corresponding +meshes. We show that our method can represent challenging fuzzy objects while +achieving higher frame rates than volume-based and splatting-based methods on +low-end and mobile devices. + +
+
+
+
+
+ + ☆ Demographic parity in regression and classification within the + unawareness framework + + +
+ This paper explores the theoretical foundations of fair regression under the +constraint of demographic parity within the unawareness framework, where +disparate treatment is prohibited, extending existing results where such +treatment is permitted. Specifically, we aim to characterize the optimal fair +regression function when minimizing the quadratic loss. Our results reveal that +this function is given by the solution to a barycenter problem with optimal +transport costs. Additionally, we study the connection between optimal fair +cost-sensitive classification, and optimal fair regression. We demonstrate that +nestedness of the decision sets of the classifiers is both necessary and +sufficient to establish a form of equivalence between classification and +regression. Under this nestedness assumption, the optimal classifiers can be +derived by applying thresholds to the optimal fair regression function; +conversely, the optimal fair regression function is characterized by the family +of cost-sensitive classifiers. + +
+
+
+
+
+ + ☆ ForeCal: Random Forest-based Calibration for DNNs + + +
+ Deep neural network (DNN) based classifiers do extremely well in
+discriminating between observations, resulting in higher ROC AUC and accuracy
+metrics, but their outputs are often miscalibrated with respect to true event
+likelihoods. Post-hoc calibration algorithms are often used to calibrate the
+outputs of these classifiers. Methods like Isotonic regression, Platt scaling,
+and Temperature scaling have been shown to be effective in some cases but are
+limited by their parametric assumptions and/or their inability to capture
+complex non-linear relationships. We propose ForeCal, a novel post-hoc
+calibration algorithm based on Random forests. ForeCal exploits two unique
+properties of Random forests: the ability to enforce weak monotonicity and
+range-preservation. It is more powerful in achieving calibration than current
+state-of-the-art methods, is non-parametric, and can incorporate exogenous
+information as features to learn a better calibration function. Through
+experiments on 43 diverse datasets from the UCI ML repository, we show that
+ForeCal outperforms existing methods in terms of Expected Calibration Error
+(ECE) with minimal impact on the discriminative power of the base DNN as
+measured by AUC.
+
+&#13;
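+
+ A hedged scikit-learn sketch of the general recipe (it omits the paper's weak
+monotonicity and range-preservation machinery and any exogenous features, so it
+is not ForeCal itself): fit a random forest from uncalibrated scores to
+observed outcomes, then measure ECE.
+
+    import numpy as np
+    from sklearn.ensemble import RandomForestRegressor
+
+    def fit_rf_calibrator(p_uncal, y):
+        """Map uncalibrated DNN scores to outcome frequencies with a random forest."""
+        rf = RandomForestRegressor(n_estimators=200, min_samples_leaf=50)
+        rf.fit(np.asarray(p_uncal).reshape(-1, 1), y)
+        return lambda p: np.clip(rf.predict(np.asarray(p).reshape(-1, 1)), 0.0, 1.0)
+
+    def expected_calibration_error(p, y, bins=10):
+        p, y = np.asarray(p), np.asarray(y)
+        idx = np.clip(np.digitize(p, np.linspace(0, 1, bins + 1)) - 1, 0, bins - 1)
+        return sum((idx == b).mean() * abs(p[idx == b].mean() - y[idx == b].mean())
+                   for b in range(bins) if (idx == b).any())
+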
+
+
+
+
+ + ☆ Adversarial Learning for Neural PDE Solvers with Sparse Data + + +
+ Neural network solvers for partial differential equations (PDEs) have made +significant progress, yet they continue to face challenges related to data +scarcity and model robustness. Traditional data augmentation methods, which +leverage symmetry or invariance, impose strong assumptions on physical systems +that often do not hold in dynamic and complex real-world applications. To +address this research gap, this study introduces a universal learning strategy +for neural network PDEs, named Systematic Model Augmentation for Robust +Training (SMART). By focusing on challenging and improving the model's +weaknesses, SMART reduces generalization error during training under +data-scarce conditions, leading to significant improvements in prediction +accuracy across various PDE scenarios. The effectiveness of the proposed method +is demonstrated through both theoretical analysis and extensive +experimentation. The code will be available. + +
+
+
+
+
+
+  ☆ Transfer-based Adversarial Poisoning Attacks for Online (MIMO-)Deep
+  Receivers
+
+
+&#13;
+ Recently, the design of wireless receivers using deep neural networks (DNNs),
+known as deep receivers, has attracted extensive attention for ensuring
+reliable communication in complex channel environments. To adapt quickly to
+dynamic channels, online learning has been adopted to update the weights of
+deep receivers with over-the-air data (e.g., pilots). However, the fragility of
+neural models and the openness of wireless channels expose these systems to
+malicious attacks. To this end, understanding these attack methods is essential
+for robust receiver design. In this paper, we propose a transfer-based
+adversarial poisoning attack method for online receivers. Without knowledge of
+the attack target, adversarial perturbations are injected into the pilots,
+poisoning the online deep receiver and impairing its ability to adapt to
+dynamic channels and nonlinear effects. In particular, our attack method
+targets Deep Soft Interference Cancellation (DeepSIC) [1] using online
+meta-learning. As a classical model-driven deep receiver, DeepSIC incorporates
+wireless domain knowledge into its architecture. This integration allows it to
+adapt efficiently to time-varying channels with only a small number of pilots,
+achieving optimal performance in a multi-input and multi-output (MIMO)
+scenario. The deep receiver in this scenario has a number of applications in
+the field of wireless communication, which motivates our study of the attack
+methods targeting it. Specifically, we demonstrate the effectiveness of our
+attack in simulations on synthetic linear, synthetic nonlinear, static, and
+COST 2100 channels. Simulation results indicate that the proposed poisoning
+attack significantly reduces the performance of online receivers in rapidly
+changing scenarios.
+
+&#13;
+
+ comment: 15 pages, 14 figures +
+
+
+
+
+ + ☆ Large Language Models as Efficient Reward Function Searchers for + Custom-Environment Multi-Objective Reinforcement Learning + + +
+ Leveraging large language models (LLMs) for designing reward functions
+demonstrates significant potential. However, achieving effective design and
+improvement of reward functions in reinforcement learning (RL) tasks with
+complex custom environments and multiple requirements presents considerable
+challenges. In this paper, we enable LLMs to be effective white-box searchers,
+highlighting their advanced semantic understanding capabilities. Specifically,
+we generate reward components for each explicit user requirement and employ the
+reward critic to identify the correct code form. Then, LLMs assign weights to
+the reward components to balance their values and iteratively search and
+optimize these weights based on the context provided by the training log
+analyzer, while adaptively determining the search step size. We applied the
+framework to an underwater information collection RL task without direct human
+feedback or reward examples (zero-shot). The reward critic successfully
+corrects the reward code with only one piece of feedback per requirement,
+effectively preventing irreparable errors that can occur when reward function
+feedback is provided in aggregate. The effective initialization of weights
+enables the acquisition of different reward functions within the Pareto
+solution set without weight search. Even in the case where a weight is 100
+times off, fewer than four iterations are needed to obtain solutions that meet
+user requirements. The framework also works well with most prompts utilizing
+GPT-3.5 Turbo, since it does not require advanced numerical understanding or
+calculation.
+
+&#13;
+
+
+
+
+ + ☆ Diffusion Models Learn Low-Dimensional Distributions via Subspace + Clustering + + +
+ Recent empirical studies have demonstrated that diffusion models can +effectively learn the image distribution and generate new samples. Remarkably, +these models can achieve this even with a small number of training samples +despite a large image dimension, circumventing the curse of dimensionality. In +this work, we provide theoretical insights into this phenomenon by leveraging +key empirical observations: (i) the low intrinsic dimensionality of image data, +(ii) a union of manifold structure of image data, and (iii) the low-rank +property of the denoising autoencoder in trained diffusion models. These +observations motivate us to assume the underlying data distribution of image +data as a mixture of low-rank Gaussians and to parameterize the denoising +autoencoder as a low-rank model according to the score function of the assumed +distribution. With these setups, we rigorously show that optimizing the +training loss of diffusion models is equivalent to solving the canonical +subspace clustering problem over the training samples. Based on this +equivalence, we further show that the minimal number of samples required to +learn the underlying distribution scales linearly with the intrinsic dimensions +under the above data and model assumptions. This insight sheds light on why +diffusion models can break the curse of dimensionality and exhibit the phase +transition in learning distributions. Moreover, we empirically establish a +correspondence between the subspaces and the semantic representations of image +data, facilitating image editing. We validate these results with corroborated +experimental results on both simulated distributions and image datasets. + +
+
+ comment: 39 pages, 9 figures +
+
+
+
+
+ + ☆ Deep Adaptive Interest Network: Personalized Recommendation with + Context-Aware Learning + + +
+ In personalized recommendation systems, accurately capturing users' evolving +interests and combining them with contextual information is a critical research +area. This paper proposes a novel model called the Deep Adaptive Interest +Network (DAIN), which dynamically models users' interests while incorporating +context-aware learning mechanisms to achieve precise and adaptive personalized +recommendations. DAIN leverages deep learning techniques to build an adaptive +interest network structure that can capture users' interest changes in +real-time while further optimizing recommendation results by integrating +contextual information. Experiments conducted on several public datasets +demonstrate that DAIN excels in both recommendation performance and +computational efficiency. This research not only provides a new solution for +personalized recommendation systems but also offers fresh insights into the +application of context-aware learning in recommendation systems. + +
+
+
+
+
+ + ☆ Relative-Translation Invariant Wasserstein Distance + + +
+ We introduce a new family of distances, relative-translation invariant +Wasserstein distances ($RW_p$), for measuring the similarity of two probability +distributions under distribution shift. Generalizing it from the classical +optimal transport model, we show that $RW_p$ distances are also real distance +metrics defined on the quotient set $\mathcal{P}_p(\mathbb{R}^n)/\sim$ and +invariant to distribution translations. When $p=2$, the $RW_2$ distance enjoys +more exciting properties, including decomposability of the optimal transport +model, translation-invariance of the $RW_2$ distance, and a Pythagorean +relationship between $RW_2$ and the classical quadratic Wasserstein distance +($W_2$). Based on these properties, we show that a distribution shift, measured +by $W_2$ distance, can be explained in the bias-variance perspective. In +addition, we propose a variant of the Sinkhorn algorithm, named $RW_2$ Sinkhorn +algorithm, for efficiently calculating $RW_2$ distance, coupling solutions, as +well as $W_2$ distance. We also provide the analysis of numerical stability and +time complexity for the proposed algorithm. Finally, we validate the $RW_2$ +distance metric and the algorithm performance with three experiments. We +conduct one numerical validation for the $RW_2$ Sinkhorn algorithm and show two +real-world applications demonstrating the effectiveness of using $RW_2$ under +distribution shift: digits recognition and similar thunderstorm detection. The +experimental results report that our proposed algorithm significantly improves +the computational efficiency of Sinkhorn in certain practical applications, and +the $RW_2$ distance is robust to distribution translations compared with +baselines. + +
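+
+ A small numerical check of the translation-invariance property using the POT
+library; computing RW_2 by mean-centring both samples is our reading of the
+stated Pythagorean relationship, not necessarily the authors' RW_2 Sinkhorn
+algorithm.
+
+    import numpy as np
+    import ot  # POT: Python Optimal Transport
+
+    def w2_squared(xs, xt):
+        """Squared W2 between two uniform empirical distributions."""
+        a = np.full(len(xs), 1.0 / len(xs))
+        b = np.full(len(xt), 1.0 / len(xt))
+        return ot.emd2(a, b, ot.dist(xs, xt))       # ot.dist defaults to squared Euclidean
+
+    def rw2_squared(xs, xt):
+        """RW2 via centring, so that W2^2 = RW2^2 + ||mean shift||^2."""
+        return w2_squared(xs - xs.mean(0), xt - xt.mean(0))
+
+    xs = np.random.randn(200, 2)
+    xt = np.random.randn(200, 2) + np.array([5.0, 0.0])   # pure translation
+    print(w2_squared(xs, xt), rw2_squared(xs, xt))        # RW2 ignores the shift
+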
+
+
+
+
+ + ☆ Abstractive Text Summarization: State of the Art, Challenges, and + Improvements + + +
+ Specifically focusing on the landscape of abstractive text summarization, as +opposed to extractive techniques, this survey presents a comprehensive +overview, delving into state-of-the-art techniques, prevailing challenges, and +prospective research directions. We categorize the techniques into traditional +sequence-to-sequence models, pre-trained large language models, reinforcement +learning, hierarchical methods, and multi-modal summarization. Unlike prior +works that did not examine complexities, scalability and comparisons of +techniques in detail, this review takes a comprehensive approach encompassing +state-of-the-art methods, challenges, solutions, comparisons, limitations and +charts out future improvements - providing researchers an extensive overview to +advance abstractive summarization research. We provide vital comparison tables +across techniques categorized - offering insights into model complexity, +scalability and appropriate applications. The paper highlights challenges such +as inadequate meaning representation, factual consistency, controllable text +summarization, cross-lingual summarization, and evaluation metrics, among +others. Solutions leveraging knowledge incorporation and other innovative +strategies are proposed to address these challenges. The paper concludes by +highlighting emerging research areas like factual inconsistency, +domain-specific, cross-lingual, multilingual, and long-document summarization, +as well as handling noisy data. Our objective is to provide researchers and +practitioners with a structured overview of the domain, enabling them to better +understand the current landscape and identify potential areas for further +research and improvement. + +
+
+ comment: 9 Tables, 7 Figures +
+
+
+
+
+ + ☆ Adaptive Class Emergence Training: Enhancing Neural Network Stability + and Generalization through Progressive Target Evolution + + +
+ Recent advancements in artificial intelligence, particularly deep neural +networks, have pushed the boundaries of what is achievable in complex tasks. +Traditional methods for training neural networks in classification problems +often rely on static target outputs, such as one-hot encoded vectors, which can +lead to unstable optimization and difficulties in handling non-linearities +within data. In this paper, we propose a novel training methodology that +progressively evolves the target outputs from a null vector to one-hot encoded +vectors throughout the training process. This gradual transition allows the +network to adapt more smoothly to the increasing complexity of the +classification task, maintaining an equilibrium state that reduces the risk of +overfitting and enhances generalization. Our approach, inspired by concepts +from structural equilibrium in finite element analysis, has been validated +through extensive experiments on both synthetic and real-world datasets. The +results demonstrate that our method achieves faster convergence, improved +accuracy, and better generalization, especially in scenarios with high data +complexity and noise. This progressive training framework offers a robust +alternative to classical methods, opening new perspectives for more efficient +and stable neural network training. + +
+
+ comment: 15 pages, 9 figures, 2 tables +
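+
+ A minimal sketch of the progressive-target idea described in the abstract
+above, written as generic PyTorch training code: the regression target for
+each example is interpolated from the null vector toward its one-hot vector as
+training progresses. The linear ramp, the MSE loss, and the 80% emergence
+point are illustrative assumptions, not details taken from the paper.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ def evolved_targets(labels, num_classes, progress):
+     """Interpolate targets from the zero vector (progress=0) to one-hot
+     vectors (progress=1); the linear schedule is an assumption."""
+     one_hot = nn.functional.one_hot(labels, num_classes).float()
+     return progress * one_hot
+
+ model = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 5))
+ opt = torch.optim.Adam(model.parameters(), lr=1e-3)
+ loss_fn = nn.MSELoss()              # regress onto the evolving target
+
+ x = torch.randn(256, 20)
+ y = torch.randint(0, 5, (256,))
+ epochs = 50
+ for epoch in range(epochs):
+     progress = min(1.0, (epoch + 1) / (0.8 * epochs))  # fully one-hot by 80%
+     target = evolved_targets(y, 5, progress)
+     opt.zero_grad()
+     loss_fn(model(x), target).backward()
+     opt.step()
+ ```
+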
+
+
+
+
+ + ☆ Learning Privacy-Preserving Student Networks via + Discriminative-Generative Distillation + + +
+ While deep models have proved successful in learning rich knowledge from
+massive well-annotated data, they may pose a privacy leakage risk in practical
+deployment. It is necessary to find an effective trade-off between high utility
+and strong privacy. In this work, we propose a discriminative-generative
+distillation approach to learn privacy-preserving deep models. Our key idea is
+to use models as a bridge to distill knowledge from private data and then
+transfer it to a student network via two streams. First, the discriminative
+stream trains a baseline classifier on private data and an ensemble of teachers
+on multiple disjoint private subsets, respectively. Then, the generative stream
+takes the classifier as a fixed discriminator and trains a generator in a
+data-free manner. After that, the generator is used to produce a large amount
+of synthetic data, which is further used to train a variational autoencoder
+(VAE). A few of these synthetic samples are fed into the teacher ensemble to
+query labels via differentially private aggregation, while most of them are
+embedded into the trained VAE for reconstructing synthetic data. Finally,
+semi-supervised student learning is performed to simultaneously handle two
+tasks: knowledge transfer from the teachers with distillation on the few
+privately labeled synthetic samples, and knowledge enhancement with
+tangent-normal adversarial regularization on many triples of reconstructed
+synthetic data. In this way, our approach can control the query cost over
+private data and mitigate accuracy degradation in a unified manner, leading to
+a privacy-preserving student model. Extensive experiments and analysis clearly
+show the effectiveness of the proposed approach.
+
+
+ comment: This paper is accepted by IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ☆ Building Math Agents with Multi-Turn Iterative Preference Learning + + +
+ Recent studies have shown that large language models' (LLMs) mathematical +problem-solving capabilities can be enhanced by integrating external tools, +such as code interpreters, and employing multi-turn Chain-of-Thought (CoT) +reasoning. While current methods focus on synthetic data generation and +Supervised Fine-Tuning (SFT), this paper studies the complementary direct +preference learning approach to further improve model performance. However, +existing direct preference learning algorithms are originally designed for the +single-turn chat task, and do not fully address the complexities of multi-turn +reasoning and external tool integration required for tool-integrated +mathematical reasoning tasks. To fill in this gap, we introduce a multi-turn +direct preference learning framework, tailored for this context, that leverages +feedback from code interpreters and optimizes trajectory-level preferences. +This framework includes multi-turn DPO and multi-turn KTO as specific +implementations. The effectiveness of our framework is validated through +training of various language models using an augmented prompt set from the +GSM8K and MATH datasets. Our results demonstrate substantial improvements: a +supervised fine-tuned Gemma-1.1-it-7B model's performance increased from 77.5% +to 83.9% on GSM8K and from 46.1% to 51.2% on MATH. Similarly, a Gemma-2-it-9B +model improved from 84.1% to 86.3% on GSM8K and from 51.0% to 54.5% on MATH. + +
+
+ comment: A multi-turn direct preference learning framework for tool-integrated + reasoning tasks +
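+
+ For reference, the sketch below shows a trajectory-level direct preference
+loss in the spirit described above: the standard DPO objective applied to
+summed per-turn log-probabilities, with a mask so that only model-generated
+turns (not code-interpreter outputs) contribute. The tensor layout, masking
+convention, and beta value are illustrative assumptions rather than the
+paper's exact formulation.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def trajectory_logp(per_turn_logps, mask):
+     """Sum log-probabilities over model-generated turns only."""
+     return (per_turn_logps * mask).sum(-1)
+
+ def multi_turn_dpo_loss(pol_w, pol_l, ref_w, ref_l, mask_w, mask_l, beta=0.1):
+     """DPO loss on whole preferred (w) vs. rejected (l) trajectories;
+     pol_*/ref_* hold per-turn log-probs under the policy / frozen reference."""
+     margin = (trajectory_logp(pol_w, mask_w) - trajectory_logp(ref_w, mask_w)) \
+            - (trajectory_logp(pol_l, mask_l) - trajectory_logp(ref_l, mask_l))
+     return -F.logsigmoid(beta * margin).mean()
+
+ # toy example: a batch of 4 preference pairs, up to 3 model turns each
+ B, T = 4, 3
+ pol_w, pol_l = -torch.rand(B, T), -torch.rand(B, T) - 0.5
+ ref_w, ref_l = -torch.rand(B, T), -torch.rand(B, T)
+ mask = torch.ones(B, T)
+ print(multi_turn_dpo_loss(pol_w, pol_l, ref_w, ref_l, mask, mask))
+ ```
+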
+
+
+
+
+ + ☆ Gaussian Rate-Distortion-Perception Coding and Entropy-Constrained + Scalar Quantization + + +
+ This paper investigates the best known bounds on the quadratic Gaussian +distortion-rate-perception function with limited common randomness for the +Kullback-Leibler divergence-based perception measure, as well as their +counterparts for the squared Wasserstein-2 distance-based perception measure, +recently established by Xie et al. These bounds are shown to be nondegenerate +in the sense that they cannot be deduced from each other via a refined version +of Talagrand's transportation inequality. On the other hand, an improved lower +bound is established when the perception measure is given by the squared +Wasserstein-2 distance. In addition, it is revealed by exploiting the +connection between rate-distortion-perception coding and entropy-constrained +scalar quantization that all the aforementioned bounds are generally not tight +in the weak perception constraint regime. + +
+
+
+
+
+ + ☆ Exploring Low-Dimensional Subspaces in Diffusion Models for Controllable + Image Editing + + +
+ Recently, diffusion models have emerged as a powerful class of generative +models. Despite their success, there is still limited understanding of their +semantic spaces. This makes it challenging to achieve precise and disentangled +image generation without additional training, especially in an unsupervised +way. In this work, we improve the understanding of their semantic spaces from +intriguing observations: among a certain range of noise levels, (1) the learned +posterior mean predictor (PMP) in the diffusion model is locally linear, and +(2) the singular vectors of its Jacobian lie in low-dimensional semantic +subspaces. We provide a solid theoretical basis to justify the linearity and +low-rankness in the PMP. These insights allow us to propose an unsupervised, +single-step, training-free LOw-rank COntrollable image editing (LOCO Edit) +method for precise local editing in diffusion models. LOCO Edit identified +editing directions with nice properties: homogeneity, transferability, +composability, and linearity. These properties of LOCO Edit benefit greatly +from the low-dimensional semantic subspace. Our method can further be extended +to unsupervised or text-supervised editing in various text-to-image diffusion +models (T-LOCO Edit). Finally, extensive empirical experiments demonstrate the +effectiveness and efficiency of LOCO Edit. The codes will be released at +https://github.com/ChicyChen/LOCO-Edit. + +
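+
+ The core recipe above (take singular vectors of the Jacobian of the posterior
+mean predictor as edit directions) can be sketched in a few lines. The `pmp`
+function below is a stand-in toy map; in practice it would wrap a pretrained
+diffusion denoiser at a chosen noise level, and the step size and direction
+selection here are assumptions rather than the paper's procedure.
+
+ ```python
+ import torch
+
+ def pmp(x_t):
+     """Stand-in for the posterior mean predictor x0_hat(x_t)."""
+     A = torch.tensor([[2.0, 0.0, 0.0],
+                       [0.0, 1.0, 0.5],
+                       [0.0, 0.5, 1.0]])
+     return torch.tanh(x_t @ A.T)
+
+ x_t = torch.randn(3)
+ J = torch.autograd.functional.jacobian(pmp, x_t)   # d x0_hat / d x_t
+ U, S, Vh = torch.linalg.svd(J)
+ edit_direction = Vh[0]                             # top right-singular vector
+ x_edited = x_t + 0.5 * edit_direction              # single-step, training-free edit
+ ```
+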
+
+
+
+
+ + ☆ Optimal Neural Network Approximation for High-Dimensional Continuous + Functions + + +
+ Recently, Shen, Yang, and Zhang (JMLR, 2022) developed a neural network with
+width $36d(2d + 1)$ and depth $11$, which utilizes a special activation
+function called the elementary universal activation function, to achieve the
+super approximation property for functions in $C([a,b]^d)$. That is, the
+constructed network only requires a fixed number of neurons to approximate a
+$d$-variate continuous function on a $d$-dimensional hypercube with arbitrary
+accuracy. Their network uses $\mathcal{O}(d^2)$ fixed neurons. One natural
+question to address is whether we can reduce the number of these neurons in
+such a network. By leveraging a variant of the Kolmogorov Superposition
+Theorem, our analysis shows that there is a neural network generated by the
+elementary universal activation function with only $366d + 365$ fixed,
+intrinsic (non-repeated) neurons that attains this super approximation
+property. Furthermore, we present a family of continuous functions that
+requires at least width $d$, and therefore at least $d$ intrinsic neurons, to
+achieve arbitrary accuracy in its approximation. This shows that the
+requirement of $\mathcal{O}(d)$ intrinsic neurons is optimal in the sense that
+it grows linearly with the input dimension $d$, unlike some approximation
+methods where parameters may grow exponentially with $d$.
+
+
+
+
+
+ + ☆ Machine Learning Applications to Computational Plasma Physics and + Reduced-Order Plasma Modeling: A Perspective + + +
+ Machine learning (ML) provides a broad spectrum of tools and architectures +that enable the transformation of data from simulations and experiments into +useful and explainable science, thereby augmenting domain knowledge. +Furthermore, ML-enhanced numerical modelling can revamp scientific computing +for real-world complex engineering systems, creating unique opportunities to +examine the operation of the technologies in detail and automate their +optimization and control. In recent years, ML applications have seen +significant growth across various scientific domains, particularly in fluid +mechanics, where ML has shown great promise in enhancing computational modeling +of fluid flows. In contrast, ML applications in numerical plasma physics +research remain relatively limited in scope and extent. Despite this, the close +relationship between fluid mechanics and plasma physics presents a valuable +opportunity to create a roadmap for transferring ML advances in fluid flow +modeling to computational plasma physics. This Perspective aims to outline such +a roadmap. We begin by discussing some general fundamental aspects of ML, +including the various categories of ML algorithms and the different types of +problems that can be solved with the help of ML. With regard to each problem +type, we then present specific examples from the use of ML in computational +fluid dynamics, reviewing several insightful prior efforts. We also review +recent ML applications in plasma physics for each problem type. The paper +discusses promising future directions and development pathways for ML in plasma +modelling within the different application areas. Additionally, we point out +prominent challenges that must be addressed to realize ML's full potential in +computational plasma physics, including the need for cost-effective +high-fidelity simulation tools for extensive data generation. + +
+
+ comment: 42 pages, 20 figures +
+
+
+
+
+ + ☆ Understanding the Role of Functional Diversity in Weight-Ensembling with + Ingredient Selection and Multidimensional Scaling ICML 2024 + + +
+ Weight-ensembles are formed when the parameters of multiple neural networks
+are directly averaged into a single model. They have demonstrated
+in-distribution (ID) and out-of-distribution (OOD) generalization capability
+that is not completely understood, though they are thought to successfully
+exploit the functional diversity allotted by each distinct model. Given a
+collection of models, it is also unclear which combination leads to the optimal
+weight-ensemble; the SOTA is a linear-time "greedy" method. We introduce two
+novel weight-ensembling approaches to study the link between performance
+dynamics and how each method decides to apply the functionally diverse
+components, akin to diversity-encouragement in the prediction-ensemble
+literature. We develop a visualization tool to explain how each algorithm
+explores various domains defined via pairwise distances, to further investigate
+selection and the algorithms' convergence. Empirical analyses shed light on how
+high diversity enhances weight-ensembling while qualifying the extent to which
+diversity alone improves accuracy. We also demonstrate that sampling
+positionally distinct models can contribute just as meaningfully to
+improvements in a weight-ensemble.
+
+
+ comment: Published at the ICML 2024 (Vienna, Austria) Workshop on Foundation + Models in the Wild +
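+
+ For readers unfamiliar with the linear-time "greedy" baseline mentioned in the
+abstract above, the sketch below shows uniform weight averaging plus
+model-soup-style greedy ingredient selection. The `evaluate` callback (held-out
+accuracy of a candidate state dict) is a user-supplied placeholder; the paper's
+own two approaches are not reproduced here.
+
+ ```python
+ import copy
+ import torch
+
+ def average_state_dicts(state_dicts):
+     """Uniformly average several checkpoints into one weight-ensemble."""
+     avg = copy.deepcopy(state_dicts[0])
+     for key in avg:
+         avg[key] = torch.stack([sd[key].float() for sd in state_dicts]).mean(0)
+     return avg
+
+ def greedy_soup(state_dicts, evaluate):
+     """Linear-time greedy selection: keep a checkpoint only if adding it to
+     the running average does not hurt held-out performance."""
+     order = sorted(range(len(state_dicts)),
+                    key=lambda i: evaluate(state_dicts[i]), reverse=True)
+     soup = [state_dicts[order[0]]]
+     best = evaluate(average_state_dicts(soup))
+     for i in order[1:]:
+         candidate = average_state_dicts(soup + [state_dicts[i]])
+         score = evaluate(candidate)
+         if score >= best:
+             soup.append(state_dicts[i])
+             best = score
+     return average_state_dicts(soup)
+ ```
+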
+
+
+
+
+ + ☆ Robust Federated Finetuning of Foundation Models via Alternating + Minimization of LoRA ICML2024 + + +
+ Parameter-Efficient Fine-Tuning (PEFT) has risen as an innovative training +strategy that updates only a select few model parameters, significantly +lowering both computational and memory demands. PEFT also helps to decrease +data transfer in federated learning settings, where communication depends on +the size of updates. In this work, we explore the constraints of previous +studies that integrate a well-known PEFT method named LoRA with federated +fine-tuning, then introduce RoLoRA, a robust federated fine-tuning framework +that utilizes an alternating minimization approach for LoRA, providing greater +robustness against decreasing fine-tuning parameters and increasing data +heterogeneity. Our results indicate that RoLoRA not only presents the +communication benefits but also substantially enhances the robustness and +effectiveness in multiple federated fine-tuning scenarios. + +
+
+ comment: Presented at ES-FOMO-II@ICML2024 +
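+
+ A minimal single-layer sketch of the alternating-minimization idea mentioned
+above: within each local round, the LoRA factor A is optimized with B frozen,
+then B with A frozen. The layer definition, initialization, learning rate, and
+inner step counts are illustrative assumptions; client aggregation and the rest
+of the federated pipeline are omitted.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class LoRALinear(nn.Module):
+     """Frozen dense layer plus a rank-r update: W x + B A x."""
+     def __init__(self, base: nn.Linear, r: int = 8):
+         super().__init__()
+         self.base = base.requires_grad_(False)
+         # both factors get a small random init here (an assumption) so that
+         # each alternating phase sees a nonzero gradient from the start
+         self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
+         self.B = nn.Parameter(torch.randn(base.out_features, r) * 0.01)
+
+     def forward(self, x):
+         return self.base(x) + x @ self.A.T @ self.B.T
+
+ def local_alternating_round(layer, batch, loss_fn, lr=1e-3, inner_steps=5):
+     """One client round: optimize A with B fixed, then B with A fixed."""
+     x, y = batch
+     for params in ([layer.A], [layer.B]):
+         opt = torch.optim.SGD(params, lr=lr)
+         for _ in range(inner_steps):
+             opt.zero_grad()
+             loss_fn(layer(x), y).backward()
+             opt.step()
+
+ layer = LoRALinear(nn.Linear(32, 16))
+ batch = (torch.randn(64, 32), torch.randn(64, 16))
+ local_alternating_round(layer, batch, nn.MSELoss())
+ ```
+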
+
+
+
+
+ + ☆ NUDGE: Lightweight Non-Parametric Fine-Tuning of Embeddings for + Retrieval + + +
+ $k$-Nearest Neighbor search on dense vector embeddings ($k$-NN retrieval) +from pre-trained embedding models is the predominant retrieval method for text +and images, as well as Retrieval-Augmented Generation (RAG) pipelines. In +practice, application developers often fine-tune the embeddings to improve +their accuracy on the dataset and query workload in hand. Existing approaches +either fine-tune the pre-trained model itself or, more efficiently, but at the +cost of accuracy, train adaptor models to transform the output of the +pre-trained model. We present NUDGE, a family of novel non-parametric embedding +fine-tuning approaches that are significantly more accurate and efficient than +both sets of existing approaches. NUDGE directly modifies the embeddings of +data records to maximize the accuracy of $k$-NN retrieval. We present a +thorough theoretical and experimental study of NUDGE's non-parametric approach. +We show that even though the underlying problem is NP-Hard, constrained +variations can be solved efficiently. These constraints additionally ensure +that the changes to the embeddings are modest, avoiding large distortions to +the semantics learned during pre-training. In experiments across five +pre-trained models and nine standard text and image retrieval datasets, NUDGE +runs in minutes and often improves NDCG@10 by more than 10% over existing +fine-tuning methods. On average, NUDGE provides 3.3x and 4.3x higher increase +in accuracy and runs 200x and 3x faster, respectively, over fine-tuning the +pre-trained model and training adaptors. + +
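+
+ To make the "directly modifies the embeddings of data records" idea above
+concrete, here is a rough gradient-based sketch: data embeddings are nudged
+toward the training queries that should retrieve them, with each change kept
+inside a small norm ball. This is only an illustration of the non-parametric
+idea; the actual NUDGE variants solve constrained problems whose details
+differ, and all names and hyperparameters below are assumptions.
+
+ ```python
+ import numpy as np
+
+ def nudge_embeddings(data_emb, query_emb, positive_idx,
+                      lr=0.05, steps=50, max_shift=0.2):
+     """Adjust data embeddings to raise dot-product similarity with their
+     positive training queries, constrained to a ball around the originals."""
+     original = data_emb.copy()
+     emb = data_emb.copy()
+     for _ in range(steps):
+         grad = np.zeros_like(emb)
+         np.add.at(grad, positive_idx, query_emb)   # d(similarity)/d(embedding)
+         emb += lr * grad
+         delta = emb - original                     # project back into the ball
+         norms = np.linalg.norm(delta, axis=1, keepdims=True)
+         scale = np.minimum(1.0, max_shift / np.maximum(norms, 1e-12))
+         emb = original + delta * scale
+     # unit-normalize, as is typical for cosine-similarity retrieval
+     return emb / np.linalg.norm(emb, axis=1, keepdims=True)
+
+ rng = np.random.default_rng(0)
+ data = rng.normal(size=(1000, 64)); data /= np.linalg.norm(data, axis=1, keepdims=True)
+ queries = rng.normal(size=(100, 64)); queries /= np.linalg.norm(queries, axis=1, keepdims=True)
+ positives = rng.integers(0, 1000, size=100)   # which record each query should hit
+ tuned = nudge_embeddings(data, queries, positives)
+ ```
+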
+
+
+
+
+ + ☆ Optimal sampling for least-squares approximation + + +
+ Least-squares approximation is one of the most important methods for +recovering an unknown function from data. While in many applications the data +is fixed, in many others there is substantial freedom to choose where to +sample. In this paper, we review recent progress on optimal sampling for +(weighted) least-squares approximation in arbitrary linear spaces. We introduce +the Christoffel function as a key quantity in the analysis of (weighted) +least-squares approximation from random samples, then show how it can be used +to construct sampling strategies that possess near-optimal sample complexity: +namely, the number of samples scales log-linearly in $n$, the dimension of the +approximation space. We discuss a series of variations, extensions and further +topics, and throughout highlight connections to approximation theory, machine +learning, information-based complexity and numerical linear algebra. Finally, +motivated by various contemporary applications, we consider a generalization of +the classical setting where the samples need not be pointwise samples of a +scalar-valued function, and the approximation space need not be linear. We show +that even in this significantly more general setting suitable generalizations +of the Christoffel function still determine the sample complexity. This +provides a unified procedure for designing improved sampling strategies for +general recovery problems. This article is largely self-contained, and intended +to be accessible to nonspecialists. + +
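+
+ The Christoffel-function construction surveyed above can be illustrated in a
+one-dimensional Legendre setting: sample points in proportion to the inverse
+Christoffel function (here via simple rejection sampling) and solve a weighted
+least-squares problem with weights equal to its reciprocal. The target function
+and the specific constants are placeholders for illustration only.
+
+ ```python
+ import numpy as np
+ from numpy.polynomial import legendre
+
+ def orthonormal_legendre(x, n):
+     """First n Legendre polynomials, orthonormal w.r.t. the uniform
+     probability measure dx/2 on [-1, 1]."""
+     V = legendre.legvander(x, n - 1)
+     return V * np.sqrt(2 * np.arange(n) + 1)
+
+ def sampling_density(x, n):
+     """Optimal density relative to dx/2: k_n(x)/n, the normalized
+     inverse Christoffel function."""
+     return (orthonormal_legendre(x, n) ** 2).sum(1) / n
+
+ def sample_optimal(n, m, rng):
+     """Rejection-sample m points on [-1, 1] from the density above."""
+     bound = sampling_density(np.linspace(-1, 1, 2000), n).max() * 1.05
+     out = []
+     while len(out) < m:
+         x = rng.uniform(-1, 1, size=4 * m)
+         keep = rng.uniform(0, bound, size=x.size) < sampling_density(x, n)
+         out.extend(x[keep].tolist())
+     return np.array(out[:m])
+
+ rng = np.random.default_rng(0)
+ n, m = 20, 200                       # dimension of the space, number of samples
+ f = lambda x: np.exp(np.sin(3 * x))  # placeholder target function
+ x = sample_optimal(n, m, rng)
+ w = 1.0 / sampling_density(x, n)     # weights ~ n / k_n(x)
+ A = orthonormal_legendre(x, n)
+ coef, *_ = np.linalg.lstsq(A * np.sqrt(w)[:, None], f(x) * np.sqrt(w), rcond=None)
+ ```
+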
+
+
+
+
+ + ☆ Data-driven 2D stationary quantum droplets and wave propagations in the + amended GP equation with two potentials via deep neural networks learning + + +
+ In this paper, we develop a systematic deep learning approach to solve
+two-dimensional (2D) stationary quantum droplets (QDs) and investigate their
+wave propagation in the 2D amended Gross-Pitaevskii equation with
+Lee-Huang-Yang correction and two kinds of potentials. First, we use the
+initial-value iterative neural network (IINN) algorithm to obtain 2D stationary
+quantum droplets of the stationary equations. Then the learned stationary QDs
+are used as initial conditions for physics-informed neural networks (PINNs) to
+explore their evolution in a given space-time region. In particular, we
+consider two types of potentials: the 2D quadruple-well Gaussian potential and
+the PT-symmetric HO-Gaussian potential, which lead to spontaneous symmetry
+breaking and the generation of multi-component QDs. The deep learning method
+used here can also be applied to study wave propagation in other nonlinear
+physical models.
+
+
+ comment: 17 pages, 12 figures (Proc. R. Soc. A, accepted for publication). + arXiv admin note: text overlap with arXiv:2409.01124 +
+
+
+
+
+ + ☆ Subsidy design for better social outcomes + + +
+ Overcoming the impact of selfish behavior of rational players in multiagent +systems is a fundamental problem in game theory. Without any intervention from +a central agent, strategic users take actions in order to maximize their +personal utility, which can lead to extremely inefficient overall system +performance, often indicated by a high Price of Anarchy. Recent work (Lin et +al. 2021) investigated and formalized yet another undesirable behavior of +rational agents, that of avoiding freely available information about the game +for selfish reasons, leading to worse social outcomes. A central planner can +significantly mitigate these issues by injecting a subsidy to reduce certain +costs associated with the system and obtain net gains in the system +performance. Crucially, the planner needs to determine how to allocate this +subsidy effectively. + We formally show that designing subsidies that perfectly optimize the social +good, in terms of minimizing the Price of Anarchy or preventing the information +avoidance behavior, is computationally hard under standard complexity theoretic +assumptions. On the positive side, we show that we can learn provably good +values of subsidy in repeated games coming from the same domain. This +data-driven subsidy design approach avoids solving computationally hard +problems for unseen games by learning over polynomially many games. We also +show that optimal subsidy can be learned with no-regret given an online +sequence of games, under mild assumptions on the cost matrix. Our study focuses +on two distinct games: a Bayesian extension of the well-studied fair +cost-sharing game, and a component maintenance game with engineering +applications. + +
+
+ comment: 30 pages, 3 figures, 5 tables +
+
+
+
+
+ + ☆ Generative artificial intelligence for computational chemistry: a + roadmap to predicting emergent phenomena + + +
+ The recent surge in Generative Artificial Intelligence (AI) has introduced +exciting possibilities for computational chemistry. Generative AI methods have +made significant progress in sampling molecular structures across chemical +species, developing force fields, and speeding up simulations. This Perspective +offers a structured overview, beginning with the fundamental theoretical +concepts in both Generative AI and computational chemistry. It then covers +widely used Generative AI methods, including autoencoders, generative +adversarial networks, reinforcement learning, flow models and language models, +and highlights their selected applications in diverse areas including force +field development, and protein/RNA structure prediction. A key focus is on the +challenges these methods face before they become truly predictive, particularly +in predicting emergent chemical phenomena. We believe that the ultimate goal of +a simulation method or theory is to predict phenomena not seen before, and that +Generative AI should be subject to these same standards before it is deemed +useful for chemistry. We suggest that to overcome these challenges, future AI +models need to integrate core chemical principles, especially from statistical +mechanics. + +
+
+
+
+
+ + ☆ Probing self-attention in self-supervised speech models for + cross-linguistic differences + + +
+ Speech models have gained traction thanks to increases in accuracy from novel
+transformer architectures. While this impressive increase in performance across
+automatic speech recognition (ASR) benchmarks is noteworthy, there is still
+much that is unknown about the use of attention mechanisms for speech-related
+tasks. For example, while it is assumed that these models are learning
+language-independent (i.e., universal) speech representations, there has not
+yet been an in-depth exploration of what it would mean for the models to be
+language-independent. In the current paper, we explore this question within the
+realm of self-attention mechanisms of one small self-supervised speech
+transformer model (TERA). We find that even with a small model, the learned
+attention heads are diverse, ranging from almost entirely diagonal to almost
+entirely global, regardless of the training language. We highlight some notable
+differences in attention patterns between Turkish and English and demonstrate
+that the models do learn important phonological information during pretraining.
+We also present a head ablation study which shows that models across languages
+primarily rely on diagonal heads to classify phonemes.
+
+
+ comment: 10 pages, 18 figures +
+
+
+
+
+ + ☆ RoboKoop: Efficient Control Conditioned Representations from Visual + Input in Robotics using Koopman Operator + + +
+ Developing agents that can perform complex control tasks from
+high-dimensional observations is a core challenge in autonomous systems; it
+requires robust underlying task control policies and adapting the underlying
+visual representations to the task. Most existing approaches require many
+training samples and treat this problem through the lens of two-stage learning,
+with a controller learned on top of pre-trained vision models. We approach this
+problem through the lens of Koopman theory and learn visual representations
+from robotic agents conditioned on specific downstream tasks in the context of
+learning stabilizing control for the agent. We introduce a Contrastive Spectral
+Koopman Embedding network that allows us to learn efficient linearized visual
+representations from the agent's visual data in a high-dimensional latent space
+and utilizes reinforcement learning to perform off-policy control on top of the
+extracted representations with a linear controller. Our method enhances
+stability and control in gradient dynamics over time, significantly
+outperforming existing approaches by improving efficiency and accuracy in
+learning task policies over extended horizons.
+
+
+ comment: Accepted to the $8^{th}$ Conference on Robot Learning (CoRL 2024) +
+
+
+
+
+ + ☆ Leveraging Interpretability in the Transformer to Automate the Proactive + Scaling of Cloud Resources + + +
+ Modern web services adopt cloud-native principles to leverage the advantages
+of microservices. To consistently guarantee high Quality of Service (QoS)
+according to Service Level Agreements (SLAs), ensure satisfactory user
+experiences, and minimize operational costs, each microservice must be
+provisioned with the right amount of resources. However, accurately
+provisioning microservices with adequate resources is complex and depends on
+many factors, including workload intensity and the complex interconnections
+between microservices. To address this challenge, we develop a model that
+captures the relationship between end-to-end latency, requests at the
+front-end level, and resource utilization. We then use the developed model to
+predict the end-to-end latency. Our solution leverages the Temporal Fusion
+Transformer (TFT), an attention-based architecture equipped with
+interpretability features. When the prediction results indicate SLA
+non-compliance, we use the feature importance provided by the TFT as covariates
+in Kernel Ridge Regression (KRR), with the response variable being the desired
+latency, to learn the parameters associated with the feature importance. These
+learned parameters reflect the adjustments required to the features to ensure
+SLA compliance. We demonstrate the merit of our approach with a
+microservice-based application and provide a roadmap to deployment.
+
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Backdoor defense, learnability and obfuscation + + +
+ We introduce a formal notion of defendability against backdoors using a game +between an attacker and a defender. In this game, the attacker modifies a +function to behave differently on a particular input known as the "trigger", +while behaving the same almost everywhere else. The defender then attempts to +detect the trigger at evaluation time. If the defender succeeds with high +enough probability, then the function class is said to be defendable. The key +constraint on the attacker that makes defense possible is that the attacker's +strategy must work for a randomly-chosen trigger. + Our definition is simple and does not explicitly mention learning, yet we +demonstrate that it is closely connected to learnability. In the +computationally unbounded setting, we use a voting algorithm of Hanneke et al. +(2022) to show that defendability is essentially determined by the VC dimension +of the function class, in much the same way as PAC learnability. In the +computationally bounded setting, we use a similar argument to show that +efficient PAC learnability implies efficient defendability, but not conversely. +On the other hand, we use indistinguishability obfuscation to show that the +class of polynomial size circuits is not efficiently defendable. Finally, we +present polynomial size decision trees as a natural example for which defense +is strictly easier than learning. Thus, we identify efficient defendability as +a notable intermediate concept in between efficient learnability and +obfuscation. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Better Verified Explanations with Applications to Incorrectness and + Out-of-Distribution Detection + + +
+ Building on VeriX (Verified eXplainability, arXiv:2212.01051), a system for +producing optimal verified explanations for machine learning model outputs, we +present VeriX+, which significantly improves both the size and the generation +time of verified explanations. We introduce a bound propagation-based +sensitivity technique to improve the size, and a binary search-based traversal +with confidence ranking for improving time -- the two techniques are orthogonal +and can be used independently or together. We also show how to adapt the +QuickXplain (Junker 2004) algorithm to our setting to provide a trade-off +between size and time. Experimental evaluations on standard benchmarks +demonstrate significant improvements on both metrics, e.g., a size reduction of +38% on the GTSRB dataset and a time reduction of 90% on MNIST. We also explore +applications of our verified explanations and show that explanation size is a +useful proxy for both incorrectness detection and out-of-distribution +detection. + +
+
+
+
+
+ + ☆ An Introduction to Centralized Training for Decentralized Execution in + Cooperative Multi-Agent Reinforcement Learning + + +
+ Multi-agent reinforcement learning (MARL) has exploded in popularity in
+recent years. Many approaches have been developed, but they can be divided into
+three main types: centralized training and execution (CTE), centralized
+training for decentralized execution (CTDE), and decentralized training and
+execution (DTE).
+ CTDE methods are the most common as they can use centralized information
+during training but execute in a decentralized manner -- using only information
+available to that agent during execution. CTDE is the only paradigm that
+requires a separate training phase where any available information (e.g., other
+agent policies, underlying states) can be used. As a result, they can be more
+scalable than CTE methods, do not require communication during execution, and
+can often perform well. CTDE fits most naturally with the cooperative case, but
+can be potentially applied in competitive or mixed settings depending on what
+information is assumed to be observed.
+ This text is an introduction to CTDE in cooperative MARL. It is meant to
+explain the setting, basic concepts, and common methods. It does not cover all
+work in CTDE MARL as the subarea is quite extensive. I have included work that
+I believe is important for understanding the main concepts in the subarea and
+apologize to those that I have omitted.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2405.06161 +
+
+
+
+
+ + ☆ Can Your Generative Model Detect Out-of-Distribution Covariate Shift? ECCV 2024 + + +
+ Detecting Out-of-Distribution (OOD) sensory data and covariate distribution
+shift aims to identify new test examples whose high-level image statistics
+differ from those of the captured, normal, In-Distribution (ID) set. Existing
+OOD detection literature largely focuses on semantic shift with little-to-no
+consensus over covariate shift. Generative models capture the ID data in an
+unsupervised manner, enabling them to effectively identify samples that deviate
+significantly from this learned distribution, irrespective of the downstream
+task. In this work, we elucidate the ability of generative models to detect and
+quantify domain-specific covariate shift through extensive analyses that
+involve a variety of models. To this end, we conjecture that it is sufficient
+to detect the most commonly occurring sensory faults (anomalies and deviations
+in global signal statistics) by solely modeling high-frequency
+signal-dependent and independent details. We propose a novel method,
+CovariateFlow, for OOD detection, specifically tailored to covariate
+heteroscedastic high-frequency image components using conditional Normalizing
+Flows (cNFs). Our results on CIFAR10 vs. CIFAR10-C and ImageNet200 vs.
+ImageNet200-C demonstrate the effectiveness of the method by accurately
+detecting OOD covariate shift. This work contributes to enhancing the fidelity
+of imaging systems and aiding machine learning models in OOD detection in the
+presence of covariate shift.
+
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Graph Neural Networks with Limited Labeled Data by Actively + Distilling Knowledge from Large Language Models + + +
+ Graphs are pervasive in real-world applications such as social network
+analysis, bioinformatics, and knowledge graphs. Graph neural networks (GNNs)
+have shown strong ability in node classification, a fundamental task on graphs.
+Unfortunately, conventional GNNs still face challenges in scenarios with few
+labeled nodes, despite the prevalence of few-shot node classification tasks in
+real-world applications. To address this challenge, various approaches have
+been proposed, including graph meta-learning, transfer learning, and methods
+based on Large Language Models (LLMs). However, traditional meta-learning and
+transfer learning methods often require prior knowledge from base classes or
+fail to exploit the potential advantages of unlabeled nodes. Meanwhile,
+LLM-based methods may overlook the zero-shot capabilities of LLMs and rely
+heavily on the quality of generated contexts. In this paper, we propose a novel
+approach that integrates LLMs and GNNs, leveraging the zero-shot inference and
+reasoning capabilities of LLMs and employing a Graph-LLM-based active learning
+paradigm to enhance GNNs' performance. Extensive experiments demonstrate the
+effectiveness of our model in improving node classification accuracy with
+considerably limited labeled data, surpassing state-of-the-art baselines by
+significant margins.
+
+
+ comment: 10 pages, 3 Figures +
+
+
+
+
+ + ♻ ☆ Decentralized Intelligence Network (DIN) + + +
+ Decentralized Intelligence Network (DIN) is a theoretical framework designed +to address challenges in AI development, particularly focusing on data +fragmentation and siloing issues. It facilitates effective AI training within +sovereign data networks by overcoming barriers to accessing diverse data +sources, leveraging: 1) personal data stores to ensure data sovereignty, where +data remains securely within Participants' control; 2) a scalable federated +learning protocol implemented on a public blockchain for decentralized AI +training, where only model parameter updates are shared, keeping data within +the personal data stores; and 3) a scalable, trustless cryptographic rewards +mechanism on a public blockchain to incentivize participation and ensure fair +reward distribution through a decentralized auditing protocol. This approach +guarantees that no entity can prevent or control access to training data or +influence financial benefits, as coordination and reward distribution are +managed on the public blockchain with an immutable record. The framework +supports effective AI training by allowing Participants to maintain control +over their data, benefit financially, and contribute to a decentralized, +scalable ecosystem that leverages collective AI to develop beneficial +algorithms. + +
+
+ comment: 16 pages, 1 figure. DIN was presented by the author as a speaker at + the Summit on Responsible Decentralized Intelligence - Future of + Decentralization and AI, hosted by Berkeley RDI on August 6, 2024, at the + Verizon Center, Cornell Tech Campus, Roosevelt Island, NYC +
+
+
+
+
+ + ♻ ☆ Kolmogorov n-Widths for Multitask Physics-Informed Machine Learning + (PIML) Methods: Towards Robust Metrics + + +
+ Physics-informed machine learning (PIML) as a means of solving partial +differential equations (PDE) has garnered much attention in the Computational +Science and Engineering (CS&E) world. This topic encompasses a broad array of +methods and models aimed at solving a single or a collection of PDE problems, +called multitask learning. PIML is characterized by the incorporation of +physical laws into the training process of machine learning models in lieu of +large data when solving PDE problems. Despite the overall success of this +collection of methods, it remains incredibly difficult to analyze, benchmark, +and generally compare one approach to another. Using Kolmogorov n-widths as a +measure of effectiveness of approximating functions, we judiciously apply this +metric in the comparison of various multitask PIML architectures. We compute +lower accuracy bounds and analyze the model's learned basis functions on +various PDE problems. This is the first objective metric for comparing +multitask PIML architectures and helps remove uncertainty in model validation +from selective sampling and overfitting. We also identify avenues of +improvement for model architectures, such as the choice of activation function, +which can drastically affect model generalization to "worst-case" scenarios, +which is not observed when reporting task-specific errors. We also incorporate +this metric into the optimization process through regularization, which +improves the models' generalizability over the multitask PDE problem. + +
+
+
+
+
+ + ♻ ☆ Hybrid Decentralized Optimization: Leveraging Both First- and + Zeroth-Order Optimizers for Faster Convergence + + +
+ Distributed optimization is the standard way of speeding up machine learning +training, and most of the research in the area focuses on distributed +first-order, gradient-based methods. Yet, there are settings where some +computationally-bounded nodes may not be able to implement first-order, +gradient-based optimization, while they could still contribute to joint +optimization tasks. In this paper, we initiate the study of hybrid +decentralized optimization, studying settings where nodes with zeroth-order and +first-order optimization capabilities co-exist in a distributed system, and +attempt to jointly solve an optimization task over some data distribution. We +essentially show that, under reasonable parameter settings, such a system can +not only withstand noisier zeroth-order agents but can even benefit from +integrating such agents into the optimization process, rather than ignoring +their information. At the core of our approach is a new analysis of distributed +optimization with noisy and possibly-biased gradient estimators, which may be +of independent interest. Our results hold for both convex and non-convex +objectives. Experimental results on standard optimization tasks confirm our +analysis, showing that hybrid first-zeroth order optimization can be practical, +even when training deep neural networks. + +
+
+ comment: Shayan Talaei and Matin Ansaripour contributed equally to this work +
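+
+ The key building block above is the zeroth-order gradient estimator. The toy
+sketch below contrasts it with an exact gradient on a least-squares problem and
+runs a naive "hybrid" update that averages one contribution of each kind. The
+mixing weights, step size, and problem are placeholders and do not reflect the
+paper's protocol or analysis.
+
+ ```python
+ import numpy as np
+
+ def loss(w, X, y):
+     return 0.5 * np.mean((X @ w - y) ** 2)
+
+ def first_order_grad(w, X, y):
+     return X.T @ (X @ w - y) / len(y)
+
+ def zeroth_order_grad(w, X, y, rng, mu=1e-3):
+     """Two-point random-direction estimator: uses only function values."""
+     u = rng.normal(size=w.shape)
+     return (loss(w + mu * u, X, y) - loss(w - mu * u, X, y)) / (2 * mu) * u
+
+ rng = np.random.default_rng(0)
+ X = rng.normal(size=(500, 10))
+ w_true = rng.normal(size=10)
+ y = X @ w_true + 0.01 * rng.normal(size=500)
+
+ w = np.zeros(10)
+ for _ in range(2000):
+     g = 0.5 * first_order_grad(w, X, y) + 0.5 * zeroth_order_grad(w, X, y, rng)
+     w -= 0.05 * g
+ print(np.linalg.norm(w - w_true))    # small: the hybrid iteration converges
+ ```
+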
+
+
+
+
+ + ♻ ☆ The Need for Guardrails with Large Language Models in Medical + Safety-Critical Settings: An Artificial Intelligence Application in the + Pharmacovigilance Ecosystem + + +
+ Large language models (LLMs) are useful tools with the capacity for +performing specific types of knowledge work at an effective scale. However, LLM +deployments in high-risk and safety-critical domains pose unique challenges, +notably the issue of ``hallucination,'' where LLMs can generate fabricated +information. This is particularly concerning in settings such as drug safety, +where inaccuracies could lead to patient harm. To mitigate these risks, we have +developed and demonstrated a proof of concept suite of guardrails specifically +designed to mitigate certain types of hallucinations and errors for drug +safety, and potentially applicable to other medical safety-critical contexts. +These guardrails include mechanisms to detect anomalous documents to prevent +the ingestion of inappropriate data, identify incorrect drug names or adverse +event terms, and convey uncertainty in generated content. We integrated these +guardrails with an LLM fine-tuned for a text-to-text task, which involves +converting both structured and unstructured data within adverse event reports +into natural language. This method was applied to translate individual case +safety reports, demonstrating effective application in a pharmacovigilance +processing task. Our guardrail framework offers a set of tools with broad +applicability across various domains, ensuring LLMs can be safely used in +high-risk situations by eliminating the occurrence of key errors, including the +generation of incorrect pharmacovigilance-related terms, thus adhering to +stringent regulatory and quality standards in medical safety-critical +environments. + +
+
+ comment: 27 pages, 6 figures, 4 tables and supplementary material provided +
+
+
+
+
+ + ♻ ☆ GenoCraft: A Comprehensive, User-Friendly Web-Based Platform for + High-Throughput Omics Data Analysis and Visualization + + +
+ The surge in high-throughput omics data has reshaped the landscape of
+biological research, underlining the need for powerful, user-friendly data
+analysis and interpretation tools. This paper presents GenoCraft, a web-based
+comprehensive software solution designed to handle the entire pipeline of omics
+data processing. GenoCraft offers a unified platform featuring advanced
+bioinformatics tools, covering all aspects of omics data analysis. It
+encompasses a range of functionalities, such as normalization, quality control,
+differential analysis, network analysis, pathway analysis, and diverse
+visualization techniques. This software makes state-of-the-art omics data
+analysis more accessible to a wider range of users. With GenoCraft, researchers
+and data scientists have access to an array of cutting-edge bioinformatics
+tools under a user-friendly interface, making it a valuable resource for
+managing and analyzing large-scale omics data. The API with an interactive web
+interface is publicly available at https://genocraft.stanford.edu/. We also
+release all the code at https://github.com/futianfan/GenoCraft.
+
+
+
+
+
+ + ♻ ☆ $μ$GUIDE: a framework for quantitative imaging via generalized + uncertainty-driven inference using deep learning + + +
+ This work proposes $\mu$GUIDE: a general Bayesian framework to estimate
+posterior distributions of tissue microstructure parameters from any given
+biophysical model or MRI signal representation, with exemplar demonstration in
+diffusion-weighted MRI. Harnessing a new deep learning architecture for
+automatic signal feature selection combined with simulation-based inference and
+efficient sampling of the posterior distributions, $\mu$GUIDE bypasses the high
+computational and time cost of conventional Bayesian approaches and does not
+rely on acquisition constraints to define model-specific summary statistics.
+The obtained posterior distributions make it possible to highlight degeneracies
+present in the model definition and to quantify the uncertainty and ambiguity
+of the estimated parameters.
+
+
+
+
+
+ + ♻ ☆ Partially Observable Multi-Agent Reinforcement Learning with Information + Sharing ICML 2023 + + +
+ We study provable multi-agent reinforcement learning (RL) in the general
+framework of partially observable stochastic games (POSGs). To circumvent the
+known hardness results and the use of computationally intractable oracles, we
+advocate leveraging the potential \emph{information-sharing} among agents, a
+common practice in empirical multi-agent RL, and a standard model for
+multi-agent control systems with communications. We first establish several
+computational complexity results to justify the necessity of
+information-sharing, as well as the observability assumption that has enabled
+quasi-efficient single-agent RL with partial observations, for efficiently
+solving POSGs. Inspired by the inefficiency of planning in the ground-truth
+model, we then propose to further \emph{approximate} the shared common
+information to construct an approximate model of the POSG, in which planning
+an approximate \emph{equilibrium} (in terms of solving the original POSG) can
+be quasi-efficient, i.e., of quasi-polynomial-time, under the aforementioned
+assumptions. Furthermore, we develop a partially observable multi-agent RL
+algorithm that is \emph{both} statistically and computationally
+quasi-efficient. Finally, beyond equilibrium learning, we extend our
+algorithmic framework to finding the \emph{team-optimal solution} in
+cooperative POSGs, i.e., decentralized partially observable Markov decision
+processes, a much more challenging goal. We establish concrete computational
+and sample complexities under several common structural assumptions of the
+model. We hope our study could open up the possibilities of leveraging and even
+designing different \emph{information structures}, a well-studied notion in
+control theory, for developing both sample- and computation-efficient
+partially observable multi-agent RL.
+
+
+ comment: Journal extension of the conference version at ICML 2023. Changed to + the more general reward function form, added new results for learning in + Dec-POMDPs, and streamlined proof outlines +
+
+
+
+
+ + ♻ ☆ Domain Decomposition-based coupling of Operator Inference reduced order + models via the Schwarz alternating method + + +
+ This paper presents and evaluates an approach for coupling together +subdomain-local reduced order models (ROMs) constructed via non-intrusive +operator inference (OpInf) with each other and with subdomain-local full order +models (FOMs), following a domain decomposition of the spatial geometry on +which a given partial differential equation (PDE) is posed. Joining +subdomain-local models is accomplished using the overlapping Schwarz +alternating method, a minimally-intrusive multiscale coupling technique that +works by transforming a monolithic problem into a sequence of subdomain-local +problems, which communicate through transmission boundary conditions imposed on +the subdomain interfaces. After formulating the overlapping Schwarz alternating +method for OpInf ROMs, termed OpInf-Schwarz, we evaluate the method's accuracy +and efficiency on several test cases involving the heat equation in two spatial +dimensions. We demonstrate that the method is capable of coupling together +arbitrary combinations of OpInf ROMs and FOMs, and that speed-ups over a +monolithic FOM are possible when performing OpInf ROM coupling. + +
+
+
+
+
+ + ♻ ☆ Simple and Scalable Strategies to Continually Pre-train Large Language + Models + + +
+ Large language models (LLMs) are routinely pre-trained on billions of tokens, +only to start the process over again once new data becomes available. A much +more efficient solution is to continually pre-train these models, saving +significant compute compared to re-training. However, the distribution shift +induced by new data typically results in degraded performance on previous data +or poor adaptation to the new data. In this work, we show that a simple and +scalable combination of learning rate (LR) re-warming, LR re-decaying, and +replay of previous data is sufficient to match the performance of fully +re-training from scratch on all available data, as measured by the final loss +and the average score on several language model (LM) evaluation benchmarks. +Specifically, we show this for a weak but realistic distribution shift between +two commonly used LLM pre-training datasets (English$\rightarrow$English) and a +stronger distribution shift (English$\rightarrow$German) at the $405$M +parameter model scale with large dataset sizes (hundreds of billions of +tokens). Selecting the weak but realistic shift for larger-scale experiments, +we also find that our continual learning strategies match the re-training +baseline for a 10B parameter LLM. Our results demonstrate that LLMs can be +successfully updated via simple and scalable continual learning strategies, +matching the re-training baseline using only a fraction of the compute. +Finally, inspired by previous work, we propose alternatives to the cosine +learning rate schedule that help circumvent forgetting induced by LR re-warming +and that are not bound to a fixed token budget. + +
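+
+ The schedule-level ingredients named above (LR re-warming, LR re-decaying, and
+replay) are easy to sketch independently of any training framework. The
+constants and the cosine form below are placeholders; the paper also discusses
+alternatives to cosine schedules, which are not shown here.
+
+ ```python
+ import math
+
+ def rewarmed_cosine_lr(step, total_steps, warmup_steps=1000,
+                        max_lr=3e-4, min_lr=3e-5):
+     """Cosine decay with a fresh linear warmup, restarted at the start of a
+     new pre-training phase (step is counted from the phase boundary)."""
+     if step < warmup_steps:
+         return max_lr * step / warmup_steps                 # LR re-warming
+     t = (step - warmup_steps) / max(1, total_steps - warmup_steps)
+     return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * t))
+
+ def mixed_batch(new_iter, replay_iter, batch_size=64, replay_fraction=0.05):
+     """Compose each batch mostly from the new corpus plus a small replay
+     fraction drawn from the previous pre-training data."""
+     n_replay = int(round(replay_fraction * batch_size))
+     batch = [next(replay_iter) for _ in range(n_replay)]
+     batch += [next(new_iter) for _ in range(batch_size - n_replay)]
+     return batch
+ ```
+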
+
+
+
+
+ + ♻ ☆ Convolutional L2LFlows: Generating Accurate Showers in Highly Granular + Calorimeters Using Convolutional Normalizing Flows + + +
+ In the quest to build generative surrogate models as computationally
+efficient alternatives to rule-based simulations, the quality of the generated
+samples remains a crucial frontier. So far, normalizing flows have been among
+the models with the best fidelity. However, as the latent space in such models
+is required to have the same dimensionality as the data space, scaling up
+normalizing flows to high-dimensional datasets is not straightforward. The
+prior L2LFlows approach successfully used a series of separate normalizing
+flows and a sequence of conditioning steps to circumvent this problem. In this
+work, we extend L2LFlows to simulate showers with a 9-times larger profile in
+the lateral direction. To achieve this, we introduce convolutional layers and
+U-Net-type connections, move from masked autoregressive flows to coupling
+layers, and demonstrate the successful modelling of showers in the ILD
+Electromagnetic Calorimeter as well as Dataset 3 from the public CaloChallenge
+dataset.
+
+
+
+
+
+ + ♻ ☆ Multi-Agent Reinforcement Learning from Human Feedback: Data Coverage + and Algorithmic Techniques + + +
+ We initiate the study of Multi-Agent Reinforcement Learning from Human +Feedback (MARLHF), exploring both theoretical foundations and empirical +validations. We define the task as identifying Nash equilibrium from a +preference-only offline dataset in general-sum games, a problem marked by the +challenge of sparse feedback signals. Our theory establishes the upper +complexity bounds for Nash Equilibrium in effective MARLHF, demonstrating that +single-policy coverage is inadequate and highlighting the importance of +unilateral dataset coverage. These theoretical insights are verified through +comprehensive experiments. To enhance the practical performance, we further +introduce two algorithmic techniques. (1) We propose a Mean Squared Error (MSE) +regularization along the time axis to achieve a more uniform reward +distribution and improve reward learning outcomes. (2) We utilize imitation +learning to approximate the reference policy, ensuring stability and +effectiveness in training. Our findings underscore the multifaceted approach +required for MARLHF, paving the way for effective preference-based multi-agent +systems. + +
+
+
+
+
+ + ♻ ☆ Revisiting Character-level Adversarial Attacks for Language Models ICML 2024 + + +
+ Adversarial attacks in Natural Language Processing apply perturbations at the
+character or token level. Token-level attacks, gaining prominence for their use
+of gradient-based methods, are susceptible to altering sentence semantics,
+leading to invalid adversarial examples. While character-level attacks easily
+maintain semantics, they have received less attention as they cannot easily
+adopt popular gradient-based methods, and are thought to be easy to defend
+against. Challenging these beliefs, we introduce Charmer, an efficient
+query-based adversarial attack capable of achieving high attack success rate
+(ASR) while generating highly similar adversarial examples. Our method
+successfully targets both small (BERT) and large (Llama 2) models.
+Specifically, on BERT with SST-2, Charmer improves the ASR by 4.84 percentage
+points and the USE similarity by 8 percentage points with respect to the
+previous art. Our implementation is available at
+https://github.com/LIONS-EPFL/Charmer.
+
+
+ comment: Accepted in ICML 2024 +
+
+
+
+
+ + ♻ ☆ Privacy-aware Berrut Approximated Coded Computing for Federated Learning + + +
+ Federated Learning (FL) is an interesting strategy that enables the
+collaborative training of an AI model among different data owners without
+revealing their private datasets. Even so, FL has some privacy vulnerabilities
+that prior work has tried to overcome with techniques like Differential Privacy
+(DP), Homomorphic Encryption (HE), or Secure Multi-Party Computation (SMPC).
+However, these techniques have some important drawbacks that might narrow their
+range of application: difficulty working with non-linear functions and
+operating large matrix multiplications, and high communication and
+computational costs to manage semi-honest nodes. In this context, we propose a
+solution to guarantee privacy in FL schemes that simultaneously solves the
+previously mentioned problems. Our proposal is based on Berrut Approximated
+Coded Computing, a technique from the Coded Distributed Computing paradigm,
+adapted to a Secret Sharing configuration, to provide input privacy to FL in a
+scalable way. It can be applied for computing non-linear functions and treats
+the special case of distributed matrix multiplication, a key primitive at the
+core of many automated learning tasks. Because of these characteristics, it
+could be applied in a wide range of FL scenarios, since it is independent of
+the machine learning models or aggregation algorithms used in the FL scheme.
+We provide an analysis of the achieved privacy and complexity of our solution
+and, based on extensive numerical results, observe a good trade-off between
+privacy and precision.
+
+
+
+
+
+ + ♻ ☆ A Systematic Bias of Machine Learning Regression Models and Its + Correction: an Application to Imaging-based Brain Age Prediction + + +
+ Machine learning models for continuous outcomes often yield systematically +biased predictions, particularly for values that largely deviate from the mean. +Specifically, predictions for large-valued outcomes tend to be negatively +biased (underestimating actual values), while those for small-valued outcomes +are positively biased (overestimating actual values). We refer to this linear +central tendency warped bias as the "systematic bias of machine learning +regression". In this paper, we first demonstrate that this systematic +prediction bias persists across various machine learning regression models, and +then delve into its theoretical underpinnings. To address this issue, we +propose a general constrained optimization approach designed to correct this +bias and develop computationally efficient implementation algorithms. +Simulation results indicate that our correction method effectively eliminates +the bias from the predicted outcomes. We apply the proposed approach to the +prediction of brain age using neuroimaging data. In comparison to competing +machine learning regression models, our method effectively addresses the +longstanding issue of "systematic bias of machine learning regression" in +neuroimaging-based brain age calculation, yielding unbiased predictions of +brain age. + +
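+
+ The bias pattern described above is easy to reproduce even with a
+well-specified ordinary least-squares model, as in the short demonstration
+below (synthetic data; the paper's constrained-optimization correction is not
+shown here).
+
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ X = rng.normal(size=(5000, 10))
+ beta = rng.normal(size=10)
+ y = X @ beta + rng.normal(size=5000)        # outcome = signal + noise
+
+ w, *_ = np.linalg.lstsq(X, y, rcond=None)   # well-specified OLS fit
+ pred = X @ w
+
+ top = y > np.quantile(y, 0.9)               # largest observed outcomes
+ bottom = y < np.quantile(y, 0.1)            # smallest observed outcomes
+ print((pred[top] - y[top]).mean())          # negative: under-prediction
+ print((pred[bottom] - y[bottom]).mean())    # positive: over-prediction
+ ```
+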
+
+
+
+
+ + ♻ ☆ Pre-processing and Compression: Understanding Hidden Representation + Refinement Across Imaging Domains via Intrinsic Dimension + + +
+ In recent years, there has been interest in how geometric properties such as +intrinsic dimension (ID) of a neural network's hidden representations change +through its layers, and how such properties are predictive of important model +behavior such as generalization ability. However, evidence has begun to emerge +that such behavior can change significantly depending on the domain of the +network's training data, such as natural versus medical images. Here, we +further this inquiry by exploring how the ID of a network's learned +representations changes through its layers, in essence, characterizing how the +network successively refines the information content of input data to be used +for predictions. Analyzing eleven natural and medical image datasets across six +network architectures, we find that how ID changes through the network differs +noticeably between natural and medical image models. Specifically, medical +image models peak in representation ID earlier in the network, implying a +difference in the image features and their abstractness that are typically used +for downstream tasks in these domains. Additionally, we discover a strong +correlation of this peak representation ID with the ID of the data in its input +space, implying that the intrinsic information content of a model's learned +representations is guided by that of the data it was trained on. Overall, our +findings emphasize notable discrepancies in network behavior between natural +and non-natural imaging domains regarding hidden representation information +content, and provide further insights into how a network's learned features are +shaped by its training data. + +
+
+
+
+
+ + ♻ ☆ Energy-Efficient Channel Decoding for Wireless Federated Learning: + Convergence Analysis and Adaptive Design + + +
+ One of the most critical challenges for deploying distributed learning +solutions, such as federated learning (FL), in wireless networks is the limited +battery capacity of mobile clients. While it is a common belief that the major +energy consumption of mobile clients comes from the uplink data transmission, +this paper presents a novel finding, namely channel decoding also contributes +significantly to the overall energy consumption of mobile clients in FL. +Motivated by this new observation, we propose an energy-efficient adaptive +channel decoding scheme that leverages the intrinsic robustness of FL to model +errors. In particular, the robustness is exploited to reduce the energy +consumption of channel decoders at mobile clients by adaptively adjusting the +number of decoding iterations. We theoretically prove that wireless FL with +communication errors can converge at the same rate as the case with error-free +communication provided the bit error rate (BER) is properly constrained. An +adaptive channel decoding scheme is then proposed to improve the energy +efficiency of wireless FL systems. Experimental results demonstrate that the +proposed method maintains the same learning accuracy while reducing the channel +decoding energy consumption by ~20% when compared to an existing approach. + +
+
+ comment: This paper has been accepted by the IEEE TWC. Copyright may be + transferred without notice, after which this version may no longer be + accessible +
+
+
+
+
+ + ♻ ☆ Negation Blindness in Large Language Models: Unveiling the NO Syndrome + in Image Generation + + +
+ Foundational Large Language Models (LLMs) have changed the way we perceive
+technology. They have been shown to excel in tasks ranging from poem writing
+and coding to essay generation and puzzle solving. With the incorporation of
+image generation capability, they have become more comprehensive and versatile
+AI tools. At the same time, researchers are striving to identify the
+limitations of these tools to improve them further. Currently identified flaws
+include hallucination, biases, and bypassing restricted commands to generate
+harmful content. In the present work, we have identified a fundamental
+limitation related to the image generation ability of LLMs, and termed it The
+NO Syndrome. This negation blindness refers to LLMs' inability to correctly
+comprehend NO-related natural language prompts to generate the desired images.
+Interestingly, all tested LLMs, including GPT-4, Gemini, and Copilot, were
+found to suffer from this syndrome. To demonstrate the generalization of this
+limitation, we carried out simulation experiments and conducted entropy-based
+and benchmark statistical analysis tests on various LLMs in multiple
+languages, including English, Hindi, and French. We conclude that the NO
+syndrome is a significant flaw in current LLMs that needs to be addressed. A
+related finding of this study is a consistent discrepancy between image and
+textual responses as a result of this NO syndrome. We posit that introducing a
+negation context-aware reinforcement learning based feedback loop between the
+LLM's textual response and generated image could help ensure that the
+generated text is based on both the LLM's correct contextual understanding of
+the negation query and the generated visual output.
+
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Different Victims, Same Layout: Email Visual Similarity Detection for + Enhanced Email Protection CCS 2024 + + +
+ In the pursuit of an effective spam detection system, the focus has often
+been on identifying known spam patterns, either through rule-based detection
+systems or machine learning (ML) solutions that rely on keywords. However,
+both systems are susceptible to evasion techniques and zero-day attacks that
+can be achieved at low cost. Therefore, an email that bypassed the defense
+system once can do so again in the following days, even if rules are updated
+or the ML models are retrained. The recurrence of failures to detect emails
+that exhibit layout similarities to previously undetected spam is concerning
+for customers and can erode their trust in a company. Our observations show
+that threat actors reuse email kits extensively and can bypass detection with
+little effort, for example, by making changes to the content of emails. In
+this work, we propose an email visual similarity detection approach, named
+Pisco, to improve the detection capabilities of an email threat defense
+system. We apply our proof of concept to real-world samples received from
+different sources. Our results show that email kits are being reused
+extensively and that visually similar emails are sent to our customers at
+various time intervals. Therefore, this method could be very helpful in
+situations where detection engines that rely on textual features and keywords
+are bypassed, an occurrence our observations show happens frequently.
+
+
+ comment: To be published in the proceedings of the ACM Conference on Computer + and Communications Security (ACM CCS 2024) +
+
+
+
+
+ + ♻ ☆ Fast and interpretable Support Vector Classification based on the + truncated ANOVA decomposition + + +
+ Support Vector Machines (SVMs) are an important tool for performing
+classification on scattered data, where one usually has to deal with many data
+points in high-dimensional spaces. We propose solving SVMs in primal form
+using feature maps based on trigonometric functions or wavelets. In
+low-dimensional settings the Fast Fourier Transform (FFT) and related methods
+are a powerful tool for handling the considered basis functions. For growing
+dimensions the classical FFT-based methods become inefficient due to the curse
+of dimensionality. Therefore, we restrict ourselves to multivariate basis
+functions, each of which only depends on a small number of dimensions. This is
+motivated by the well-known sparsity of effects and recent results regarding
+the reconstruction of functions from scattered data in terms of truncated
+analysis of variance (ANOVA) decompositions, which makes the resulting model
+interpretable in terms of the importance of the features as well as their
+couplings. The usage of small superposition dimensions has the consequence
+that the computational effort no longer grows exponentially but only
+polynomially with respect to the dimension. In order to enforce sparsity
+regarding the basis coefficients, we use the frequently applied $\ell_2$-norm
+and, in addition, $\ell_1$-norm regularization. The found classifying
+function, which is the linear combination of basis functions, and its variance
+can then be analyzed in terms of the classical ANOVA decomposition of
+functions. Based on numerical examples we show that we are able to recover the
+signum of a function that perfectly fits our model assumptions. Furthermore,
+we perform classification on different artificial and real-world data sets. We
+obtain better results with $\ell_1$-norm regularization, both in terms of
+accuracy and clarity of interpretability.
+
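A rough sketch of the overall recipe under simplifying assumptions (superposition dimension 1, cosine/sine features only, and scikit-learn's SGD solver standing in for the paper's FFT-based machinery):

```python
# Illustration only, not the paper's ANOVA-based construction or solver:
# build trigonometric feature maps that each depend on a single coordinate,
# then fit a linear SVM in primal form (hinge loss) with an l1 penalty.
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_classification

def trig_features(X, max_freq=3):
    """cos/sin features per coordinate and frequency, inputs scaled to [0, 1]."""
    X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0) + 1e-12)
    feats = [np.ones((X.shape[0], 1))]
    for k in range(1, max_freq + 1):
        feats.append(np.cos(2 * np.pi * k * X))
        feats.append(np.sin(2 * np.pi * k * X))
    return np.hstack(feats)

X, y = make_classification(n_samples=2000, n_features=12, n_informative=5, random_state=0)
Phi = trig_features(X)

# Hinge loss + l1 regularisation = sparse primal SVM on the feature expansion.
clf = SGDClassifier(loss="hinge", penalty="l1", alpha=1e-4, max_iter=2000, random_state=0)
clf.fit(Phi, y)
print("train accuracy:", clf.score(Phi, y))
print("non-zero coefficients:", np.count_nonzero(clf.coef_))
```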
+
+
+
+
+ + ♻ ☆ The future of cosmological likelihood-based inference: accelerated + high-dimensional parameter estimation and model comparison + + +
+ We advocate for a new paradigm of cosmological likelihood-based inference, +leveraging recent developments in machine learning and its underlying +technology, to accelerate Bayesian inference in high-dimensional settings. +Specifically, we combine (i) emulation, where a machine learning model is +trained to mimic cosmological observables, e.g. CosmoPower-JAX; (ii) +differentiable and probabilistic programming, e.g. JAX and NumPyro, +respectively; (iii) scalable Markov chain Monte Carlo (MCMC) sampling +techniques that exploit gradients, e.g. Hamiltonian Monte Carlo; and (iv) +decoupled and scalable Bayesian model selection techniques that compute the +Bayesian evidence purely from posterior samples, e.g. the learned harmonic mean +implemented in harmonic. This paradigm allows us to carry out a complete +Bayesian analysis, including both parameter estimation and model selection, in +a fraction of the time of traditional approaches. First, we demonstrate the +application of this paradigm on a simulated cosmic shear analysis for a Stage +IV survey in 37- and 39-dimensional parameter spaces, comparing $\Lambda$CDM +and a dynamical dark energy model ($w_0w_a$CDM). We recover posterior contours +and evidence estimates that are in excellent agreement with those computed by +the traditional nested sampling approach while reducing the computational cost +from 8 months on 48 CPU cores to 2 days on 12 GPUs. Second, we consider a joint +analysis between three simulated next-generation surveys, each performing a +3x2pt analysis, resulting in 157- and 159-dimensional parameter spaces. +Standard nested sampling techniques are simply unlikely to be feasible in this +high-dimensional setting, requiring a projected 12 years of compute time on 48 +CPU cores; on the other hand, the proposed approach only requires 8 days of +compute time on 24 GPUs. All packages used in our analyses are publicly +available. + +
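A toy illustration of the software stack named above (JAX + NumPyro + gradient-based NUTS sampling); the CosmoPower-JAX emulator and the harmonic evidence step are replaced by a placeholder Gaussian likelihood, so this shows the programming pattern only, not the cosmological analysis.

```python
# Toy sketch of the inference stack described in the abstract (JAX + NumPyro
# + NUTS). The emulator and the learned-harmonic-mean evidence estimate are
# NOT included; a simple Gaussian likelihood stands in for the observable.
import jax
import jax.numpy as jnp
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS

def model(obs=None):
    # Placeholder "cosmological" parameters; in the paper these would feed an
    # emulator that predicts the observable.
    omega_m = numpyro.sample("omega_m", dist.Uniform(0.1, 0.5))
    sigma_8 = numpyro.sample("sigma_8", dist.Uniform(0.6, 1.0))
    theory = omega_m * sigma_8  # stand-in for an emulated observable
    numpyro.sample("obs", dist.Normal(theory, 0.01), obs=obs)

data = jnp.array(0.25)
mcmc = MCMC(NUTS(model), num_warmup=500, num_samples=1000)
mcmc.run(jax.random.PRNGKey(0), obs=data)
mcmc.print_summary()
```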
+
+ comment: 14 pages, 6 figures. Accepted for publication in the Open Journal of + Astrophysics. Codes available at + https://github.com/alessiospuriomancini/cosmopower, + https://github.com/dpiras/cosmopower-jax, + https://github.com/astro-informatics/harmonic/ +
+
+
+
+
+ + ♻ ☆ A possible late-time transition of $M_B$ inferred via neural networks + + +
+ The strengthening of tensions in the cosmological parameters has led to a
+reconsideration of fundamental aspects of standard cosmology. The tension in
+the Hubble constant can also be viewed as a tension between local and early
+Universe constraints on the absolute magnitude $M_B$ of Type Ia supernovae. In
+this work, we reconsider the possibility of a variation of this parameter in a
+model-independent way. We employ neural networks to agnostically constrain the
+value of the absolute magnitude as well as assess the impact and statistical
+significance of a variation in $M_B$ with redshift from the Pantheon+
+compilation, together with a thorough analysis of the neural network
+architecture. We find an indication of a possible transition redshift in the
+$z\approx 1$ region.
+
+
+ comment: 13 pages, 9 sets of figures, 2 tables. To appear in JCAP +
+
+
+
+
+ + ♻ ☆ Variational Mode Decomposition and Linear Embeddings are What You Need + For Time-Series Forecasting + + +
+ Time-series forecasting often faces challenges due to data volatility, which +can lead to inaccurate predictions. Variational Mode Decomposition (VMD) has +emerged as a promising technique to mitigate volatility by decomposing data +into distinct modes, thereby enhancing forecast accuracy. In this study, we +integrate VMD with linear models to develop a robust forecasting framework. Our +approach is evaluated on 13 diverse datasets, including ETTm2, WindTurbine, M4, +and 10 air quality datasets from various Southeast Asian cities. The +effectiveness of the VMD strategy is assessed by comparing Root Mean Squared +Error (RMSE) values from models utilizing VMD against those without it. +Additionally, we benchmark linear-based models against well-known neural +network architectures such as LSTM, Bidirectional LSTM, and RNN. The results +demonstrate a significant reduction in RMSE across nearly all models following +VMD application. Notably, the Linear + VMD model achieved the lowest average +RMSE in univariate forecasting at 0.619. In multivariate forecasting, the +DLinear + VMD model consistently outperformed others, attaining the lowest RMSE +across all datasets with an average of 0.019. These findings underscore the +effectiveness of combining VMD with linear models for superior time-series +forecasting. + +
+
+ comment: For associated repository, see + https://github.com/Espalemit/VMD-With-LTSF-Linear.git +
+
+
+
+
+ + ♻ ☆ GT-CausIn: a novel causal-based insight for traffic prediction + + +
+ Traffic forecasting is an important application of spatiotemporal series
+prediction. Among different methods, graph neural networks have so far
+achieved the most promising results, and learning the relations between graph
+nodes therefore becomes a crucial task. However, the room for improvement is
+very limited when these relations are learned in a node-to-node manner. The
+challenge stems from (1) obscure temporal dependencies between different
+stations, (2) difficulties in defining variables beyond the node level, and
+(3) no ready-made method to validate the learned relations. To confront these
+challenges, we define legitimate traffic causal variables to discover the
+causal relations inside the traffic network, which are carefully checked with
+statistical tools and case analysis. We then present a novel model named Graph
+Spatial-Temporal Network Based on Causal Insight (GT-CausIn), where prior
+learned causal information is integrated with graph diffusion layers and
+temporal convolutional network (TCN) layers. Experiments are carried out on
+two real-world traffic datasets, PEMS-BAY and METR-LA, which show that
+GT-CausIn significantly outperforms state-of-the-art models on mid-term and
+long-term prediction.
+
+
+
+
+
+ + ♻ ☆ When Does Visual Prompting Outperform Linear Probing for Vision-Language + Models? A Likelihood Perspective + + +
+ Adapting pre-trained models to new tasks can exhibit varying effectiveness +across datasets. Visual prompting, a state-of-the-art parameter-efficient +transfer learning method, can significantly improve the performance of +out-of-distribution tasks. On the other hand, linear probing, a standard +transfer learning method, can sometimes become the best approach. We propose a +log-likelihood ratio (LLR) approach to analyze the comparative benefits of +visual prompting and linear probing. By employing the LLR score alongside +resource-efficient visual prompts approximations, our cost-effective measure +attains up to a 100-fold reduction in run time compared to full training, while +achieving prediction accuracies up to 91%. The source code is available at +https://github.com/IBM/VP-LLR. + +
+
+
+
+
+ + ♻ ☆ Pseudo Replay-based Class Continual Learning for Online New Category + Anomaly Detection in Additive Manufacturing + + +
+ The incorporation of advanced sensors and machine learning techniques has
+enabled modern manufacturing enterprises to perform data-driven
+classification-based anomaly detection based on the sensor data collected in
+manufacturing processes. However, one critical challenge is that newly
+presented defect categories may manifest as the manufacturing process
+continues, resulting in monitoring performance deterioration of previously
+trained machine learning models. Hence, there is an increasing need to empower
+machine learning models to learn continually. Among all continual learning
+methods, memory-based continual learning has the best performance but faces
+the constraints of data storage capacity. To address this issue, this paper
+develops a novel pseudo replay-based continual learning framework by
+integrating class incremental learning and oversampling-based data generation.
+Without storing all the data, the developed framework can generate
+high-quality data representing previous classes to train the machine learning
+model incrementally when a new anomaly category occurs. In addition, it can
+even enhance the monitoring performance, since it also effectively improves
+the data quality. The effectiveness of the proposed framework is validated in
+three case studies, which leverage a supervised classification formulation for
+anomaly detection. The experimental results show that the developed method is
+very promising in detecting novel anomalies while maintaining good performance
+on the previous tasks, and offers more flexibility in the model architecture.
+
+
+
+
+
+ + ♻ ☆ Navigating the Maize: Cyclic and conditional computational graphs for + molecular simulation + + +
+ Many computational chemistry and molecular simulation workflows can be +expressed as graphs. This abstraction is useful to modularize and potentially +reuse existing components, as well as provide parallelization and ease +reproducibility. Existing tools represent the computation as a directed acyclic +graph (DAG), thus allowing efficient execution by parallelization of concurrent +branches. These systems can, however, generally not express cyclic and +conditional workflows. We therefore developed Maize, a workflow manager for +cyclic and conditional graphs based on the principles of flow-based +programming. By running each node of the graph concurrently in separate +processes and allowing communication at any time through dedicated inter-node +channels, arbitrary graph structures can be executed. We demonstrate the +effectiveness of the tool on a dynamic active learning task in computational +drug design, involving the use of a small molecule generative model and an +associated scoring system, and on a reactivity prediction pipeline using +quantum-chemistry and semiempirical approaches. + +
+
+
+
+
+ + ♻ ☆ DNN-GDITD: Out-of-distribution detection via Deep Neural Network based + Gaussian Descriptor for Imbalanced Tabular Data + + +
+ Classification tasks present challenges due to class imbalances and evolving +data distributions. Addressing these issues requires a robust method to handle +imbalances while effectively detecting out-of-distribution (OOD) samples not +encountered during training. This study introduces a novel OOD detection +algorithm designed for tabular datasets, titled Deep Neural Network-based +Gaussian Descriptor for Imbalanced Tabular Data (DNN-GDITD). The DNN-GDITD +algorithm can be placed on top of any DNN to facilitate better classification +of imbalanced data and OOD detection using spherical decision boundaries. Using +a combination of Push, Score-based, and focal losses, DNN-GDITD assigns +confidence scores to test data points, categorizing them as known classes or as +an OOD sample. Extensive experimentation on tabular datasets demonstrates the +effectiveness of DNN-GDITD compared to three OOD algorithms. Evaluation +encompasses imbalanced and balanced scenarios on diverse tabular datasets, +including a synthetic financial dispute dataset and publicly available tabular +datasets like Gas Sensor, Drive Diagnosis, and MNIST, showcasing DNN-GDITD's +versatility. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ MMA-MRNNet: Harnessing Multiple Models of Affect and Dynamic Masked RNN + for Precise Facial Expression Intensity Estimation + + +
+ This paper presents MMA-MRNNet, a novel deep learning architecture for +dynamic multi-output Facial Expression Intensity Estimation (FEIE) from video +data. Traditional approaches to this task often rely on complex 3-D CNNs, which +require extensive pre-training and assume that facial expressions are uniformly +distributed across all frames of a video. These methods struggle to handle +videos of varying lengths, often resorting to ad-hoc strategies that either +discard valuable information or introduce bias. MMA-MRNNet addresses these +challenges through a two-stage process. First, the Multiple Models of Affect +(MMA) extractor component is a Multi-Task Learning CNN that concurrently +estimates valence-arousal, recognizes basic facial expressions, and detects +action units in each frame. These representations are then processed by a +Masked RNN component, which captures temporal dependencies and dynamically +updates weights according to the true length of the input video, ensuring that +only the most relevant features are used for the final prediction. The proposed +unimodal non-ensemble learning MMA-MRNNet was evaluated on the Hume-Reaction +dataset and demonstrated significantly superior performance, surpassing +state-of-the-art methods by a wide margin, regardless of whether they were +unimodal, multimodal, or ensemble approaches. Finally, we demonstrated the +effectiveness of the MMA component of our proposed method across multiple +in-the-wild datasets, where it consistently outperformed all state-of-the-art +methods across various metrics. + +
+
+
+
+
+ + ♻ ☆ What Formal Languages Can Transformers Express? A Survey + + +
+ As transformers have gained prominence in natural language processing, some +researchers have investigated theoretically what problems they can and cannot +solve, by treating problems as formal languages. Exploring such questions can +help clarify the power of transformers relative to other models of computation, +their fundamental capabilities and limits, and the impact of architectural +choices. Work in this subarea has made considerable progress in recent years. +Here, we undertake a comprehensive survey of this work, documenting the diverse +assumptions that underlie different results and providing a unified framework +for harmonizing seemingly contradictory findings. + +
+
+ comment: One minor correction in §5.1
+
+
+
+
+
+ + ♻ ☆ Decision-Focused Learning: Foundations, State of the Art, Benchmark and + Future Opportunities + + +
+ Decision-focused learning (DFL) is an emerging paradigm that integrates +machine learning (ML) and constrained optimization to enhance decision quality +by training ML models in an end-to-end system. This approach shows significant +potential to revolutionize combinatorial decision-making in real-world +applications that operate under uncertainty, where estimating unknown +parameters within decision models is a major challenge. This paper presents a +comprehensive review of DFL, providing an in-depth analysis of both +gradient-based and gradient-free techniques used to combine ML and constrained +optimization. It evaluates the strengths and limitations of these techniques +and includes an extensive empirical evaluation of eleven methods across seven +problems. The survey also offers insights into recent advancements and future +research directions in DFL. + Code and benchmark: https://github.com/PredOpt/predopt-benchmarks + +
+
+ comment: Experimental Survey and Benchmarking +
+
+
+
+
+ + ♻ ☆ Can Vehicle Motion Planning Generalize to Realistic Long-tail Scenarios? + + +
+ Real-world autonomous driving systems must make safe decisions in the face of +rare and diverse traffic scenarios. Current state-of-the-art planners are +mostly evaluated on real-world datasets like nuScenes (open-loop) or nuPlan +(closed-loop). In particular, nuPlan seems to be an expressive evaluation +method since it is based on real-world data and closed-loop, yet it mostly +covers basic driving scenarios. This makes it difficult to judge a planner's +capabilities to generalize to rarely-seen situations. Therefore, we propose a +novel closed-loop benchmark interPlan containing several edge cases and +challenging driving scenarios. We assess existing state-of-the-art planners on +our benchmark and show that neither rule-based nor learning-based planners can +safely navigate the interPlan scenarios. A recently evolving direction is the +usage of foundation models like large language models (LLM) to handle +generalization. We evaluate an LLM-only planner and introduce a novel hybrid +planner that combines an LLM-based behavior planner with a rule-based motion +planner that achieves state-of-the-art performance on our benchmark. + +
+
+
+
+
+ + ♻ ☆ In the Search for Optimal Multi-view Learning Models for Crop + Classification with Global Remote Sensing Data + + +
+ Studying and analyzing cropland is a difficult task due to its dynamic and +heterogeneous growth behavior. Usually, diverse data sources can be collected +for its estimation. Although deep learning models have proven to excel in the +crop classification task, they face substantial challenges when dealing with +multiple inputs, named Multi-View Learning (MVL). The methods used in the MVL +scenario can be structured based on the encoder architecture, the fusion +strategy, and the optimization technique. The literature has primarily focused +on using specific encoder architectures for local regions, lacking a deeper +exploration of other components in the MVL methodology. In contrast, we +investigate the simultaneous selection of the fusion strategy and encoder +architecture, assessing global-scale cropland and crop-type classifications. We +use a range of five fusion strategies (Input, Feature, Decision, Ensemble, +Hybrid) and five temporal encoders (LSTM, GRU, TempCNN, TAE, L-TAE) as possible +configurations in the MVL method. We use the CropHarvest dataset for +validation, which provides optical, radar, weather time series, and topographic +information as input data. We found that in scenarios with a limited number of +labeled samples, a unique configuration is insufficient for all the cases. +Instead, a specialized combination should be meticulously sought, including an +encoder and fusion strategy. To streamline this search process, we suggest +identifying the optimal encoder architecture tailored for a particular fusion +strategy, and then determining the most suitable fusion strategy for the +classification task. We provide a methodological framework for researchers +exploring crop classification through an MVL methodology. + +
+
+ comment: submitted to journal +
+
+
+
+
+ + ♻ ☆ Increasing the Robustness of Model Predictions to Missing Sensors in + Earth Observation ACL + + +
+ Multi-sensor ML models for EO aim to enhance prediction accuracy by +integrating data from various sources. However, the presence of missing data +poses a significant challenge, particularly in non-persistent sensors that can +be affected by external factors. Existing literature has explored strategies +like temporal dropout and sensor-invariant models to address the generalization +to missing data issues. Inspired by these works, we study two novel methods +tailored for multi-sensor scenarios, namely Input Sensor Dropout (ISensD) and +Ensemble Sensor Invariant (ESensI). Through experimentation on three +multi-sensor temporal EO datasets, we demonstrate that these methods +effectively increase the robustness of model predictions to missing sensors. +Particularly, we focus on how the predictive performance of models drops when +sensors are missing at different levels. We observe that ensemble multi-sensor +models are the most robust to the lack of sensors. In addition, the sensor +dropout component in ISensD shows promising robustness results. + +
+
+ comment: Accepted at the MACLEAN workshop in the ECML/PKDD 2024 +
+
+
+
+
+ + ♻ ☆ Scalable Glacier Mapping using Deep Learning and Open Earth Observation + Data Matches the Accuracy of Manual Delineation + + +
+ Accurate global glacier mapping is critical for understanding climate change
+impacts. Despite its importance, automated glacier mapping at a global scale
+remains largely unexplored. Here we address this gap and propose
+Glacier-VisionTransformer-U-Net (GlaViTU), a convolutional-transformer deep
+learning model, and five strategies for multitemporal global-scale glacier
+mapping using open satellite imagery. Assessing the spatial, temporal and
+cross-sensor generalisation shows that our best strategy achieves intersection
+over union >0.85 on previously unobserved images in most cases, which drops to
+>0.75 for debris-rich areas such as High-Mountain Asia and increases to >0.90
+for regions dominated by clean ice. A comparative validation against human
+expert uncertainties in terms of area and distance deviations underscores
+GlaViTU's performance, approaching or matching expert-level delineation.
+Adding synthetic aperture radar data, namely backscatter and interferometric
+coherence, increases the accuracy in all regions where available. The
+calibrated confidence for glacier extents is reported, making the predictions
+more reliable and interpretable. We also release a benchmark dataset that
+covers 9% of glaciers worldwide. Our results support efforts towards automated
+multitemporal and global glacier mapping.
+
+
+ comment: after major revision, expanded validation +
+
+
+
+
+ + ♻ ☆ Smart E-commerce Recommendations with Semantic AI + + +
+ In e-commerce, web mining for page recommendations is widely used but often
+fails to meet user needs. To address this, we propose a novel solution
+combining semantic web mining with BP neural networks. We process user search
+logs to extract five key features: content priority, time spent, user
+feedback, recommendation semantics, and input deviation. These features are
+then fed into a BP neural network to classify and prioritize web pages. The
+prioritized pages are recommended to users. Testing on book sales pages, our
+results demonstrate that this solution can quickly and accurately identify the
+pages users need. Our approach ensures that recommendations are more relevant
+and tailored to individual preferences, enhancing the online shopping
+experience. By leveraging advanced semantic analysis and neural network
+techniques, we bridge the gap between user expectations and actual
+recommendations. This innovative method not only improves accuracy but also
+speeds up the recommendation process, making it a valuable tool for e-commerce
+platforms aiming to boost user satisfaction and engagement. Additionally, our
+system's ability to handle large datasets and provide real-time
+recommendations makes it a scalable and efficient solution for modern
+e-commerce challenges.
+
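As a toy stand-in for the described pipeline, a small backpropagation-trained network over the five listed features can be sketched as follows (all data synthetic, scikit-learn used for brevity):

```python
# Toy stand-in only: five hand-crafted features per page (content priority,
# time spent, user feedback, recommendation semantics, input deviation) feed a
# small backpropagation-trained network that scores whether a page should be
# recommended. Features, labels, and the decision rule are all synthetic.
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.uniform(0, 1, size=(5000, 5))          # the five extracted features
# Synthetic rule: pages with high priority and positive feedback get recommended.
y = ((0.5 * X[:, 0] + 0.3 * X[:, 2] + 0.2 * rng.normal(size=5000)) > 0.45).astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
net = MLPClassifier(hidden_layer_sizes=(16, 8), max_iter=500, random_state=0)
net.fit(X_tr, y_tr)
print("test accuracy:", net.score(X_te, y_te))

# Rank unseen pages by recommendation probability.
pages = rng.uniform(0, 1, size=(3, 5))
print("recommendation scores:", net.predict_proba(pages)[:, 1])
```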
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ A Hybrid Framework for Spatial Interpolation: Merging Data-driven with + Domain Knowledge + + +
+ Estimating spatially distributed information through the interpolation of +scattered observation datasets often overlooks the critical role of domain +knowledge in understanding spatial dependencies. Additionally, the features of +these data sets are typically limited to the spatial coordinates of the +scattered observation locations. In this paper, we propose a hybrid framework +that integrates data-driven spatial dependency feature extraction with +rule-assisted spatial dependency function mapping to augment domain knowledge. +We demonstrate the superior performance of our framework in two comparative +application scenarios, highlighting its ability to capture more localized +spatial features in the reconstructed distribution fields. Furthermore, we +underscore its potential to enhance nonlinear estimation capabilities through +the application of transformed fuzzy rules and to quantify the inherent +uncertainties associated with the observation data sets. Our framework +introduces an innovative approach to spatial information estimation by +synergistically combining observational data with rule-assisted domain +knowledge. + +
+
+ comment: 21 pages, 13 figures; typos corrected, references updated +
+
+
+
+
+ + ♻ ☆ Open Implementation and Study of BEST-RQ for Speech Processing ICASSP 2024 + + +
+ Self-Supervised Learning (SSL) has proven to be useful in various speech
+tasks. However, these methods are generally very demanding in terms of data,
+memory, and computational resources. BERT-based Speech pre-Training with
+Random-projection Quantizer (BEST-RQ) is an SSL method that has shown great
+performance on Automatic Speech Recognition (ASR) while being simpler than
+other SSL methods, such as wav2vec 2.0. Despite BEST-RQ's great performance,
+details are lacking in the original paper, such as the amount of GPU/TPU hours
+used in pre-training, and there is no official easy-to-use open-source
+implementation. Furthermore, BEST-RQ has not been evaluated on downstream
+tasks other than ASR and speech translation. In this work, we describe a
+re-implementation of a random-projection quantizer and perform a preliminary
+study with a comparison to wav2vec 2.0 on four downstream tasks. We discuss
+the details and differences of our implementation. We show that a random
+projection quantizer can achieve similar downstream performance as wav2vec 2.0
+while decreasing training time by more than a factor of two.
+
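A hedged sketch of the target-generation step that defines a random-projection quantizer (a frozen random projection plus a frozen random codebook); sizes are illustrative and the masking and ASR training loop are omitted.

```python
# Sketch of the BEST-RQ-style target generation step only: a frozen random
# projection and a frozen random codebook turn speech features into discrete
# labels that a masked-prediction model would be trained to predict. Shapes
# and sizes are illustrative; the encoder and masking are not shown.
import torch

torch.manual_seed(0)
feat_dim, proj_dim, codebook_size = 80, 16, 8192

projection = torch.randn(feat_dim, proj_dim)           # frozen, never trained
codebook = torch.nn.functional.normalize(               # frozen random codebook
    torch.randn(codebook_size, proj_dim), dim=-1)

def quantize(features):
    """features: (time, feat_dim) log-mel frames -> (time,) integer targets."""
    z = torch.nn.functional.normalize(features @ projection, dim=-1)
    # nearest codebook entry by cosine similarity
    return (z @ codebook.T).argmax(dim=-1)

frames = torch.randn(200, feat_dim)   # stand-in for log-mel features
targets = quantize(frames)
print(targets.shape, targets[:10])
```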
+
+ comment: Accepted in IEEE ICASSP 2024 workshop on Self-supervision in Audio, + Speech and Beyond (SASB 2024) +
+
+
+
+
+ + ♻ ☆ Moderate Adaptive Linear Units (MoLU) + + +
+ We propose a new high-performance activation function, Moderate Adaptive
+Linear Units (MoLU), for deep neural networks. MoLU is a simple, beautiful and
+powerful activation function that can serve as a good main activation function
+among the hundreds of existing activation functions. Because MoLU is made up
+of elementary functions, not only is it a diffeomorphism (i.e. analytic over
+its whole domain), but it also reduces training time.
+
+
+ comment: 4 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Prompt Compression with Context-Aware Sentence Encoding for Fast and + Improved LLM Inference + + +
+ Large language models (LLMs) have triggered a new stream of research focusing
+on compressing the context length to reduce the computational cost while
+ensuring the retention of helpful information for LLMs to answer the given
+question. Token-based removal methods are one of the most prominent approaches
+in this direction, but risk losing the semantics of the context caused by
+intermediate token removal, especially under high compression ratios, while
+also facing challenges in computational efficiency. In this work, we propose
+context-aware prompt compression (CPC), a sentence-level prompt compression
+technique whose key innovation is a novel context-aware sentence encoder that
+provides a relevance score for each sentence with respect to a given question.
+To train this encoder, we generate a new dataset consisting of questions
+paired with positive and negative sentences, where positives are sentences
+relevant to the question, while negatives are irrelevant context sentences. We
+train the encoder in a contrastive setup to learn context-aware sentence
+representations. Our method considerably outperforms prior works on prompt
+compression on benchmark datasets and is up to 10.93x faster at inference
+compared to the best token-level compression method. We also find larger
+improvements for shorter length constraints in most benchmarks, showing the
+effectiveness of our proposed solution in compressing relevant information
+into a shorter context. Finally, we release the code and the dataset for quick
+reproducibility and further development: https://github.com/Workday/cpc.
+
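A simplified stand-in for the sentence-level selection step: CPC trains its own context-aware encoder contrastively, whereas the sketch below scores sentences with an off-the-shelf sentence-transformers model purely for illustration.

```python
# Simplified stand-in for sentence-level prompt compression: score each context
# sentence against the question and keep the highest-scoring ones. CPC trains a
# context-aware encoder with contrastive learning; here an off-the-shelf
# sentence embedding model is used only to illustrate the selection step.
import numpy as np
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")

def compress(question: str, sentences: list[str], keep_ratio: float = 0.5) -> str:
    q = encoder.encode([question])[0]
    s = encoder.encode(sentences)
    scores = s @ q / (np.linalg.norm(s, axis=1) * np.linalg.norm(q) + 1e-12)
    k = max(1, int(len(sentences) * keep_ratio))
    keep = sorted(np.argsort(scores)[-k:])        # preserve original order
    return " ".join(sentences[i] for i in keep)

context = [
    "The Eiffel Tower was completed in 1889.",
    "Paris hosts many museums and galleries.",
    "Gustave Eiffel's company designed and built the tower.",
    "The Louvre is the world's most visited museum.",
]
print(compress("Who built the Eiffel Tower?", context))
```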
+
+
+
+
+ + ♻ ☆ Simultaneous Training of First- and Second-Order Optimizers in + Population-Based Reinforcement Learning + + +
+ The tuning of hyperparameters in reinforcement learning (RL) is critical, as +these parameters significantly impact an agent's performance and learning +efficiency. Dynamic adjustment of hyperparameters during the training process +can significantly enhance both the performance and stability of learning. +Population-based training (PBT) provides a method to achieve this by +continuously tuning hyperparameters throughout the training. This ongoing +adjustment enables models to adapt to different learning stages, resulting in +faster convergence and overall improved performance. In this paper, we propose +an enhancement to PBT by simultaneously utilizing both first- and second-order +optimizers within a single population. We conducted a series of experiments +using the TD3 algorithm across various MuJoCo environments. Our results, for +the first time, empirically demonstrate the potential of incorporating +second-order optimizers within PBT-based RL. Specifically, the combination of +the K-FAC optimizer with Adam led to up to a 10% improvement in overall +performance compared to PBT using only Adam. Additionally, in environments +where Adam occasionally fails, such as the Swimmer environment, the mixed +population with K-FAC exhibited more reliable learning outcomes, offering a +significant advantage in training stability without a substantial increase in +computational time. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ SparQ Attention: Bandwidth-Efficient LLM Inference + + +
+ The computational difficulties of large language model (LLM) inference remain +a significant obstacle to their widespread deployment. The need for many +applications to support long input sequences and process them in large batches +typically causes token-generation to be bottlenecked by data transfer. For this +reason, we introduce SparQ Attention, a technique for increasing the inference +throughput of LLMs by utilising memory bandwidth more efficiently within the +attention layers, through selective fetching of the cached history. Our +proposed technique can be applied directly to off-the-shelf LLMs during +inference, without requiring any modification to the pre-training setup or +additional fine-tuning. We show that SparQ Attention brings up to 8x savings in +attention data transfers without substantial drops in accuracy, by evaluating +Llama 2 and 3, Mistral, Gemma and Pythia models on a wide range of downstream +tasks. + +
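A single-head sketch of the core idea (approximate scores from the most salient query components, then exact attention over the fetched top-k keys); the published method includes further details, such as handling the non-fetched attention mass, that are omitted here.

```python
# Single-head, batch-free sketch: use only the r largest-magnitude query
# components to cheaply approximate attention scores, fetch the top-k
# keys/values, and run exact attention on that subset only.
import math
import torch

def sparq_like_attention(q, K, V, r=16, k=64):
    # q: (d,), K/V: (seq, d)
    d = q.shape[0]
    idx = q.abs().topk(r).indices                  # r most salient query dims
    approx_scores = K[:, idx] @ q[idx]             # cheap score estimate
    top = approx_scores.topk(min(k, K.shape[0])).indices
    scores = (K[top] @ q) / math.sqrt(d)           # exact scores on the subset
    weights = torch.softmax(scores, dim=-1)
    return weights @ V[top]

torch.manual_seed(0)
q, K, V = torch.randn(128), torch.randn(4096, 128), torch.randn(4096, 128)
out = sparq_like_attention(q, K, V)
print(out.shape)  # torch.Size([128])
```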
+
+
+
+
+ + ♻ ☆ Enhancing Sindhi Word Segmentation using Subword Representation Learning + and Position-aware Self-attention + + +
+ Sindhi word segmentation is a challenging task due to space omission and +insertion issues. The Sindhi language itself adds to this complexity. It's +cursive and consists of characters with inherent joining and non-joining +properties, independent of word boundaries. Existing Sindhi word segmentation +methods rely on designing and combining hand-crafted features. However, these +methods have limitations, such as difficulty handling out-of-vocabulary words, +limited robustness for other languages, and inefficiency with large amounts of +noisy or raw text. Neural network-based models, in contrast, can automatically +capture word boundary information without requiring prior knowledge. In this +paper, we propose a Subword-Guided Neural Word Segmenter (SGNWS) that addresses +word segmentation as a sequence labeling task. The SGNWS model incorporates +subword representation learning through a bidirectional long short-term memory +encoder, position-aware self-attention, and a conditional random field. Our +empirical results demonstrate that the SGNWS model achieves state-of-the-art +performance in Sindhi word segmentation on six datasets. + +
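A heavily simplified sketch of "segmentation as sequence labeling" (character embeddings, a BiLSTM, per-position B/I tags); the paper's subword representation learning, position-aware self-attention, and CRF decoding are omitted, and the toy vocabulary and labels are hypothetical.

```python
# Heavily simplified tagger illustrating segmentation as sequence labeling:
# characters pass through an embedding + BiLSTM and each position is labeled
# B (begin word, 0) or I (inside word, 1). Subword features, self-attention
# and CRF decoding from the paper are not included.
import torch
import torch.nn as nn

class TinySegmenter(nn.Module):
    def __init__(self, vocab_size, emb=32, hidden=64, n_tags=2):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb)
        self.lstm = nn.LSTM(emb, hidden, batch_first=True, bidirectional=True)
        self.out = nn.Linear(2 * hidden, n_tags)

    def forward(self, char_ids):            # (batch, seq) -> (batch, seq, tags)
        h, _ = self.lstm(self.emb(char_ids))
        return self.out(h)

# Toy usage with a made-up character vocabulary and hypothetical B/I labels.
vocab = {c: i for i, c in enumerate("abcdefghij ")}
text = "abc de fgh"
chars = torch.tensor([[vocab[c] for c in text]])
tags = torch.tensor([[0, 1, 1, 1, 0, 1, 1, 0, 1, 1]])  # hypothetical gold labels

model = TinySegmenter(len(vocab))
logits = model(chars)
loss = nn.CrossEntropyLoss()(logits.view(-1, 2), tags.view(-1))
print("loss:", loss.item())
```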
+
+ comment: Journal Paper, 14 pages +
+
+
+
+
+ + ♻ ☆ BiKC: Keypose-Conditioned Consistency Policy for Bimanual Robotic + Manipulation + + +
+ Bimanual manipulation tasks typically involve multiple stages which require +efficient interactions between two arms, posing step-wise and stage-wise +challenges for imitation learning systems. Specifically, failure and delay of +one step will broadcast through time, hinder success and efficiency of each +sub-stage task, and thereby overall task performance. Although recent works +have made strides in addressing certain challenges, few approaches explicitly +consider the multi-stage nature of bimanual tasks while simultaneously +emphasizing the importance of inference speed. In this paper, we introduce a +novel keypose-conditioned consistency policy tailored for bimanual +manipulation. It is a hierarchical imitation learning framework that consists +of a high-level keypose predictor and a low-level trajectory generator. The +predicted keyposes provide guidance for trajectory generation and also mark the +completion of one sub-stage task. The trajectory generator is designed as a +consistency model trained from scratch without distillation, which generates +action sequences conditioning on current observations and predicted keyposes +with fast inference speed. Simulated and real-world experimental results +demonstrate that the proposed approach surpasses baseline methods in terms of +success rate and operational efficiency. Codes are available at +https://github.com/ManUtdMoon/BiKC. + +
+
+ comment: Accepted by The 16th International Workshop on the Algorithmic + Foundations of Robotics (WAFR 2024) +
+
+
+
+
+ + ♻ ☆ NetMamba: Efficient Network Traffic Classification via Pre-training + Unidirectional Mamba + + +
+ Network traffic classification is a crucial research area aiming to enhance +service quality, streamline network management, and bolster cybersecurity. To +address the growing complexity of transmission encryption techniques, various +machine learning and deep learning methods have been proposed. However, +existing approaches face two main challenges. Firstly, they struggle with model +inefficiency due to the quadratic complexity of the widely used Transformer +architecture. Secondly, they suffer from inadequate traffic representation +because of discarding important byte information while retaining unwanted +biases. To address these challenges, we propose NetMamba, an efficient +linear-time state space model equipped with a comprehensive traffic +representation scheme. We adopt a specially selected and improved +unidirectional Mamba architecture for the networking field, instead of the +Transformer, to address efficiency issues. In addition, we design a traffic +representation scheme to extract valid information from massive traffic data +while removing biased information. Evaluation experiments on six public +datasets encompassing three main classification tasks showcase NetMamba's +superior classification performance compared to state-of-the-art baselines. It +achieves an accuracy rate of nearly 99% (some over 99%) in all tasks. +Additionally, NetMamba demonstrates excellent efficiency, improving inference +speed by up to 60 times while maintaining comparably low memory usage. +Furthermore, NetMamba exhibits superior few-shot learning abilities, achieving +better classification performance with fewer labeled data. To the best of our +knowledge, NetMamba is the first model to tailor the Mamba architecture for +networking. + +
+
+
+
+
+ + ♻ ☆ From Categories to Classifiers: Name-Only Continual Learning by + Exploring the Web + + +
+ Continual Learning (CL) often relies on the availability of extensive
+annotated datasets, an assumption that is unrealistic given how time-consuming
+and costly annotation is in practice. We explore a novel paradigm termed
+name-only continual learning where time and cost constraints prohibit manual
+annotation. In this scenario, learners adapt to new category shifts using only
+category names without the luxury of annotated training data. Our proposed
+solution leverages the expansive and ever-evolving internet to query and
+download uncurated webly-supervised data for image classification. We
+investigate the reliability of our web data and find them comparable, and in
+some cases superior, to manually annotated datasets. Additionally, we show
+that by harnessing the web, we can create support sets that surpass
+state-of-the-art name-only classification methods, which build support sets
+using generative models or image retrieval from LAION-5B, achieving up to a
+25% boost in accuracy. When applied across varied continual learning contexts,
+our method consistently exhibits a small performance gap in comparison to
+models trained on manually annotated datasets. We present EvoTrends, a
+class-incremental dataset made from the web to capture real-world trends,
+created in just minutes. Overall, this paper underscores the potential of
+using uncurated webly-supervised data to mitigate the challenges associated
+with manual data labeling in continual learning.
+
+
+
+
+
+ + ♻ ☆ A Systematic Review on Sleep Stage Classification and Sleep Disorder + Detection Using Artificial Intelligence + + +
+ Sleep is vital for people's physical and mental health, and sound sleep can
+help them focus on daily activities. Therefore, a sleep study that includes
+sleep patterns and sleep disorders is crucial to enhancing our knowledge about
+individuals' health status. This study aims to provide a comprehensive,
+systematic review of the recent literature to analyze the different approaches
+and their outcomes in sleep studies, covering works on "sleep stages
+classification" and "sleep disorder detection" using AI. In this review, 183
+articles were initially selected from different journals, among which 80
+records, ranging from 2016 to 2023, were shortlisted for detailed review.
+Brain waves were the most commonly employed body parameters for sleep staging
+and disorder studies (almost 29% of the research used brain activity signals
+exclusively, and 77% combined them with other signals). The convolutional
+neural network (CNN), the most widely used of the 34 distinct artificial
+intelligence models, comprised 27%. The other models included the long
+short-term memory (LSTM), support vector machine (SVM), random forest (RF),
+and recurrent neural network (RNN), which accounted for 11%, 6%, 6%, and 5%,
+respectively. Among performance metrics, accuracy was the most widely used,
+reported in 83.75% of the cases, followed by the F1 score (45%), Kappa
+(36.25%), Sensitivity (31.25%), and Specificity (30%), along with other
+metrics. This article will help physicians and researchers get the gist of
+AI's contribution to sleep studies and the feasibility of their intended work.
+
+
+ comment: 39 pages, 11 Figures, 8 Tables +
+
+
+
+
+ + ♻ ☆ The Fault in our Stars: Quality Assessment of Code Generation Benchmarks SC + + +
+ Large Language Models (LLMs) are gaining popularity among software engineers.
+A crucial aspect of developing effective code generation LLMs is to evaluate
+these models using a robust benchmark. Evaluation benchmarks with quality
+issues can provide a false sense of performance. In this work, we conduct a
+first-of-its-kind study of the quality of prompts within benchmarks used to
+compare the performance of different code generation models. To conduct this
+study, we analyzed 3,566 prompts from 9 code generation benchmarks to identify
+quality issues in them. We also investigated whether fixing the identified
+quality issues in the benchmarks' prompts affects a model's performance. We
+also studied memorization issues of the evaluation dataset, which can call a
+benchmark's trustworthiness into question. We found that code generation
+evaluation benchmarks mainly focused on Python and coding exercises and had
+very limited contextual dependencies to challenge the model. These datasets
+and the developers' prompts suffer from quality issues like spelling and
+grammatical errors, unclear sentences that fail to express developers' intent,
+and improper documentation style. Fixing all these issues in the benchmarks
+can lead to better performance for Python code generation, but no significant
+improvement was observed for Java code generation. We also found evidence that
+GPT-3.5-Turbo and CodeGen-2.5 models may have data contamination issues.
+
+
+ comment: Accepted at the 24th IEEE International Conference on Source Code + Analysis and Manipulation(SCAM 2024) Research Track +
+
+
+
+
+ + ♻ ☆ Sample Complexity of Variance-reduced Distributionally Robust Q-learning + + +
+ Dynamic decision-making under distributional shifts is of fundamental +interest in theory and applications of reinforcement learning: The distribution +of the environment in which the data is collected can differ from that of the +environment in which the model is deployed. This paper presents two novel +model-free algorithms, namely the distributionally robust Q-learning and its +variance-reduced counterpart, that can effectively learn a robust policy +despite distributional shifts. These algorithms are designed to efficiently +approximate the $q$-function of an infinite-horizon $\gamma$-discounted robust +Markov decision process with Kullback-Leibler ambiguity set to an entry-wise +$\epsilon$-degree of precision. Further, the variance-reduced distributionally +robust Q-learning combines the synchronous Q-learning with variance-reduction +techniques to enhance its performance. Consequently, we establish that it +attains a minimax sample complexity upper bound of $\tilde +O(|\mathbf{S}||\mathbf{A}|(1-\gamma)^{-4}\epsilon^{-2})$, where $\mathbf{S}$ +and $\mathbf{A}$ denote the state and action spaces. This is the first +complexity result that is independent of the ambiguity size $\delta$, thereby +providing new complexity theoretic insights. Additionally, a series of +numerical experiments confirm the theoretical findings and the efficiency of +the algorithms in handling distributional shifts. + +
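For intuition, the KL-robust backup that such algorithms estimate from samples admits the dual form sup over beta of -beta*log E[exp(-v/beta)] - beta*delta; the sketch below evaluates that dual with a simple 1-D search and illustrates the backup only, not the paper's variance-reduced algorithm.

```python
# Sketch of the KL-robust expectation used in distributionally robust
# Q-learning targets: the worst-case expectation over a KL ball of radius
# delta around the sampling distribution has the dual form
#   sup_{beta>0}  -beta * log E[exp(-x / beta)] - beta * delta,
# estimated here from samples with a bounded 1-D search over beta.
import numpy as np
from scipy.optimize import minimize_scalar

def kl_robust_expectation(samples, delta):
    samples = np.asarray(samples, dtype=float)
    def neg_dual(log_beta):
        beta = np.exp(log_beta)
        # log-mean-exp computed stably by shifting with the sample minimum
        lme = np.log(np.mean(np.exp((-samples + samples.min()) / beta))) - samples.min() / beta
        return -(-beta * lme - beta * delta)
    res = minimize_scalar(neg_dual, bounds=(-6.0, 6.0), method="bounded")
    return -res.fun

# Next-state values v(s') observed when sampling from the nominal environment:
values = np.array([1.0, 0.9, 1.1, 0.2, 1.0, 0.95])
print("nominal mean:", values.mean())
print("robust value (delta=0.1):", kl_robust_expectation(values, 0.1))
```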
+
+
+
+
+ + ♻ ☆ Predicting and Interpreting Energy Barriers of Metallic Glasses with + Graph Neural Networks ICML 2024 + + +
+ Metallic Glasses (MGs) are widely used materials that are stronger than steel
+while being shapeable as plastic. While understanding the structure-property
+relationship of MGs remains a challenge in materials science, studying their
+energy barriers (EBs) as an intermediary step shows promise. In this work, we
+utilize Graph Neural Networks (GNNs) to model MGs and study EBs. We contribute
+a new dataset for EB prediction and a novel Symmetrized GNN (SymGNN) model
+that is E(3)-invariant in expectation. SymGNN handles invariance by
+aggregating over orthogonal transformations of the graph structure. When
+applied to EB prediction, SymGNN is more accurate than molecular dynamics (MD)
+local-sampling methods and other machine-learning models. Compared to precise
+MD simulations, SymGNN reduces the inference time on new MGs from roughly 41
+days to less than one second. We apply explanation algorithms to reveal the
+relationship between structures and EBs. The structures that we identify
+through explanations match the medium-range order (MRO) hypothesis and possess
+unique topological properties. Our work enables effective prediction and
+interpretation of MG EBs, bolstering material science research.
+
+
+ comment: ICML 2024. Code available at https://github.com/haoyuli02/SymGNN +
+
+
+
+
+ + ♻ ☆ Semi-Decentralized Federated Edge Learning for Fast Convergence on + Non-IID Data + + +
+ Federated edge learning (FEEL) has emerged as an effective approach to reduce
+the large communication latency in cloud-based machine learning solutions,
+while preserving data privacy. Unfortunately, the learning performance of FEEL
+may be compromised due to limited training data in a single edge cluster. In
+this paper, we investigate a novel framework of FEEL, namely semi-decentralized
+federated edge learning (SD-FEEL). By allowing model aggregation across
+different edge clusters, SD-FEEL enjoys the benefit of FEEL in reducing the
+training latency, while improving the learning performance by accessing richer
+training data from multiple edge clusters. A training algorithm for SD-FEEL
+with three main procedures in each round is presented, including local model
+updates, intra-cluster and inter-cluster model aggregations, and it is proved
+to converge on non-independent and identically distributed (non-IID) data. We
+also characterize the interplay between the network topology of the edge
+servers and the communication overhead of inter-cluster model aggregation on
+the training performance. Experimental results corroborate our analysis and
+demonstrate the effectiveness of SD-FEEL in achieving faster convergence than
+traditional federated learning architectures. Besides, guidelines on choosing
+critical hyper-parameters of the training algorithm are also provided.
+
+
+
+
+
+ + ♻ ☆ EnsLoss: Stochastic Calibrated Loss Ensembles for Preventing Overfitting + in Classification + + +
+ Empirical risk minimization (ERM) with a computationally feasible surrogate
+loss is a widely accepted approach for classification. Notably, the convexity
+and calibration (CC) properties of a loss function ensure consistency of ERM
+in maximizing accuracy, thereby offering a wide range of options for surrogate
+losses. In this article, we propose a novel ensemble method, namely EnsLoss,
+which extends the ensemble learning concept to combine loss functions within
+the ERM framework. A key feature of our method is that it preserves the
+"legitimacy" of the combined losses, i.e., it ensures the CC properties.
+Specifically, we first transform the CC conditions of losses into conditions
+on loss derivatives, thereby bypassing the need for explicit loss functions
+and directly generating calibrated loss derivatives. Therefore, inspired by
+Dropout, EnsLoss enables loss ensembles through one training process with
+doubly stochastic gradient descent (i.e., random batch samples and random
+calibrated loss derivatives). We theoretically establish the statistical
+consistency of our approach and provide insights into its benefits. The
+numerical effectiveness of EnsLoss compared to fixed loss methods is
+demonstrated through experiments on a broad range of 14 OpenML tabular
+datasets and 46 image datasets with various deep learning architectures.
+Python repository and source code are available on GitHub at
+https://github.com/statmlben/ensloss.
+
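A crude stand-in for the loss-ensembling idea: instead of generating random calibrated loss-derivatives as EnsLoss does, the sketch below randomly draws one of several known convex, classification-calibrated surrogates per mini-batch, which conveys the "one training run, many losses" flavour without the paper's mechanism.

```python
# Crude stand-in: each mini-batch uses a randomly drawn convex, calibrated
# surrogate loss (hinge / logistic / exponential) on the margin y*f(x).
# EnsLoss itself generates random calibrated loss-*derivatives* directly;
# that mechanism is not reproduced here.
import torch
import torch.nn as nn

def hinge(margin):    return torch.clamp(1 - margin, min=0).mean()
def logistic(margin): return torch.nn.functional.softplus(-margin).mean()
def expo(margin):     return torch.exp(-margin.clamp(min=-10)).mean()

losses = [hinge, logistic, expo]

model = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 1))
opt = torch.optim.SGD(model.parameters(), lr=1e-2)

X = torch.randn(512, 20)
y = (X[:, 0] + X[:, 1] > 0).float() * 2 - 1        # labels in {-1, +1}

for step in range(200):
    idx = torch.randint(0, 512, (64,))              # random batch sample
    margin = y[idx] * model(X[idx]).squeeze(-1)      # y * f(x)
    loss_fn = losses[torch.randint(len(losses), (1,)).item()]  # random loss
    loss = loss_fn(margin)
    opt.zero_grad(); loss.backward(); opt.step()

acc = ((model(X).squeeze(-1) > 0).float() * 2 - 1 == y).float().mean()
print("train accuracy:", acc.item())
```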
+
+ comment: 31 pages; 4 figures +
+
+
+
+
+ + ♻ ☆ A Confidence Interval for the $\ell_2$ Expected Calibration Error + + +
+ Recent advances in machine learning have significantly improved prediction
+accuracy in various applications. However, ensuring the calibration of
+probabilistic predictions remains a significant challenge. Despite efforts to
+enhance model calibration, the rigorous statistical evaluation of model
+calibration remains less explored. In this work, we develop confidence
+intervals for the $\ell_2$ Expected Calibration Error (ECE). We consider
+top-1-to-$k$ calibration, which includes both the popular notion of confidence
+calibration as well as full calibration. For a debiased estimator of the ECE,
+we show asymptotic normality, but with different convergence rates and
+asymptotic variances for calibrated and miscalibrated models. We develop
+methods to construct asymptotically valid confidence intervals for the ECE,
+accounting for this behavior as well as non-negativity. Our theoretical
+findings are supported through extensive experiments, showing that our methods
+produce valid confidence intervals with shorter lengths compared to those
+obtained by resampling-based methods.
+
+
+
+
+
+ + ♻ ☆ A Novel Approach to Classify Power Quality Signals Using Vision + Transformers + + +
+ With the rapid integration of electronically interfaced renewable energy +resources and loads into smart grids, there is increasing interest in power +quality disturbances (PQD) classification to enhance the security and +efficiency of these grids. This paper introduces a new approach to PQD +classification based on the Vision Transformer (ViT) model. When a PQD occurs, +the proposed approach first converts the power quality signal into an image and +then utilizes a pre-trained ViT to accurately determine the class of the PQD. +Unlike most previous works, which were limited to a few disturbance classes or +small datasets, the proposed method is trained and tested on a large dataset +with 17 disturbance classes. Our experimental results show that the proposed +ViT-based approach achieves PQD classification precision and recall of 98.28% +and 97.98%, respectively, outperforming recently proposed techniques applied to +the same dataset. + +
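The abstract does not specify the signal-to-image transform; the sketch below uses a simple spectrogram rendering and a timm pretrained ViT re-headed for 17 classes to illustrate the overall pipeline shape only, not the paper's exact setup.

```python
# Illustrative pipeline only: turn a 1-D power-quality waveform into a 2-D
# image (here via a spectrogram) and classify it with a pretrained ViT whose
# head is replaced for 17 disturbance classes. The paper's actual transform
# and training procedure are not reproduced.
import numpy as np
import torch
import timm
from scipy.signal import spectrogram

def signal_to_image(x, fs=3200, size=224):
    _, _, S = spectrogram(x, fs=fs, nperseg=64, noverlap=48)
    S = np.log1p(S)
    S = (S - S.min()) / (S.max() - S.min() + 1e-12)             # scale to [0, 1]
    img = torch.tensor(S, dtype=torch.float32)[None, None]       # (1, 1, H, W)
    img = torch.nn.functional.interpolate(img, size=(size, size), mode="bilinear")
    return img.repeat(1, 3, 1, 1)                                 # fake RGB channels

model = timm.create_model("vit_base_patch16_224", pretrained=True, num_classes=17)
model.eval()

t = np.arange(0, 0.2, 1 / 3200)
waveform = np.sin(2 * np.pi * 50 * t) * (1 + 0.4 * (t > 0.1))   # toy voltage swell
with torch.no_grad():
    logits = model(signal_to_image(waveform))
print("predicted class:", logits.argmax(dim=-1).item())
```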
+
+ comment: IECON 2024-50th Annual Conference of the IEEE Industrial Electronics + Society, Chicago, U.S.A, 2024, pp. 1-6 +
+
+
+
+
+ + ♻ ☆ Graph-Based Bidirectional Transformer Decision Threshold Adjustment + Algorithm for Class-Imbalanced Molecular Data + + +
+ Data sets with imbalanced class sizes, where one class size is much smaller +than that of others, occur exceedingly often in many applications, including +those with biological foundations, such as disease diagnosis and drug +discovery. Therefore, it is extremely important to be able to identify data +elements of classes of various sizes, as a failure to do so can result in heavy +costs. Nonetheless, many data classification procedures do not perform well on +imbalanced data sets as they often fail to detect elements belonging to +underrepresented classes. In this work, we propose the BTDT-MBO algorithm, +incorporating Merriman-Bence-Osher (MBO) approaches and a bidirectional +transformer, as well as distance correlation and decision threshold +adjustments, for data classification tasks on highly imbalanced molecular data +sets, where the sizes of the classes vary greatly. The proposed technique not +only integrates adjustments in the classification threshold for the MBO +algorithm in order to help deal with the class imbalance, but also uses a +bidirectional transformer procedure based on an attention mechanism for +self-supervised learning. In addition, the model implements distance +correlation as a weight function for the similarity graph-based framework on +which the adjusted MBO algorithm operates. The proposed method is validated +using six molecular data sets and compared to other related techniques. The +computational experiments show that the proposed technique is superior to +competing approaches even in the case of a high class imbalance ratio. + +
+
+
+
+
+ + ♻ ☆ CCPL: Cross-modal Contrastive Protein Learning ICPR 2024 + + +
+ Effective protein representation learning is crucial for predicting protein +functions. Traditional methods often pretrain protein language models on large, +unlabeled amino acid sequences, followed by finetuning on labeled data. While +effective, these methods underutilize the potential of protein structures, +which are vital for function determination. Common structural representation +techniques rely heavily on annotated data, limiting their generalizability. +Moreover, structural pretraining methods, similar to natural language +pretraining, can distort actual protein structures. In this work, we introduce +a novel unsupervised protein structure representation pretraining method, +cross-modal contrastive protein learning (CCPL). CCPL leverages a robust +protein language model and uses unsupervised contrastive alignment to enhance +structure learning, incorporating self-supervised structural constraints to +maintain intrinsic structural information. We evaluated our model across +various benchmarks, demonstrating the framework's superiority. + +
+
+ comment: Accepted to ICPR 2024 +
+
+
+
+
+ + ♻ ☆ Diffusion-Driven Data Replay: A Novel Approach to Combat Forgetting in + Federated Class Continual Learning ECCV 2024 + + +
+ Federated Class Continual Learning (FCCL) merges the challenges of +distributed client learning with the need for seamless adaptation to new +classes without forgetting old ones. The key challenge in FCCL is catastrophic +forgetting, an issue that has been explored to some extent in Continual +Learning (CL). However, due to privacy preservation requirements, some +conventional methods, such as experience replay, are not directly applicable to +FCCL. Existing FCCL methods mitigate forgetting by generating historical data +through federated training of GANs or data-free knowledge distillation. +However, these approaches often suffer from unstable training of generators or +low-quality generated data, limiting their guidance for the model. To address +this challenge, we propose a novel method of data replay based on diffusion +models. Instead of training a diffusion model, we employ a pre-trained +conditional diffusion model to reverse-engineer each class, searching the +corresponding input conditions for each class within the model's input space, +significantly reducing computational resources and time consumption while +ensuring effective generation. Furthermore, we enhance the classifier's domain +generalization ability on generated and real data through contrastive learning, +indirectly improving the representational capability of generated data for real +data. Comprehensive experiments demonstrate that our method significantly +outperforms existing baselines. Code is available at +https://github.com/jinglin-liang/DDDR. + +
+
+ comment: Accepted by ECCV 2024 Oral +
+
+
+
+
+ + ♻ ☆ Small noise analysis for Tikhonov and RKHS regularizations + + +
+ Regularization plays a pivotal role in ill-posed machine learning and inverse +problems. However, the fundamental comparative analysis of various +regularization norms remains open. We establish a small noise analysis +framework to assess the effects of norms in Tikhonov and RKHS regularizations, +in the context of ill-posed linear inverse problems with Gaussian noise. This +framework studies the convergence rates of regularized estimators in the small +noise limit and reveals the potential instability of the conventional +L2-regularizer. We solve such instability by proposing an innovative class of +adaptive fractional RKHS regularizers, which covers the L2 Tikhonov and RKHS +regularizations by adjusting the fractional smoothness parameter. A surprising +insight is that over-smoothing via these fractional RKHSs consistently yields +optimal convergence rates, but the optimal hyper-parameter may decay too fast +to be selected in practice. + +
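+ For orientation, the generic Tikhonov-regularized estimator for a linear inverse problem y = Ax + noise is recalled below; the paper's adaptive fractional RKHS regularizers correspond to particular choices of the penalty operator C, whose exact definition is given in the paper and not reproduced here.

```latex
% Generic Tikhonov estimator with penalty norm \|x\|_{C}^{2} = x^{\top} C x; the
% adaptive fractional RKHS regularizers in the paper correspond to a family of C.
\hat{x}_{\lambda} \;=\; \arg\min_{x}\; \|A x - y\|_{2}^{2} + \lambda\,\|x\|_{C}^{2}
\;=\; \bigl(A^{\top} A + \lambda\, C\bigr)^{-1} A^{\top} y .
```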
+
+
+
+
+ + ♻ ☆ Stacked ensemble-based mutagenicity prediction model using multiple + modalities with graph attention network + + +
+ Mutagenicity is a concern due to its association with genetic mutations, which can result in a variety of negative consequences, including the development of cancer. Earlier identification of mutagenic compounds in the drug development process is therefore crucial for preventing the progression of unsafe candidates and reducing development costs. While computational techniques, especially machine learning models, have become increasingly prevalent for this endpoint, they rely on a single modality. In this work, we introduce a novel stacked ensemble-based mutagenicity prediction model which incorporates multiple modalities such as the simplified molecular input line entry system (SMILES) and the molecular graph. These modalities capture diverse information about molecules, such as substructural, physicochemical, geometrical and topological properties. To derive substructural, geometrical and physicochemical information, we use SMILES, while topological information is extracted through a graph attention network (GAT) via the molecular graph. Our model uses a stacked ensemble of machine learning classifiers to make predictions using these multiple features. We employ the explainable artificial intelligence (XAI) technique SHAP (Shapley Additive Explanations) to determine the significance of each classifier and the most relevant features in the prediction. We demonstrate that our method surpasses SOTA methods on two standard datasets across various metrics. Notably, we achieve an area under the curve of 95.21% on the Hansen benchmark dataset, affirming the efficacy of our method in predicting mutagenicity. We believe that this research will captivate the interest of both clinicians and computational biologists engaged in translational research. +
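+ A minimal scikit-learn sketch of the stacked-ensemble part described above; the SMILES-derived descriptors and GAT graph embeddings are replaced by random placeholder matrices, and the SHAP analysis is omitted.

```python
# Sketch of a stacked ensemble over two precomputed feature blocks (stand-ins for
# SMILES-derived descriptors and GAT graph embeddings). Feature extraction and the
# SHAP significance analysis from the abstract are not shown.
import numpy as np
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

rng = np.random.default_rng(0)
descriptors = rng.normal(size=(500, 64))   # placeholder physicochemical descriptors
graph_embed = rng.normal(size=(500, 32))   # placeholder GAT embeddings
X = np.hstack([descriptors, graph_embed])
y = rng.integers(0, 2, size=500)           # mutagenic / non-mutagenic labels (toy)

stack = StackingClassifier(
    estimators=[("rf", RandomForestClassifier(n_estimators=200, random_state=0)),
                ("svm", SVC(probability=True, random_state=0))],
    final_estimator=LogisticRegression(max_iter=1000),
    stack_method="predict_proba",
)
stack.fit(X, y)
print(stack.predict_proba(X[:3]))
```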
+
+ comment: Submitted to a journal +
+
+
+
+
+ + ♻ ☆ OpenVLA: An Open-Source Vision-Language-Action Model + + +
+ Large policies pretrained on a combination of Internet-scale vision-language +data and diverse robot demonstrations have the potential to change how we teach +robots new skills: rather than training new behaviors from scratch, we can +fine-tune such vision-language-action (VLA) models to obtain robust, +generalizable policies for visuomotor control. Yet, widespread adoption of VLAs +for robotics has been challenging as 1) existing VLAs are largely closed and +inaccessible to the public, and 2) prior work fails to explore methods for +efficiently fine-tuning VLAs for new tasks, a key component for adoption. +Addressing these challenges, we introduce OpenVLA, a 7B-parameter open-source +VLA trained on a diverse collection of 970k real-world robot demonstrations. +OpenVLA builds on a Llama 2 language model combined with a visual encoder that +fuses pretrained features from DINOv2 and SigLIP. As a product of the added +data diversity and new model components, OpenVLA demonstrates strong results +for generalist manipulation, outperforming closed models such as RT-2-X (55B) +by 16.5% in absolute task success rate across 29 tasks and multiple robot +embodiments, with 7x fewer parameters. We further show that we can effectively +fine-tune OpenVLA for new settings, with especially strong generalization +results in multi-task environments involving multiple objects and strong +language grounding abilities, and outperform expressive from-scratch imitation +learning methods such as Diffusion Policy by 20.4%. We also explore compute +efficiency; as a separate contribution, we show that OpenVLA can be fine-tuned +on consumer GPUs via modern low-rank adaptation methods and served efficiently +via quantization without a hit to downstream success rate. Finally, we release +model checkpoints, fine-tuning notebooks, and our PyTorch codebase with +built-in support for training VLAs at scale on Open X-Embodiment datasets. + +
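+ As a hedged illustration of the kind of low-rank-adaptation fine-tuning mentioned above, here is a generic Hugging Face peft recipe; the checkpoint name is a placeholder backbone, not the OpenVLA release, and the target module names are assumptions.

```python
# Generic parameter-efficient (LoRA) fine-tuning setup with Hugging Face peft.
# This is not the OpenVLA fine-tuning code: the checkpoint name is a generic
# placeholder and the target_modules names are assumptions about the backbone.
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",          # placeholder backbone, not the OpenVLA checkpoint
    torch_dtype=torch.bfloat16,
)
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],   # attention projections (assumed module names)
    task_type="CAUSAL_LM",
)
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()         # only the low-rank adapters are trainable
```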
+
+ comment: Website: https://openvla.github.io/ +
+
+
+
+
+ + ♻ ☆ SELF-[IN]CORRECT: LLMs Struggle with Discriminating Self-Generated + Responses + + +
+ Can LLMs consistently improve their previous outputs for better results? For +this to be true, LLMs would need to be better at discriminating among +previously-generated alternatives, than generating initial responses. We +explore the validity of this hypothesis in practice. We first formulate a +unified framework that allows us to compare the generative and discriminative +capability of any model on any task. In our resulting experimental analysis of +several open-source and industrial LLMs, we observe that models are not +reliably better at discriminating among previously-generated alternatives than +generating initial responses. This finding challenges the notion that LLMs may +be able to enhance their performance only through their own judgment. + +
+
+
+
+
+ + ♻ ☆ Thresholded Lexicographic Ordered Multiobjective Reinforcement Learning ECAI 2024 + + +
+ Lexicographic multi-objective problems, which impose a lexicographic +importance order over the objectives, arise in many real-life scenarios. +Existing Reinforcement Learning work directly addressing lexicographic tasks +has been scarce. The few proposed approaches were all noted to be heuristics +without theoretical guarantees as the Bellman equation is not applicable to +them. Additionally, the practical applicability of these prior approaches also +suffers from various issues such as not being able to reach the goal state. +While some of these issues have been known before, in this work we investigate +further shortcomings, and propose fixes for improving practical performance in +many cases. We also present a policy optimization approach using our +Lexicographic Projection Optimization (LPO) algorithm that has the potential to +address these theoretical and practical concerns. Finally, we demonstrate our +proposed algorithms on benchmark problems. + +
+
+ comment: Full version of ECAI 2024 paper +
+
+
+
+
+ + ♻ ☆ LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet + + +
+ Recent large language model (LLM) defenses have greatly improved models' +ability to refuse harmful queries, even when adversarially attacked. However, +LLM defenses are primarily evaluated against automated adversarial attacks in a +single turn of conversation, an insufficient threat model for real-world +malicious use. We demonstrate that multi-turn human jailbreaks uncover +significant vulnerabilities, exceeding 70% attack success rate (ASR) on +HarmBench against defenses that report single-digit ASRs with automated +single-turn attacks. Human jailbreaks also reveal vulnerabilities in machine +unlearning defenses, successfully recovering dual-use biosecurity knowledge +from unlearned models. We compile these results into Multi-Turn Human +Jailbreaks (MHJ), a dataset of 2,912 prompts across 537 multi-turn jailbreaks. +We publicly release MHJ alongside a compendium of jailbreak tactics developed +across dozens of commercial red teaming engagements, supporting research +towards stronger LLM defenses. + +
+
+
+
+
+ + ♻ ☆ Anchored Preference Optimization and Contrastive Revisions: Addressing + Underspecification in Alignment + + +
+ Large Language Models (LLMs) are often aligned using contrastive alignment +objectives and preference pair datasets. The interaction between model, paired +data, and objective makes alignment a complicated procedure, sometimes +producing subpar results. We study this and find that (i) preference data gives +a better learning signal when the underlying responses are contrastive, and +(ii) alignment objectives lead to better performance when they specify more +control over the model during training. Based on these insights, we introduce +Contrastive Learning from AI Revisions (CLAIR), a data-creation method which +leads to more contrastive preference pairs, and Anchored Preference +Optimization (APO), a controllable and more stable alignment objective. We +align Llama-3-8B-Instruct using various comparable datasets and alignment +objectives and measure MixEval-Hard scores, which correlate highly with human +judgments. The CLAIR preferences lead to the strongest performance out of all +datasets, and APO consistently outperforms less controllable objectives. Our +best model, trained on 32K CLAIR preferences with APO, improves +Llama-3-8B-Instruct by 7.65%, closing the gap with GPT4-turbo by 45%. Our code +is available at https://github.com/ContextualAI/CLAIR_and_APO. + +
+
+
+
+
+ + ♻ ☆ From Lab to Field: Real-World Evaluation of an AI-Driven Smart Video + Solution to Enhance Community Safety + + +
+ This article adopts and evaluates an AI-enabled Smart Video Solution (SVS) +designed to enhance safety in the real world. The system integrates with +existing infrastructure camera networks, leveraging recent advancements in AI +for easy adoption. Prioritizing privacy and ethical standards, pose based data +is used for downstream AI tasks such as anomaly detection. Cloud-based +infrastructure and mobile app are deployed, enabling real-time alerts within +communities. The SVS employs innovative data representation and visualization +techniques, such as the Occupancy Indicator, Statistical Anomaly Detection, +Bird's Eye View, and Heatmaps, to understand pedestrian behaviors and enhance +public safety. Evaluation of the SVS demonstrates its capacity to convert +complex computer vision outputs into actionable insights for stakeholders, +community partners, law enforcement, urban planners, and social scientists. +This article presents a comprehensive real-world deployment and evaluation of +the SVS, implemented in a community college environment across 16 cameras. The +system integrates AI-driven visual processing, supported by statistical +analysis, database management, cloud communication, and user notifications. +Additionally, the article evaluates the end-to-end latency from the moment an +AI algorithm detects anomalous behavior in real-time at the camera level to the +time stakeholders receive a notification. The results demonstrate the system's +robustness, effectively managing 16 CCTV cameras with a consistent throughput +of 16.5 frames per second (FPS) over a 21-hour period and an average end-to-end +latency of 26.76 seconds between anomaly detection and alert issuance. + +
+
+
+
+
+ + ♻ ☆ Spectral-Aware Augmentation for Enhanced Graph Representation Learning + + +
+ Graph Contrastive Learning (GCL) has demonstrated remarkable effectiveness in +learning representations on graphs in recent years. To generate ideal +augmentation views, the augmentation generation methods should preserve +essential information while discarding less relevant details for downstream +tasks. However, current augmentation methods usually involve random topology +corruption in the spatial domain, which fails to adequately address information +spread across different frequencies in the spectral domain. Our preliminary +study highlights this issue, demonstrating that spatial random perturbations +impact all frequency bands almost uniformly. Given that task-relevant +information typically resides in specific spectral regions that vary across +graphs, this one-size-fits-all approach can pose challenges. We argue that +indiscriminate spatial random perturbation might unintentionally weaken +task-relevant information, reducing its effectiveness. + To tackle this challenge, we propose applying perturbations selectively, +focusing on information specific to different frequencies across diverse +graphs. In this paper, we present GASSER, a model that applies tailored +perturbations to specific frequencies of graph structures in the spectral +domain, guided by spectral hints. Through extensive experimentation and +theoretical analysis, we demonstrate that the augmentation views generated by +GASSER are adaptive, controllable, and intuitively aligned with the homophily +ratios and spectrum of graph structures. + +
+
+
+
+
+ + ♻ ☆ GCEPNet: Graph Convolution-Enhanced Expectation Propagation for Massive + MIMO Detection + + +
+ Massive MIMO (multiple-input multiple-output) detection is an important topic +in wireless communication and various machine learning based methods have been +developed recently for this task. Expectation Propagation (EP) and its variants +are widely used for MIMO detection and have achieved the best performance. +However, EP-based solvers fail to capture the correlation between unknown +variables, leading to a loss of information, and in addition, they are +computationally expensive. In this paper, we show that the real-valued system +can be modeled as spectral signal convolution on graph, through which the +correlation between unknown variables can be captured. Based on such analysis, +we propose graph convolution-enhanced expectation propagation (GCEPNet). +GCEPNet incorporates data-dependent attention scores into Chebyshev polynomial +for powerful graph convolution with better generalization capacity. It enables +a better estimation of the cavity distribution for EP and empirically achieves +the state-of-the-art (SOTA) MIMO detection performance with much faster +inference speed. To our knowledge, we are the first to shed light on the +connection between the system model and graph convolution, and the first to +design the data-dependent coefficients for graph convolution. + +
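+ To make the graph-convolution building block concrete, here is a plain Chebyshev-polynomial graph filter in PyTorch; GCEPNet's data-dependent attention coefficients are not reproduced, only the underlying fixed-coefficient filter.

```python
# Plain Chebyshev-polynomial graph convolution with fixed learned coefficients.
# GCEPNet makes the polynomial coefficients data-dependent via attention; that part
# is not reproduced here -- this is only the underlying graph filter.
import torch
import torch.nn as nn

class ChebConv(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, K: int = 3):
        super().__init__()
        self.theta = nn.Parameter(torch.randn(K, in_dim, out_dim) * 0.1)

    def forward(self, L_hat: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
        # L_hat: (n, n) rescaled Laplacian with spectrum roughly in [-1, 1]; x: (n, in_dim)
        Tk_prev, Tk = x, L_hat @ x
        out = Tk_prev @ self.theta[0] + Tk @ self.theta[1]
        for k in range(2, self.theta.shape[0]):
            Tk_prev, Tk = Tk, 2 * L_hat @ Tk - Tk_prev     # Chebyshev recurrence
            out = out + Tk @ self.theta[k]
        return out

n = 8
A = (torch.rand(n, n) > 0.6).float(); A = ((A + A.t()) > 0).float(); A.fill_diagonal_(0)
deg = A.sum(1).clamp(min=1)
L = torch.eye(n) - A / deg.sqrt().outer(deg.sqrt())        # normalized Laplacian
L_hat = L - torch.eye(n)                                   # crude rescaling to ~[-1, 1]
print(ChebConv(4, 2)(L_hat, torch.randn(n, 4)).shape)      # torch.Size([8, 2])
```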
+
+ comment: In IEEE GLOBECOM 2024 Conference Proceedings +
+
+
+
+
+ + ♻ ☆ OceanNet: A principled neural operator-based digital twin for regional + oceans + + +
+ While data-driven approaches demonstrate great potential in atmospheric +modeling and weather forecasting, ocean modeling poses distinct challenges due +to complex bathymetry, land, vertical structure, and flow non-linearity. This +study introduces OceanNet, a principled neural operator-based digital twin for +ocean circulation. OceanNet uses a Fourier neural operator and +predictor-evaluate-corrector integration scheme to mitigate autoregressive +error growth and enhance stability over extended time scales. A spectral +regularizer counteracts spectral bias at smaller scales. OceanNet is applied to +the northwest Atlantic Ocean western boundary current (the Gulf Stream), +focusing on the task of seasonal prediction for Loop Current eddies and the +Gulf Stream meander. Trained using historical sea surface height (SSH) data, +OceanNet demonstrates competitive forecast skill by outperforming SSH +predictions by an uncoupled, state-of-the-art dynamical ocean model forecast, +reducing computation by 500,000 times. These accomplishments demonstrate the +potential of physics-inspired deep neural operators as cost-effective +alternatives to high-resolution numerical ocean models. + +
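+ A minimal 1-D spectral-convolution layer in the Fourier-neural-operator style is sketched below; OceanNet itself uses a 2-D operator with a predictor-evaluate-corrector scheme and a spectral regularizer, none of which are shown here.

```python
# Minimal 1-D spectral convolution layer in the Fourier-neural-operator style.
# OceanNet uses a 2-D operator plus a predictor-evaluate-corrector scheme and a
# spectral regularizer; only the "learn weights on low Fourier modes" idea is shown.
import torch
import torch.nn as nn

class SpectralConv1d(nn.Module):
    def __init__(self, channels: int, modes: int):
        super().__init__()
        self.modes = modes
        scale = 1.0 / channels
        self.weight = nn.Parameter(
            scale * torch.randn(channels, channels, modes, dtype=torch.cfloat))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, channels, grid)
        x_ft = torch.fft.rfft(x)                              # (b, c, grid//2 + 1)
        out_ft = torch.zeros_like(x_ft)
        out_ft[..., :self.modes] = torch.einsum(
            "bci,coi->boi", x_ft[..., :self.modes], self.weight)
        return torch.fft.irfft(out_ft, n=x.shape[-1])

layer = SpectralConv1d(channels=4, modes=8)
print(layer(torch.randn(2, 4, 64)).shape)   # torch.Size([2, 4, 64])
```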
+
+ comment: Supplementary information can be found in: + https://drive.google.com/file/d/1NoxJLa967naJT787a5-IfZ7f_MmRuZMP/view?usp=sharing +
+
+
+
+
+ + ♻ ☆ Counterpart Fairness -- Addressing Systematic between-group Differences + in Fairness Evaluation + + +
+ When using machine learning (ML) to aid decision-making, it is critical to +ensure that an algorithmic decision is fair and does not discriminate against +specific individuals/groups, particularly those from underprivileged +populations. Existing group fairness methods aim to ensure equal outcomes (such +as loan approval rates) across groups delineated by protected variables like +race or gender. However, these methods overlook the intricate, inherent +differences among these groups that could influence outcomes. The confounding +factors, which are non-protected variables but manifest systematic differences, +can significantly affect fairness evaluation. Therefore, we recommend a more +refined and comprehensive approach that accounts for both the systematic +differences within groups and the multifaceted, intertwined confounding +effects. We proposed a fairness metric based on counterparts (i.e., individuals +who are similar with respect to the task of interest) from different groups, +whose group identities cannot be distinguished algorithmically by exploring +confounding factors. We developed a propensity-score-based method for +identifying counterparts, avoiding the issue of comparing "oranges" with +"apples". In addition, we introduced a counterpart-based statistical fairness +index, called Counterpart-Fairness (CFair), to assess the fairness of ML +models. Various empirical studies were conducted to validate the effectiveness +of CFair. + +
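+ A generic sketch of the propensity-score step for finding counterparts across two groups is given below; the CFair index computed over the matched pairs is defined in the paper and not reproduced here.

```python
# Generic propensity-score matching of "counterparts" across two groups.
# The CFair index itself (computed over matched pairs) is not reproduced; this
# only shows the propensity-score + nearest-neighbour matching step on toy data.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))                                # non-protected covariates
group = (X[:, 0] + rng.normal(size=1000) > 0).astype(int)     # protected attribute (toy)

# Propensity score: probability of belonging to group 1 given the covariates.
ps = LogisticRegression(max_iter=1000).fit(X, group).predict_proba(X)[:, 1]

ps0, ps1 = ps[group == 0].reshape(-1, 1), ps[group == 1].reshape(-1, 1)
nn_idx = NearestNeighbors(n_neighbors=1).fit(ps0).kneighbors(ps1, return_distance=False)
idx0, idx1 = np.where(group == 0)[0], np.where(group == 1)[0]
pairs = list(zip(idx1, idx0[nn_idx[:, 0]]))   # (group-1 sample, its group-0 counterpart)
print(pairs[:5])
```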
+
+ comment: 24 pages, 9 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Towards Understanding Neural Collapse: The Effects of Batch + Normalization and Weight Decay + + +
+ Neural Collapse (NC) is a geometric structure recently observed at the +terminal phase of training deep neural networks, which states that last-layer +feature vectors for the same class would "collapse" to a single point, while +features of different classes become equally separated. We demonstrate that +batch normalization (BN) and weight decay (WD) critically influence the +emergence of NC. In the near-optimal loss regime, we establish an asymptotic +lower bound on the emergence of NC that depends only on the WD value, training +loss, and the presence of last-layer BN. Our experiments substantiate +theoretical insights by showing that models demonstrate a stronger presence of +NC with BN, appropriate WD values, lower loss, and lower last-layer feature +norm. Our findings offer a novel perspective in studying the role of BN and WD +in shaping neural network features. + +
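+ For readers who want a concrete handle on neural collapse, the snippet below computes one commonly used NC statistic (within-class scatter relative to between-class scatter); this is a standard diagnostic, not the specific bound derived in the paper.

```python
# One commonly used neural-collapse statistic (NC1): within-class covariance measured
# relative to between-class covariance, Tr(S_W S_B^+) / C. A standard diagnostic only;
# the paper's asymptotic lower bound is not reproduced here.
import numpy as np

def nc1(features: np.ndarray, labels: np.ndarray) -> float:
    classes = np.unique(labels)
    global_mean = features.mean(axis=0)
    d = features.shape[1]
    S_W, S_B = np.zeros((d, d)), np.zeros((d, d))
    for c in classes:
        fc = features[labels == c]
        mu_c = fc.mean(axis=0)
        S_W += (fc - mu_c).T @ (fc - mu_c) / len(features)
        S_B += np.outer(mu_c - global_mean, mu_c - global_mean) * len(fc) / len(features)
    return float(np.trace(S_W @ np.linalg.pinv(S_B)) / len(classes))

feats = np.random.randn(300, 16) + np.repeat(np.random.randn(3, 16) * 5, 100, axis=0)
labels = np.repeat(np.arange(3), 100)
print(nc1(feats, labels))   # smaller value => stronger within-class collapse
```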
+
+
+
+
+ + ♻ ☆ Language-Guided World Models: A Model-Based Approach to AI Control ACL 2024 + + +
+ This paper introduces the concept of Language-Guided World Models (LWMs) -- +probabilistic models that can simulate environments by reading texts. Agents +equipped with these models provide humans with more extensive and efficient +control, allowing them to simultaneously alter agent behaviors in multiple +tasks via natural verbal communication. In this work, we take initial steps in +developing robust LWMs that can generalize to compositionally novel language +descriptions. We design a challenging world modeling benchmark based on the +game of MESSENGER (Hanjie et al., 2021), featuring evaluation settings that +require varying degrees of compositional generalization. Our experiments reveal +the lack of generalizability of the state-of-the-art Transformer model, as it +offers marginal improvements in simulation quality over a no-text baseline. We +devise a more robust model by fusing the Transformer with the EMMA attention +mechanism (Hanjie et al., 2021). Our model substantially outperforms the +Transformer and approaches the performance of a model with an oracle semantic +parsing and grounding capability. To demonstrate the practicality of this model +in improving AI safety and transparency, we simulate a scenario in which the +model enables an agent to present plans to a human before execution, and to +revise plans based on their language feedback. + +
+
+ comment: SpLU-RoboNLP workshop at ACL 2024 +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ LongLLaVA: Scaling Multi-modal LLMs to 1000 Images Efficiently via + Hybrid Architecture + + +
+ Expanding the long-context capabilities of Multi-modal Large Language Models (MLLMs) is crucial for video understanding, high-resolution image understanding, and multi-modal agents. This involves a series of systematic optimizations, including model architecture, data construction and training strategy, particularly addressing challenges such as degraded performance with more images and high computational costs. In this paper, we adapt the model architecture to a hybrid of Mamba and Transformer blocks, approach data construction with both temporal and spatial dependencies among multiple images and employ a progressive training strategy. The released model LongLLaVA (Long-Context Large Language and Vision Assistant) is the first hybrid MLLM, which achieves a better balance between efficiency and effectiveness. LongLLaVA not only achieves competitive results across various benchmarks, but also maintains high throughput and low memory consumption. Notably, it can process nearly a thousand images on a single A100 80GB GPU, showing promising application prospects for a wide range of tasks. +
+
+ comment: 19 pages, 7 figures, 6 tables +
+
+
+
+
+ + ☆ Multi-Track MusicLDM: Towards Versatile Music Generation with Latent + Diffusion Model + + +
+ Diffusion models have shown promising results in cross-modal generation tasks +involving audio and music, such as text-to-sound and text-to-music generation. +These text-controlled music generation models typically focus on generating +music by capturing global musical attributes like genre and mood. However, +music composition is a complex, multilayered task that often involves musical +arrangement as an integral part of the process. This process involves composing +each instrument to align with existing ones in terms of beat, dynamics, +harmony, and melody, requiring greater precision and control over tracks than +text prompts usually provide. In this work, we address these challenges by +extending the MusicLDM, a latent diffusion model for music, into a multi-track +generative model. By learning the joint probability of tracks sharing a +context, our model is capable of generating music across several tracks that +correspond well to each other, either conditionally or unconditionally. +Additionally, our model is capable of arrangement generation, where the model +can generate any subset of tracks given the others (e.g., generating a piano +track complementing given bass and drum tracks). We compared our model with an +existing multi-track generative model and demonstrated that our model achieves +considerable improvements across objective metrics for both total and +arrangement generation tasks. + +
+
+
+
+
+ + ☆ ExpLLM: Towards Chain of Thought for Facial Expression Recognition + + +
+ Facial expression recognition (FER) is a critical task in multimedia with +significant implications across various domains. However, analyzing the causes +of facial expressions is essential for accurately recognizing them. Current +approaches, such as those based on facial action units (AUs), typically provide +AU names and intensities but lack insight into the interactions and +relationships between AUs and the overall expression. In this paper, we propose +a novel method called ExpLLM, which leverages large language models to generate +an accurate chain of thought (CoT) for facial expression recognition. +Specifically, we have designed the CoT mechanism from three key perspectives: +key observations, overall emotional interpretation, and conclusion. The key +observations describe the AU's name, intensity, and associated emotions. The +overall emotional interpretation provides an analysis based on multiple AUs and +their interactions, identifying the dominant emotions and their relationships. +Finally, the conclusion presents the final expression label derived from the +preceding analysis. Furthermore, we also introduce the Exp-CoT Engine, designed +to construct this expression CoT and generate instruction-description data for +training our ExpLLM. Extensive experiments on the RAF-DB and AffectNet datasets +demonstrate that ExpLLM outperforms current state-of-the-art FER methods. +ExpLLM also surpasses the latest GPT-4o in expression CoT generation, +particularly in recognizing micro-expressions where GPT-4o frequently fails. + +
+
+ comment: project page: https://starhiking.github.io/ExpLLM_Page/ +
+
+
+
+
+ + ☆ PoseTalk: Text-and-Audio-based Pose Control and Motion Refinement for + One-Shot Talking Head Generation + + +
+ While previous audio-driven talking head generation (THG) methods generate head poses from driving audio, the generated poses or lips cannot match the audio well or are not editable. In this study, we propose PoseTalk, a THG system that can freely generate lip-synchronized talking head videos with free head poses conditioned on text prompts and audio. The core insight of our method is using head pose to connect visual, linguistic, and audio signals. First, we propose to generate poses from both audio and text prompts, where the audio offers short-term variations and rhythm correspondence of the head movements and the text prompts describe the long-term semantics of head motions. To achieve this goal, we devise a Pose Latent Diffusion (PLD) model to generate motion latent from text prompts and audio cues in a pose latent space. Second, we observe a loss-imbalance problem: the loss for the lip region contributes less than 4% of the total reconstruction loss caused by both pose and lip, making optimization lean towards head movements rather than lip shapes. To address this issue, we propose a refinement-based learning strategy to synthesize natural talking videos using two cascaded networks, i.e., CoarseNet and RefineNet. The CoarseNet estimates coarse motions to produce animated images in novel poses and the RefineNet focuses on learning finer lip motions by progressively estimating lip motions from low-to-high resolutions, yielding improved lip-synchronization performance. Experiments demonstrate our pose prediction strategy achieves better pose diversity and realness compared to text-only or audio-only, and our video generator model outperforms state-of-the-art methods in synthesizing talking videos with natural head motions. Project: https://junleen.github.io/projects/posetalk. +
+
+ comment: 7+5 pages, 15 figures +
+
+
+
+
+ + ☆ Low-Resolution Object Recognition with Cross-Resolution Relational + Contrastive Distillation + + +
+ Recognizing objects in low-resolution images is a challenging task due to the +lack of informative details. Recent studies have shown that knowledge +distillation approaches can effectively transfer knowledge from a +high-resolution teacher model to a low-resolution student model by aligning +cross-resolution representations. However, these approaches still face +limitations in adapting to the situation where the recognized objects exhibit +significant representation discrepancies between training and testing images. +In this study, we propose a cross-resolution relational contrastive +distillation approach to facilitate low-resolution object recognition. Our +approach enables the student model to mimic the behavior of a well-trained +teacher model which delivers high accuracy in identifying high-resolution +objects. To extract sufficient knowledge, the student learning is supervised +with contrastive relational distillation loss, which preserves the similarities +in various relational structures in contrastive representation space. In this +manner, the capability of recovering missing details of familiar low-resolution +objects can be effectively enhanced, leading to a better knowledge transfer. +Extensive experiments on low-resolution object classification and +low-resolution face recognition clearly demonstrate the effectiveness and +adaptability of our approach. + +
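+ A hedged sketch of a relational distillation loss in this spirit: the low-resolution student is pushed to reproduce the high-resolution teacher's pairwise similarity structure; the paper's exact contrastive relational formulation may differ.

```python
# Sketch of a relational distillation loss: the student (low-resolution branch) is
# trained to match the teacher's (high-resolution branch) pairwise similarity
# structure. A simple KL variant; the paper's contrastive formulation may differ.
import torch
import torch.nn.functional as F

def relational_distill_loss(f_student: torch.Tensor, f_teacher: torch.Tensor,
                            tau: float = 0.1) -> torch.Tensor:
    zs = F.normalize(f_student, dim=1)
    zt = F.normalize(f_teacher, dim=1)
    # Row-wise similarity distributions over the batch (diagonal kept for simplicity).
    p_teacher = F.softmax(zt @ zt.t() / tau, dim=1)
    log_p_student = F.log_softmax(zs @ zs.t() / tau, dim=1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean")

student_feats = torch.randn(32, 128, requires_grad=True)   # from low-resolution inputs
teacher_feats = torch.randn(32, 128)                        # from high-resolution inputs
loss = relational_distill_loss(student_feats, teacher_feats)
loss.backward()
print(float(loss))
```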
+
+ comment: This paper is accepted by IEEE Transactions on Circuits and Systems + for Video Technology (TCSVT) +
+
+
+
+
+ + ☆ FrameCorr: Adaptive, Autoencoder-based Neural Compression for Video + Reconstruction in Resource and Timing Constrained Network Settings + + +
+ Despite the growing adoption of video processing via Internet of Things (IoT) devices due to their cost-effectiveness, transmitting captured data to nearby servers poses challenges due to varying timing constraints and scarcity of network bandwidth. Existing video compression methods face difficulties in recovering compressed data when incomplete data is provided. Here, we introduce FrameCorr, a deep-learning-based solution that utilizes previously received data to predict the missing segments of a frame, enabling the reconstruction of a frame from partially received data. +
+
+
+
+
+ + ☆ Coral Model Generation from Single Images for Virtual Reality + Applications + + +
+ With the rapid development of VR technology, the demand for high-quality 3D models is increasing. Traditional methods struggle with efficiency and quality in large-scale customization. This paper introduces a deep-learning framework that generates high-precision 3D coral models from a single image. Using the Coral dataset, the framework extracts geometric and texture features, performs 3D reconstruction, and optimizes design and material blending. Advanced optimization and polygon count control ensure shape accuracy, detail retention, and flexible output for various complexities, catering to high-quality rendering and real-time interaction needs. The project incorporates Explainable AI (XAI) to transform AI-generated models into interactive "artworks," best viewed in VR and XR. This enhances model interpretability and human-machine collaboration. Real-time feedback in VR interactions displays information like coral species and habitat, enriching user experience. The generated models surpass traditional methods in detail, visual quality, and efficiency. This research offers an intelligent approach to 3D content creation for VR, lowering production barriers and promoting widespread VR applications. Additionally, integrating XAI provides new insights into AI-generated visual content and advances research in 3D vision interpretability. +
+
+ comment: In Proceedings of Explainable AI for the Arts Workshop 2024 (XAIxArts + 2024) arXiv:2406.14485 +
+
+
+
+
+ + ♻ ☆ Hand1000: Generating Realistic Hands from Text with Only 1,000 Images + + +
+ Text-to-image generation models have achieved remarkable advancements in +recent years, aiming to produce realistic images from textual descriptions. +However, these models often struggle with generating anatomically accurate +representations of human hands. The resulting images frequently exhibit issues +such as incorrect numbers of fingers, unnatural twisting or interlacing of +fingers, or blurred and indistinct hands. These issues stem from the inherent +complexity of hand structures and the difficulty in aligning textual +descriptions with precise visual depictions of hands. To address these +challenges, we propose a novel approach named Hand1000 that enables the +generation of realistic hand images with target gesture using only 1,000 +training samples. The training of Hand1000 is divided into three stages with +the first stage aiming to enhance the model's understanding of hand anatomy by +using a pre-trained hand gesture recognition model to extract gesture +representation. The second stage further optimizes text embedding by +incorporating the extracted hand gesture representation, to improve alignment +between the textual descriptions and the generated hand images. The third stage +utilizes the optimized embedding to fine-tune the Stable Diffusion model to +generate realistic hand images. In addition, we construct the first publicly +available dataset specifically designed for text-to-hand image generation. +Based on the existing hand gesture recognition dataset, we adopt advanced image +captioning models and LLaMA3 to generate high-quality textual descriptions +enriched with detailed gesture information. Extensive experiments demonstrate +that Hand1000 significantly outperforms existing models in producing +anatomically correct hand images while faithfully representing other details in +the text, such as faces, clothing, and colors. + +
+
+ comment: Project page https://haozhuo-zhang.github.io/Hand1000-project-page/ +
+
+
+
+
+ + ♻ ☆ MCDubber: Multimodal Context-Aware Expressive Video Dubbing SC2024 + + +
+ Automatic Video Dubbing (AVD) aims to take the given script and generate speech that aligns with lip motion and prosody expressiveness. Current AVD models mainly utilize visual information of the current sentence to enhance the prosody of synthesized speech. However, it is crucial to consider whether the prosody of the generated dubbing aligns with the multimodal context, as the dubbing will be combined with the original context in the final video. This aspect has been overlooked in previous studies. To address this issue, we propose a Multimodal Context-aware video Dubbing model, termed MCDubber, to convert the modeling object from a single sentence to a longer sequence with context information to ensure the consistency of the global context prosody. MCDubber comprises three main components: (1) A context duration aligner aims to learn the context-aware alignment between the text and lip frames; (2) A context prosody predictor seeks to read the global context visual sequence and predict the context-aware global energy and pitch; (3) A context acoustic decoder ultimately predicts the global context mel-spectrogram with the assistance of adjacent ground-truth mel-spectrograms of the target sentence. Through this process, MCDubber fully considers the influence of multimodal context on the prosody expressiveness of the current sentence when dubbing. The extracted mel-spectrogram belonging to the target sentence from the output context mel-spectrograms is the final required dubbing audio. Extensive experiments on the Chem benchmark dataset demonstrate that our MCDubber significantly improves dubbing expressiveness compared to all advanced baselines. The code and demos are available at https://github.com/XiaoYuanJun-zy/MCDubber. +
+
+ comment: Accepted by NCMMSC2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 72 + +
+
+
+ + ☆ Arctic-SnowCoder: Demystifying High-Quality Data in Code Pretraining + + +
+ Recent studies have been increasingly demonstrating that high-quality data is +crucial for effective pretraining of language models. However, the precise +definition of "high-quality" remains underexplored. Focusing on the code +domain, we introduce Arctic-SnowCoder-1.3B, a data-efficient base code model +pretrained on 555B tokens through three phases of progressively refined data: +(1) general pretraining with 500B standard-quality code tokens, preprocessed +through basic filtering, deduplication, and decontamination, (2) continued +pretraining with 50B high-quality tokens, selected from phase one by a +BERT-style quality annotator trained to distinguish good code from random data, +using positive examples drawn from high-quality code files, along with +instruction data from Magicoder and StarCoder2-Instruct, and (3) enhanced +pretraining with 5B synthetic data created by Llama-3.1-70B using phase two +data as seeds, adapting the Magicoder approach for pretraining. Despite being +trained on a limited dataset, Arctic-SnowCoder achieves state-of-the-art +performance on BigCodeBench, a coding benchmark focusing on practical and +challenging programming tasks, compared to similarly sized models trained on no +more than 1T tokens, outperforming Phi-1.5-1.3B by 36%. Across all evaluated +benchmarks, Arctic-SnowCoder-1.3B beats StarCoderBase-3B pretrained on 1T +tokens. Additionally, it matches the performance of leading small base code +models trained on trillions of tokens. For example, Arctic-SnowCoder-1.3B +surpasses StarCoder2-3B, pretrained on over 3.3T tokens, on HumanEval+, a +benchmark that evaluates function-level code generation, and remains +competitive on BigCodeBench. Our evaluation presents a comprehensive analysis +justifying various design choices for Arctic-SnowCoder. Most importantly, we +find that the key to high-quality data is its alignment with the distribution +of downstream applications. + +
+
+
+
+
+ + ☆ Optimal L-Systems for Stochastic L-system Inference Problems + + +
+ This paper presents two novel theorems that address two open problems in stochastic Lindenmayer-system (L-system) inference, specifically focusing on the construction of an optimal stochastic L-system capable of generating a given sequence of strings. The first theorem delineates a method for crafting a stochastic L-system that maximizes the likelihood of producing a given sequence of words through a singular derivation. Furthermore, the second theorem determines the stochastic L-systems with the highest probability of producing a given sequence of words with multiple possible derivations. From these, we introduce an algorithm to infer an optimal stochastic L-system from a given sequence. This algorithm incorporates sophisticated optimization techniques, such as interior point methods, ensuring the production of a stochastically optimal stochastic L-system suitable for generating the given sequence. This allows stochastic L-systems to be used as models for machine learning with only positive data for training. +
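+ To fix ideas, a tiny forward generator for a stochastic L-system is shown below; the inference machinery of the paper (the theorems and the interior-point optimization) is not reproduced.

```python
# A tiny stochastic L-system: each symbol rewrites according to a probability
# distribution over productions. The paper addresses the inverse problem (inferring
# an optimal such system from observed strings); only forward generation is sketched.
import random

RULES = {                      # symbol -> list of (production, probability)
    "A": [("AB", 0.7), ("A", 0.3)],
    "B": [("A", 0.4), ("B", 0.6)],
}

def derive(axiom, steps, seed=0):
    rng = random.Random(seed)
    strings, s = [axiom], axiom
    for _ in range(steps):
        s = "".join(
            rng.choices([p for p, _ in RULES[c]], weights=[w for _, w in RULES[c]])[0]
            if c in RULES else c
            for c in s)
        strings.append(s)
    return strings

print(derive("A", 5))   # a sequence of words drawn from the stochastic L-system
```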
+
+
+
+
+ + ☆ MMLU-Pro+: Evaluating Higher-Order Reasoning and Shortcut Learning in + LLMs + + +
+ Existing benchmarks for large language models (LLMs) increasingly struggle to differentiate between top-performing models, underscoring the need for more challenging evaluation frameworks. We introduce MMLU-Pro+, an enhanced benchmark building upon MMLU-Pro to assess shortcut learning and higher-order reasoning in LLMs. By incorporating questions with multiple correct answers across diverse domains, MMLU-Pro+ tests LLMs' ability to engage in complex reasoning and resist simplistic problem-solving strategies. Our results show that MMLU-Pro+ maintains MMLU-Pro's difficulty while providing a more rigorous test of model discrimination, particularly in multi-correct answer scenarios. We introduce novel metrics like shortcut selection ratio and correct pair identification ratio, offering deeper insights into model behavior and anchoring bias. Evaluations of five state-of-the-art LLMs reveal significant performance gaps, highlighting variations in reasoning abilities and bias susceptibility. We release the dataset and evaluation codes at https://github.com/asgsaeid/mmlu-pro-plus.
+
+
+
+
+ + ☆ Therapy as an NLP Task: Psychologists' Comparison of LLMs and Human + Peers in CBT + + +
+ Wider access to therapeutic care is one of the biggest challenges in mental +health treatment. Due to institutional barriers, some people seeking mental +health support have turned to large language models (LLMs) for personalized +therapy, even though these models are largely unsanctioned and untested. We +investigate the potential and limitations of using LLMs as providers of +evidence-based therapy by using mixed methods clinical metrics. Using HELPERT, +a prompt run on a large language model using the same process and training as a +comparative group of peer counselors, we replicated publicly accessible mental +health conversations rooted in Cognitive Behavioral Therapy (CBT) to compare +session dynamics and counselor's CBT-based behaviors between original peer +support sessions and their reconstructed HELPERT sessions. Two licensed, +CBT-trained clinical psychologists evaluated the sessions using the Cognitive +Therapy Rating Scale and provided qualitative feedback. Our findings show that +the peer sessions are characterized by empathy, small talk, therapeutic +alliance, and shared experiences but often exhibit therapist drift. Conversely, +HELPERT reconstructed sessions exhibit minimal therapist drift and higher +adherence to CBT methods but display a lack of collaboration, empathy, and +cultural understanding. Through CTRS ratings and psychologists' feedback, we +highlight the importance of human-AI collaboration for scalable mental health. +Our work outlines the ethical implication of imparting human-like subjective +qualities to LLMs in therapeutic settings, particularly the risk of deceptive +empathy, which may lead to unrealistic patient expectations and potential harm. + +
+
+
+
+
+ + ☆ Temporal Order Preserved Optimal Transport-based Cross-modal Knowledge + Transfer Learning for ASR + + +
+ Transferring linguistic knowledge from a pretrained language model (PLM) to +an acoustic model has been shown to greatly improve the performance of +automatic speech recognition (ASR). However, due to the heterogeneous feature +distributions in cross-modalities, designing an effective model for feature +alignment and knowledge transfer between linguistic and acoustic sequences +remains a challenging task. Optimal transport (OT), which efficiently measures +probability distribution discrepancies, holds great potential for aligning and +transferring knowledge between acoustic and linguistic modalities. Nonetheless, +the original OT treats acoustic and linguistic feature sequences as two +unordered sets in alignment and neglects temporal order information during OT +coupling estimation. Consequently, a time-consuming pretraining stage is +required to learn a good alignment between the acoustic and linguistic +representations. In this paper, we propose a Temporal Order Preserved OT +(TOT)-based Cross-modal Alignment and Knowledge Transfer (CAKT) (TOT-CAKT) for +ASR. In the TOT-CAKT, local neighboring frames of acoustic sequences are +smoothly mapped to neighboring regions of linguistic sequences, preserving +their temporal order relationship in feature alignment and matching. With the +TOT-CAKT model framework, we conduct Mandarin ASR experiments with a pretrained +Chinese PLM for linguistic knowledge transfer. Our results demonstrate that the +proposed TOT-CAKT significantly improves ASR performance compared to several +state-of-the-art models employing linguistic knowledge transfer, and addresses +the weaknesses of the original OT-based method in sequential feature alignment +for ASR. + +
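+ A generic illustration of entropic optimal transport between an acoustic and a linguistic sequence, with the cost augmented by a temporal-position penalty so couplings roughly preserve order; this is not the exact TOT-CAKT formulation.

```python
# Entropic optimal transport (Sinkhorn) between an acoustic and a linguistic sequence,
# with the cost augmented by a temporal-position penalty so couplings roughly preserve
# order. A generic illustration only, not the exact TOT-CAKT formulation.
import numpy as np

def sinkhorn(C: np.ndarray, eps: float = 0.05, iters: int = 200) -> np.ndarray:
    K = np.exp(-C / eps)
    a = np.full(C.shape[0], 1.0 / C.shape[0])
    b = np.full(C.shape[1], 1.0 / C.shape[1])
    u = np.ones_like(a)
    for _ in range(iters):
        v = b / (K.T @ u)
        u = a / (K @ v)
    return u[:, None] * K * v[None, :]          # transport plan (coupling)

T_a, T_l, d = 50, 12, 32                        # acoustic frames, linguistic tokens, dim
acoustic = np.random.randn(T_a, d); linguistic = np.random.randn(T_l, d)
feat_cost = 1 - (acoustic @ linguistic.T) / (
    np.linalg.norm(acoustic, axis=1, keepdims=True) * np.linalg.norm(linguistic, axis=1))
pos_a = np.linspace(0, 1, T_a)[:, None]; pos_l = np.linspace(0, 1, T_l)[None, :]
temporal_cost = (pos_a - pos_l) ** 2            # penalize order-breaking couplings
plan = sinkhorn(feat_cost + 5.0 * temporal_cost)
print(plan.shape, plan.sum())                   # (50, 12), ~1.0
```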
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ☆ Unforgettable Generalization in Language Models + + +
+ When language models (LMs) are trained to forget (or "unlearn") a skill, how precisely does their behavior change? We study the behavior of transformer LMs in which tasks have been forgotten via fine-tuning on randomized labels. Such LMs learn to generate near-random predictions for individual examples in the "training" set used for forgetting. Across tasks, however, LMs exhibit extreme variability in whether LM predictions change on examples outside the training set. In some tasks (like entailment classification), forgetting generalizes robustly, and causes models to produce uninformative predictions on new task instances; in other tasks (like physical commonsense reasoning and scientific question answering) forgetting affects only the training examples, and models continue to perform the "forgotten" task accurately even for examples very similar to those that appeared in the training set. Dataset difficulty is not predictive of whether a behavior can be forgotten; instead, generalization in forgetting is (weakly) predicted by the confidence of LMs' initial task predictions and the variability of LM representations of training data, with low confidence and low variability both associated with greater generalization. Perhaps most surprisingly, random-label forgetting appears to be somewhat insensitive to the contents of the training set: for example, models trained on science questions with random labels continue to answer other science questions accurately, but begin to produce random labels on entailment classification tasks. Finally, we show that even generalizable forgetting is shallow: linear probes trained on LMs' representations can still perform tasks reliably after forgetting. Our results highlight the difficulty and unpredictability of performing targeted skill removal from models via fine-tuning. +
+
+ comment: 18 pages, 9 figures, published in First Conference on Language + Modeling 2024 +
+
+
+
+
+ + ☆ Visually Grounded Speech Models for Low-resource Languages and Cognitive + Modelling + + +
+ This dissertation examines visually grounded speech (VGS) models that learn +from unlabelled speech paired with images. It focuses on applications for +low-resource languages and understanding human language acquisition. We +introduce a task called visually prompted keyword localisation to detect and +localise keywords in speech using images. We demonstrate the effectiveness of +VGS models in few-shot learning scenarios for low-resource languages like +Yoruba. Additionally, we examine the mutual exclusivity bias in VGS models. Our +monolingual VGS model exhibits this bias, but we found that multilingualism +does not affect the bias in this VGS model similarly to what is observed in +children. + +
+
+ comment: PhD Dissertation +
+
+
+
+
+ + ☆ CRAFT Your Dataset: Task-Specific Synthetic Dataset Generation Through + Corpus Retrieval and Augmentation + + +
+ Building high-quality datasets for specialized tasks is a time-consuming and +resource-intensive process that often requires specialized domain knowledge. We +propose Corpus Retrieval and Augmentation for Fine-Tuning (CRAFT), a method for +generating synthetic datasets, given a small number of user-written few-shots +that demonstrate the task to be performed. Given the few-shot examples, we use +large-scale public web-crawled corpora and similarity-based document retrieval +to find other relevant human-written documents. Lastly, instruction-tuned large +language models (LLMs) augment the retrieved documents into custom-formatted +task samples, which then can be used for fine-tuning. We demonstrate that CRAFT +can efficiently generate large-scale task-specific training datasets for four +diverse tasks: biology question-answering (QA), medicine QA and commonsense QA +as well as summarization. Our experiments show that CRAFT-based models +outperform or achieve comparable performance to general LLMs for QA tasks, +while CRAFT-based summarization models outperform models trained on +human-curated data by 46 preference points. + +
+
+
+
+
+ + ☆ Political DEBATE: Efficient Zero-shot and Few-shot Classifiers for + Political Text + + +
+ Social scientists quickly adopted large language models due to their ability to annotate documents without supervised training, an ability known as zero-shot learning. However, due to their compute demands, cost, and often proprietary nature, these models are often at odds with replication and open science standards. This paper introduces the Political DEBATE (DeBERTa Algorithm for Textual Entailment) language models for zero-shot and few-shot classification of political documents. These models are not only as good as, or better than, state-of-the-art large language models at zero- and few-shot classification, but are orders of magnitude more efficient and completely open source. By training the models on a simple random sample of 10-25 documents, they can outperform supervised classifiers trained on hundreds or thousands of documents and state-of-the-art generative models with complex, engineered prompts. Additionally, we release the PolNLI dataset used to train these models -- a corpus of over 200,000 political documents with highly accurate labels across over 800 classification tasks. +
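+ A zero-shot NLI-style classification call of the kind these models support, using the Hugging Face transformers pipeline; the checkpoint name below is a placeholder NLI model, not necessarily the released Political DEBATE weights.

```python
# Zero-shot classification of a political document with an NLI-style model via the
# Hugging Face pipeline. The checkpoint name is a placeholder -- substitute the
# released Political DEBATE weights (or any DeBERTa NLI model) as appropriate.
from transformers import pipeline

classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-zeroshot-v1",   # placeholder NLI checkpoint
)
text = "The senator introduced a bill to expand rural broadband infrastructure."
labels = ["healthcare", "technology policy", "immigration", "defense"]
result = classifier(text, candidate_labels=labels)
print(list(zip(result["labels"], [round(s, 3) for s in result["scores"]])))
```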
+
+ comment: 26 pages, 5 figures +
+
+
+
+
+ + ☆ Spinning the Golden Thread: Benchmarking Long-Form Generation in + Language Models + + +
+ The abilities of long-context language models (LMs) are often evaluated using +the "Needle-in-a-Haystack" (NIAH) test, which comprises tasks designed to +assess a model's ability to identify specific information ("needle") within +large text sequences ("haystack"). While these benchmarks measure how well +models understand long-context input sequences, they do not effectively gauge +the quality of long-form text generation--a critical aspect for applications +such as design proposals and creative writing. To address this gap, we have +introduced a new long-form text evaluation benchmark, Spinning the Golden +Thread (SGT), which tests models' ability to identify specific events within +generated long text sequences. In this benchmark, we prompt long-context LMs to +create long-form text that must include particular events or constraints and +evaluate their ability to incorporate these elements. We evaluated ten +long-context LMs across four distinct scenarios, three types of prompt +instructions, and two different generation-length settings (16K and 32K). +Although these models perform well on NIAH benchmarks, none demonstrated +satisfactory performance on the Spinning the Golden Thread, raising concerns +about their ability to generate coherent long-form text that follows +instructions. Additionally, as the length of the generated text increases, all +models exhibit a significant drop in performance. + +
+
+
+
+
+ + ☆ OLMoE: Open Mixture-of-Experts Language Models + + +
+ We introduce OLMoE, a fully open, state-of-the-art language model leveraging +sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but +uses only 1B per input token. We pretrain it on 5 trillion tokens and further +adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available +models with similar active parameters, even surpassing larger ones like +Llama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE +training, analyze routing in our model showing high specialization, and +open-source all aspects of our work: model weights, training data, code, and +logs. + +
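+ A minimal token-level top-k mixture-of-experts layer is sketched below to illustrate the sparse-MoE mechanism; OLMoE's actual router configuration, expert sizes, and auxiliary losses are in the open-sourced code.

```python
# Minimal token-level top-k mixture-of-experts feed-forward layer. OLMoE's actual
# router, expert sizes, and load-balancing losses are in its released code; this
# only sketches the core "route each token to a few small experts" mechanism.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TopKMoE(nn.Module):
    def __init__(self, dim: int, num_experts: int = 8, k: int = 2, hidden: int = 256):
        super().__init__()
        self.router = nn.Linear(dim, num_experts)
        self.experts = nn.ModuleList([
            nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
            for _ in range(num_experts)])
        self.k = k

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (tokens, dim). Each token activates only k of the experts.
        gate = F.softmax(self.router(x), dim=-1)              # (tokens, num_experts)
        weights, idx = gate.topk(self.k, dim=-1)              # top-k routing decisions
        weights = weights / weights.sum(dim=-1, keepdim=True)
        out = torch.zeros_like(x)
        for e, expert in enumerate(self.experts):
            sel = (idx == e).any(dim=-1)                      # tokens routed to expert e
            if sel.any():
                w = (weights * (idx == e)).sum(dim=-1, keepdim=True)[sel]
                out[sel] += w * expert(x[sel])
        return out

layer = TopKMoE(dim=64)
print(layer(torch.randn(10, 64)).shape)   # torch.Size([10, 64])
```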
+
+ comment: 61 pages (24 main), 36 figures, 14 tables +
+
+
+
+
+ + ☆ Enhancing Code-Switching Speech Recognition with LID-Based Collaborative + Mixture of Experts Model + + +
+ Due to the inherent difficulty in modeling phonetic similarities across +different languages, code-switching speech recognition presents a formidable +challenge. This study proposes a Collaborative-MoE, a Mixture of Experts (MoE) +model that leverages a collaborative mechanism among expert groups. Initially, +a preceding routing network explicitly learns Language Identification (LID) +tasks and selects experts based on acquired LID weights. This process ensures +robust routing information to the MoE layer, mitigating interference from +diverse language domains on expert network parameter updates. The LID weights +are also employed to facilitate inter-group collaboration, enabling the +integration of language-specific representations. Furthermore, within each +language expert group, a gating network operates unsupervised to foster +collaboration on attributes beyond language. Extensive experiments demonstrate +the efficacy of our approach, achieving significant performance enhancements +compared to alternative methods. Importantly, our method preserves the +efficient inference capabilities characteristic of MoE models without +necessitating additional pre-training. + +
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ☆ BEAVER: An Enterprise Benchmark for Text-to-SQL + + +
+ Existing text-to-SQL benchmarks have largely been constructed using publicly available tables from the web with human-generated tests containing question and SQL statement pairs. They typically show very good results and lead people to think that LLMs are effective at text-to-SQL tasks. In this paper, we apply off-the-shelf LLMs to a benchmark containing enterprise data warehouse data. In this environment, LLMs perform poorly, even when standard prompt engineering and RAG techniques are utilized. As we will show, poor performance is largely due to three characteristics: (1) public LLMs cannot train on enterprise data warehouses because they are largely in the "dark web", (2) schemas of enterprise tables are more complex than the schemas in public data, which makes the SQL-generation task inherently harder, and (3) business-oriented questions are often more complex, requiring joins over multiple tables and aggregations. As a result, we propose a new dataset, BEAVER, sourced from real enterprise data warehouses together with natural language queries and their correct SQL statements which we collected from actual user history. We evaluated this dataset using recent LLMs and demonstrated their poor performance on this task. We hope this dataset will facilitate future researchers building more sophisticated text-to-SQL systems which can do better on this important class of data. +
+
+
+
+
+ + ☆ Foundations of Large Language Model Compression -- Part 1: Weight + Quantization + + +
+ In recent years, compression of large language models (LLMs) has emerged as +an important problem to allow language model deployment on resource-constrained +devices, reduce computational costs, and mitigate the environmental footprint +of large-scale AI infrastructure. In this paper, we present the foundations of +LLM quantization from a convex optimization perspective and propose a +quantization method that builds on these foundations and outperforms previous +methods. Our quantization framework, CVXQ, scales to models containing hundreds +of billions of weight parameters and provides users with the flexibility to +compress models to any specified model size, post-training. A reference +implementation of CVXQ can be obtained from https://github.com/seannz/cvxq. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ FuzzCoder: Byte-level Fuzzing Test via Large Language Model + + +
+ Fuzzing is an important dynamic program analysis technique designed for finding vulnerabilities in complex software. Fuzzing involves presenting a target program with crafted malicious input to cause crashes, buffer overflows, memory errors, and exceptions. Crafting malicious inputs in an efficient manner is a difficult open problem and the best approaches often apply uniform random mutations to pre-existing valid inputs. In this work, we propose to adopt fine-tuned large language models (FuzzCoder) to learn patterns in the input files from successful attacks to guide future fuzzing explorations. Specifically, we develop a framework to leverage the code LLMs to guide the mutation process of inputs in fuzzing. The mutation process is formulated as sequence-to-sequence modeling, where the LLM receives a sequence of bytes and then outputs the mutated byte sequence. FuzzCoder is fine-tuned on the created instruction dataset (Fuzz-Instruct), where the successful fuzzing history is collected from the heuristic fuzzing tool. FuzzCoder can predict mutation locations and strategies in input files to trigger abnormal behaviors of the program. Experimental results show that FuzzCoder based on AFL (American Fuzzy Lop) gains significant improvements in terms of effective proportion of mutation (EPM) and number of crashes (NC) for various input formats including ELF, JPG, MP3, and XML. +
+
+ comment: 11 pages +
+
+
+
+
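+ The sequence-to-sequence view of mutation can be sketched as follows: bytes
+are rendered as hex tokens, handed to a fine-tuned code LLM, and the generated
+tokens are decoded back into a mutated byte string. The generation call is a
+placeholder; the released FuzzCoder models and Fuzz-Instruct data are not used
+here.
+ 
+ def bytes_to_tokens(data: bytes) -> str:
+     """Render raw bytes as whitespace-separated hex tokens."""
+     return " ".join(f"{b:02x}" for b in data)
+ 
+ def tokens_to_bytes(text: str) -> bytes:
+     """Decode hex tokens back into bytes, ignoring malformed tokens."""
+     return bytes(int(tok, 16) for tok in text.split() if len(tok) == 2)
+ 
+ seed = bytes(range(64))                      # stand-in for a prefix of a valid seed file
+ prompt = ("Mutate the byte sequence to trigger abnormal program behavior:\n"
+           + bytes_to_tokens(seed))
+ # mutated = tokens_to_bytes(seq2seq_model.generate(prompt))   # hypothetical model call
+ 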
+ + ☆ Towards Leveraging Large Language Models for Automated Medical Q&A + Evaluation + + +
+ This paper explores the potential of using Large Language Models (LLMs) to +automate the evaluation of responses in medical Question and Answer (Q\&A) +systems, a crucial form of Natural Language Processing. Traditionally, human +evaluation has been indispensable for assessing the quality of these responses. +However, manual evaluation by medical professionals is time-consuming and +costly. Our study examines whether LLMs can reliably replicate human +evaluations by using questions derived from patient data, thereby saving +valuable time for medical experts. While the findings suggest promising +results, further research is needed to address more specific or complex +questions that were beyond the scope of this initial investigation. + +
+
+ comment: 10 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ 3D-LEX v1.0: 3D Lexicons for American Sign Language and Sign Language of + the Netherlands + + +
+ In this work, we present an efficient approach for capturing sign language in +3D, introduce the 3D-LEX v1.0 dataset, and detail a method for semi-automatic +annotation of phonetic properties. Our procedure integrates three motion +capture techniques encompassing high-resolution 3D poses, 3D handshapes, and +depth-aware facial features, and attains an average sampling rate of one sign +every 10 seconds. This includes the time for presenting a sign example, +performing and recording the sign, and archiving the capture. The 3D-LEX +dataset includes 1,000 signs from American Sign Language and an additional +1,000 signs from the Sign Language of the Netherlands. We showcase the dataset +utility by presenting a simple method for generating handshape annotations +directly from 3D-LEX. We produce handshape labels for 1,000 signs from American +Sign Language and evaluate the labels in a sign recognition task. The labels +enhance gloss recognition accuracy by 5% over using no handshape annotations, +and by 1% over expert annotations. Our motion capture data supports in-depth +analysis of sign features and facilitates the generation of 2D projections from +any viewpoint. The 3D-LEX collection has been aligned with existing sign +language benchmarks and linguistic resources, to support studies in 3D-aware +sign language processing. + +
+
+
+
+
+ + ☆ What are the Essential Factors in Crafting Effective Long Context + Multi-Hop Instruction Datasets? Insights and Best Practices + + +
+ Recent advancements in large language models (LLMs) with extended context +windows have significantly improved tasks such as information extraction, +question answering, and complex planning scenarios. In order to achieve success +in long context tasks, a large amount of work has been done to enhance the long +context capabilities of the model through synthetic data. Existing methods +typically utilize the Self-Instruct framework to generate instruction tuning +data for better long context capability improvement. However, our preliminary +experiments indicate that less than 35% of generated samples are multi-hop, and +more than 40% exhibit poor quality, limiting comprehensive understanding and +further research. To improve the quality of synthetic data, we propose the +Multi-agent Interactive Multi-hop Generation (MIMG) framework, incorporating a +Quality Verification Agent, a Single-hop Question Generation Agent, a Multiple +Question Sampling Strategy, and a Multi-hop Question Merger Agent. This +framework improves the data quality, with the proportion of high-quality, +multi-hop, and diverse data exceeding 85%. Furthermore, we systematically +investigate strategies for document selection, question merging, and validation +techniques through extensive experiments across various models. Our findings +show that our synthetic high-quality long-context instruction data +significantly enhances model performance, even surpassing models trained on +larger amounts of human-annotated data. Our code is available at: +https://github.com/WowCZ/LongMIT. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Investigating Expert-in-the-Loop LLM Discourse Patterns for Ancient + Intertextual Analysis + + +
+ This study explores the potential of large language models (LLMs) for
+identifying and examining intertextual relationships within biblical, Koine
+Greek texts. By evaluating the performance of LLMs on various intertextuality
+scenarios, the study demonstrates that these models can detect direct
+quotations, allusions, and echoes between texts. The LLM's ability to generate
+novel intertextual observations and connections highlights its potential to
+uncover new insights. However, the model also struggles with long query
+passages and the inclusion of false intertextual dependencies, emphasizing the
+importance of expert evaluation. The expert-in-the-loop methodology presented
+offers a scalable approach for intertextual research into the complex web of
+intertextuality within and beyond the biblical corpus.
+ 
+
+
+
+
+ + ☆ The Role of Large Language Models in Musicology: Are We Ready to Trust + the Machines? + + +
+ In this work, we explore the use and reliability of Large Language Models
+(LLMs) in musicology. From a discussion with experts and students, we assess
+the current acceptance of and concerns regarding this now-ubiquitous
+technology. We aim to go one step further, proposing a semi-automatic method to
+create an initial benchmark using retrieval-augmented generation models and
+multiple-choice question generation, validated by human experts. Our evaluation
+on 400 human-validated questions shows that current vanilla LLMs are less
+reliable than retrieval-augmented generation from music dictionaries. This
+paper suggests that realizing the potential of LLMs in musicology requires
+musicology-driven research that can specialize LLMs by incorporating accurate
+and reliable domain knowledge.
+ 
+
+
+
+
+ + ☆ AgentRE: An Agent-Based Framework for Navigating Complex Information + Landscapes in Relation Extraction CIKM 2024 + + +
+ The relation extraction (RE) in complex scenarios faces challenges such as +diverse relation types and ambiguous relations between entities within a single +sentence, leading to the poor performance of pure "text-in, text-out" language +models (LMs). To address these challenges, in this paper, we propose an +agent-based RE framework, namely AgentRE, which fully leverages the potential +of large language models (LLMs) including memory, retrieval and reflection, to +achieve RE in complex scenarios. Specifically, three major modules are built in +AgentRE serving as the tools to help the agent acquire and process various +useful information, thereby obtaining improved RE performance. Our extensive +experimental results upon two datasets in English and Chinese demonstrate our +AgentRE's superior performance, especially in low-resource scenarios. +Additionally, the trajectories generated by AgentRE can be refined to construct +a high-quality training dataset incorporating different reasoning methods, +which can be used to fine-tune smaller models. Code is available at +https://github.com/Lightblues/AgentRE. + +
+
+ comment: Accepted by CIKM 2024 +
+
+
+
+
+ + ☆ Towards Generative Class Prompt Learning for Few-shot Visual Recognition BMVC 2024 + + +
+ Although foundational vision-language models (VLMs) have proven to be very
+successful for various semantic discrimination tasks, they still struggle to
+perform faithfully for fine-grained categorization. Moreover, foundational
+models trained on one domain do not generalize well on a different domain
+without fine-tuning. We attribute these to the limitations of the VLM's
+semantic representations and attempt to improve their fine-grained visual
+awareness using generative modeling. Specifically, we propose two novel
+methods: Generative Class Prompt Learning (GCPL) and Contrastive Multi-class
+Prompt Learning (CoMPLe). Utilizing text-to-image diffusion models, GCPL
+significantly improves the visio-linguistic synergy in class embeddings by
+conditioning on few-shot exemplars with learnable class prompts. CoMPLe builds
+on this foundation by introducing a contrastive learning component that
+encourages inter-class separation during the generative optimization process.
+Our empirical results demonstrate that such a generative class prompt learning
+approach substantially outperforms existing methods, offering a better
+alternative for few-shot image recognition challenges. The source code will be
+made available at: https://github.com/soumitri2001/GCPL.
+ 
+
+ comment: Accepted at BMVC 2024 +
+
+
+
+
+ + ☆ Dialogue You Can Trust: Human and AI Perspectives on Generated + Conversations ALT + + +
+ As dialogue systems and chatbots increasingly integrate into everyday +interactions, the need for efficient and accurate evaluation methods becomes +paramount. This study explores the comparative performance of human and AI +assessments across a range of dialogue scenarios, focusing on seven key +performance indicators (KPIs): Coherence, Innovation, Concreteness, Goal +Contribution, Commonsense Contradiction, Incorrect Fact, and Redundancy. +Utilizing the GPT-4o API, we generated a diverse dataset of conversations and +conducted a two-part experimental analysis. In Experiment 1, we evaluated +multi-party conversations on Coherence, Innovation, Concreteness, and Goal +Contribution, revealing that GPT models align closely with human judgments. +Notably, both human and AI evaluators exhibited a tendency towards binary +judgment rather than linear scaling, highlighting a shared challenge in these +assessments. Experiment 2 extended the work of Finch et al. (2023) by focusing +on dyadic dialogues and assessing Commonsense Contradiction, Incorrect Fact, +and Redundancy. The results indicate that while GPT-4o demonstrates strong +performance in maintaining factual accuracy and commonsense reasoning, it still +struggles with reducing redundancy and self-contradiction. Our findings +underscore the potential of GPT models to closely replicate human evaluation in +dialogue systems, while also pointing to areas for improvement. This research +offers valuable insights for advancing the development and implementation of +more refined dialogue evaluation methodologies, contributing to the evolution +of more effective and human-like AI communication tools. + +
+
+ comment: 17 pages, 15 figures, shorter version submitted to 22nd Annual + Workshop of the Australasian Language Technology Association (ALTA'24) +
+
+
+
+
+ + ☆ LASP: Surveying the State-of-the-Art in Large Language Model-Assisted AI + Planning + + +
+ Effective planning is essential for the success of any task, from organizing +a vacation to routing autonomous vehicles and developing corporate strategies. +It involves setting goals, formulating plans, and allocating resources to +achieve them. LLMs are particularly well-suited for automated planning due to +their strong capabilities in commonsense reasoning. They can deduce a sequence +of actions needed to achieve a goal from a given state and identify an +effective course of action. However, it is frequently observed that plans +generated through direct prompting often fail upon execution. Our survey aims +to highlight the existing challenges in planning with language models, focusing +on key areas such as embodied environments, optimal scheduling, competitive and +cooperative games, task decomposition, reasoning, and planning. Through this +study, we explore how LLMs transform AI planning and provide unique insights +into the future of LM-assisted planning. + +
+
+
+
+
+ + ☆ Training on the Benchmark Is Not All You Need + + +
+ The success of Large Language Models (LLMs) relies heavily on the huge amount
+of pre-training data learned in the pre-training phase. The opacity of the
+pre-training process and the training data causes the results of many benchmark
+tests to become unreliable. If any model has been trained on a benchmark test
+set, it can seriously hinder the health of the field. In order to automate and
+efficiently test the capabilities of large language models, numerous mainstream
+benchmarks adopt a multiple-choice format. As swapping the contents of
+multiple-choice options does not affect the meaning of the question itself, we
+propose a simple and effective data leakage detection method based on this
+property. Specifically, we shuffle the contents of the options in the data to
+generate the corresponding derived data sets, and then detect data leakage
+based on the model's log probability distribution over the derived data sets.
+If the maximum of these log probabilities is an outlier, it indicates that the
+data has been leaked. Our method is able to work under black-box conditions
+without access to model training data or weights, effectively identifying data
+leakage from benchmark test sets in model pre-training data, including both
+normal scenarios and complex scenarios where options may have been shuffled
+intentionally or unintentionally. Through experiments based on two LLMs and
+benchmark designs, we demonstrate the effectiveness of our method. In addition,
+we evaluate the degree of data leakage of 31 mainstream open-source LLMs on
+four benchmark datasets, give a ranking of the leaked LLMs for each benchmark,
+and find that the Qwen family of LLMs has the highest degree of data leakage.
+ 
+
+
+
+
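+ A minimal sketch of the shuffling-based detection idea described above: score
+the original option ordering and several shuffled orderings with the model's
+sequence log probability, and flag the item when the original ordering is an
+outlier maximum. The `sequence_logprob` callable is a stand-in for whatever
+interface exposes log probabilities.
+ 
+ import random
+ 
+ def looks_leaked(question: str, options: list, sequence_logprob,
+                  n_shuffles: int = 7, z_thresh: float = 2.0) -> bool:
+     """Flag likely leakage if the original option order is an outlier maximum."""
+     def render(opts):
+         return question + "\n" + "\n".join(
+             f"{label}. {opt}" for label, opt in zip("ABCDEFGH", opts))
+ 
+     orders = [list(options)] + [random.sample(options, len(options))
+                                 for _ in range(n_shuffles)]
+     scores = [sequence_logprob(render(o)) for o in orders]
+     mean = sum(scores) / len(scores)
+     std = (sum((s - mean) ** 2 for s in scores) / len(scores)) ** 0.5 or 1e-9
+     return scores[0] == max(scores) and (scores[0] - mean) / std > z_thresh
+ 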
+ + ☆ LLM-GAN: Construct Generative Adversarial Network Through Large Language + Models For Explainable Fake News Detection + + +
+ Explainable fake news detection predicts the authenticity of news items with
+annotated explanations. Today, Large Language Models (LLMs) are known for their
+powerful natural language understanding and explanation generation abilities.
+However, applying LLMs to explainable fake news detection faces two main
+challenges. Firstly, fake news appears reasonable and could easily mislead
+LLMs, leaving them unable to understand the complex news-faking process.
+Secondly, utilizing LLMs for this task would generate both correct and
+incorrect explanations, which necessitates abundant labor in the loop. In this
+paper, we propose LLM-GAN, a novel framework that utilizes prompting mechanisms
+to enable an LLM to act as both Generator and Detector for realistic fake news
+generation and detection. Our results demonstrate LLM-GAN's effectiveness in
+both prediction performance and explanation quality. We further showcase the
+integration of LLM-GAN into a cloud-native AI platform to provide a better fake
+news detection service in the cloud.
+ 
+
+
+
+
+ + ☆ State-of-the-art Advances of Deep-learning Linguistic Steganalysis + Research + + +
+ With the evolution of generative linguistic steganography techniques,
+conventional steganalysis falls short in robustly quantifying the alterations
+induced by steganography, thereby complicating detection. Consequently, the
+research paradigm has pivoted towards deep-learning-based linguistic
+steganalysis. This study offers a comprehensive review of existing
+contributions and evaluates prevailing developmental trajectories.
+Specifically, we first provide a formalized exposition of the general formulas
+for linguistic steganalysis, while comparing the differences between this field
+and the domain of text classification. Subsequently, we classify the existing
+work into two levels based on vector space mapping and feature extraction
+models, thereby comparing the research motivations, model advantages, and other
+details. A comparative analysis of the experiments is conducted to assess their
+performance. Finally, the challenges faced by this field are discussed, and
+several directions for future development and key issues that urgently need to
+be addressed are proposed.
+ 
+
+ comment: Accepted by 2023 International Conference on Data, Information and + Computing Science +
+
+
+
+
+ + ☆ FC-KAN: Function Combinations in Kolmogorov-Arnold Networks + + +
+ In this paper, we introduce FC-KAN, a Kolmogorov-Arnold Network (KAN) that
+leverages combinations of popular mathematical functions such as B-splines,
+wavelets, and radial basis functions on low-dimensional data through
+element-wise operations. We explore several methods for combining the outputs
+of these functions, including sum, element-wise product, the addition of sum
+and element-wise product, quadratic function representation, and concatenation.
+In our experiments, we compare FC-KAN with a multi-layer perceptron network
+(MLP) and other existing KANs, such as BSRBF-KAN, EfficientKAN, FastKAN, and
+FasterKAN, on the MNIST and Fashion-MNIST datasets. A variant of FC-KAN, which
+uses a combination of outputs from B-splines and Difference of Gaussians (DoG)
+in the form of a quadratic function, outperformed all other models on average
+over 5 independent training runs. We expect that FC-KAN's use of function
+combinations can inform the design of future KANs. Our repository is publicly
+available at: https://github.com/hoangthangta/FC_KAN.
+ 
+
+ comment: 9 pages, 1 figure +
+
+
+
+
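+ To make the combination operators listed above concrete, the toy snippet below
+applies them to two stand-in branch outputs (e.g., a B-spline branch and a DoG
+branch). The exact quadratic form used in FC-KAN is not spelled out in the
+abstract, so the quadratic line is only an assumed example.
+ 
+ import numpy as np
+ 
+ a = np.random.randn(32, 16)   # output of basis-function branch 1 (e.g., B-splines)
+ b = np.random.randn(32, 16)   # output of basis-function branch 2 (e.g., DoG)
+ 
+ combined_sum = a + b                               # sum
+ combined_prod = a * b                              # element-wise product
+ combined_sum_prod = (a + b) + (a * b)              # addition of sum and product
+ combined_quad = a ** 2 + a * b + b ** 2            # one possible quadratic form (assumed)
+ combined_concat = np.concatenate([a, b], axis=-1)  # concatenation
+ 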
+ + ☆ Empirical evidence of Large Language Model's influence on human spoken + communication + + +
+ Artificial Intelligence (AI) agents now interact with billions of humans in
+natural language, thanks to advances in Large Language Models (LLMs) like
+ChatGPT. This raises the question of whether AI has the potential to shape a
+fundamental aspect of human culture: the way we speak. Recent analyses revealed
+that scientific publications already exhibit evidence of AI-specific language.
+But this evidence is inconclusive, since scientists may simply be using AI to
+copy-edit their writing. To explore whether AI has influenced human spoken
+communication, we transcribed and analyzed about 280,000 English-language
+videos of presentations, talks, and speeches from more than 20,000 YouTube
+channels of academic institutions. We find a significant shift in usage trends
+for words distinctively associated with ChatGPT following its release. These
+findings provide the first empirical evidence that humans increasingly imitate
+LLMs in their spoken language. Our results raise societal and policy-relevant
+concerns about the potential of AI to unintentionally reduce linguistic
+diversity, or to be deliberately misused for mass manipulation. They also
+highlight the need for further investigation into the feedback loops between
+machine behavior and human culture.
+ 
+
+
+
+
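+ A simplified sketch of the kind of trend analysis described above: compute the
+relative frequency of a set of candidate words in a batch of transcripts and
+compare values before and after the ChatGPT release. The word list and the toy
+transcripts are illustrative assumptions, not the study's actual lexicon or
+data.
+ 
+ from collections import Counter
+ 
+ CANDIDATE_WORDS = {"delve", "intricate", "pivotal", "boast"}   # illustrative only
+ 
+ def words_per_10k_tokens(transcripts, words=CANDIDATE_WORDS) -> float:
+     """Occurrences of candidate words per 10,000 tokens across transcripts."""
+     hits, total = Counter(), 0
+     for text in transcripts:
+         tokens = text.lower().split()
+         total += len(tokens)
+         hits.update(t for t in tokens if t in words)
+     return 1e4 * sum(hits.values()) / max(total, 1)
+ 
+ before = words_per_10k_tokens(["today we present results on image models"])
+ after = words_per_10k_tokens(["let us delve into these intricate findings"])
+ 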
+ + ☆ Taming CLIP for Fine-grained and Structured Visual Understanding of + Museum Exhibits ECCV 2024 + + +
+ CLIP is a powerful and widely used tool for understanding images in the +context of natural language descriptions to perform nuanced tasks. However, it +does not offer application-specific fine-grained and structured understanding, +due to its generic nature. In this work, we aim to adapt CLIP for fine-grained +and structured -- in the form of tabular data -- visual understanding of museum +exhibits. To facilitate such understanding we (a) collect, curate, and +benchmark a dataset of 200K+ image-table pairs, and (b) develop a method that +allows predicting tabular outputs for input images. Our dataset is the first of +its kind in the public domain. At the same time, the proposed method is novel +in leveraging CLIP's powerful representations for fine-grained and tabular +understanding. The proposed method (MUZE) learns to map CLIP's image embeddings +to the tabular structure by means of a proposed transformer-based parsing +network (parseNet). More specifically, parseNet enables prediction of missing +attribute values while integrating context from known attribute-value pairs for +an input image. We show that this leads to significant improvement in accuracy. +Through exhaustive experiments, we show the effectiveness of the proposed +method on fine-grained and structured understanding of museum exhibits, by +achieving encouraging results in a newly established benchmark. Our dataset and +source-code can be found at: https://github.com/insait-institute/MUZE + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ In Defense of RAG in the Era of Long-Context Language Models + + +
+ Overcoming the limited context windows of early-generation LLMs,
+retrieval-augmented generation (RAG) has been a reliable solution for
+context-based answer generation in the past. Recently, the emergence of
+long-context LLMs allows the models to incorporate much longer text sequences,
+making RAG less attractive. Recent studies show that long-context LLMs
+significantly outperform RAG in long-context applications. Unlike the existing
+works favoring the long-context LLM over RAG, we argue that the extremely long
+context in LLMs suffers from a diminished focus on relevant information and
+leads to potential degradation in answer quality. This paper revisits RAG for
+long-context answer generation. We propose an order-preserve
+retrieval-augmented generation (OP-RAG) mechanism, which significantly improves
+the performance of RAG for long-context question-answer applications. With
+OP-RAG, as the number of retrieved chunks increases, the answer quality
+initially rises, and then declines, forming an inverted U-shaped curve. There
+exist sweet spots where OP-RAG achieves higher answer quality with far fewer
+tokens than a long-context LLM taking the whole context as input. Extensive
+experiments on public benchmarks demonstrate the superiority of our OP-RAG.
+ 
+
+
+
+
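+ The order-preserve mechanism can be sketched in a few lines: select the top-k
+chunks by retrieval score, but concatenate them in their original document
+order rather than by score. The chunk tuple layout below is an illustrative
+assumption, not the paper's implementation.
+ 
+ def op_rag_context(chunks, scores, k):
+     """chunks: list of (position_in_document, text); scores: aligned retrieval scores."""
+     top_k = sorted(range(len(chunks)), key=lambda i: scores[i], reverse=True)[:k]
+     in_doc_order = sorted(top_k, key=lambda i: chunks[i][0])   # preserve document order
+     return "\n\n".join(chunks[i][1] for i in in_doc_order)
+ 
+ chunks = [(0, "Intro ..."), (1, "Method ..."), (2, "Results ..."), (3, "Limitations ...")]
+ scores = [0.2, 0.9, 0.7, 0.4]
+ context = op_rag_context(chunks, scores, k=2)   # "Method ..." then "Results ..."
+ 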
+ + ☆ Interpreting and Improving Large Language Models in Arithmetic + Calculation ICML 2024 + + +
+ Large language models (LLMs) have demonstrated remarkable potential across +numerous applications and have shown an emergent ability to tackle complex +reasoning tasks, such as mathematical computations. However, even for the +simplest arithmetic calculations, the intrinsic mechanisms behind LLMs remain +mysterious, making it challenging to ensure reliability. In this work, we delve +into uncovering a specific mechanism by which LLMs execute calculations. +Through comprehensive experiments, we find that LLMs frequently involve a small +fraction (< 5%) of attention heads, which play a pivotal role in focusing on +operands and operators during calculation processes. Subsequently, the +information from these operands is processed through multi-layer perceptrons +(MLPs), progressively leading to the final solution. These pivotal heads/MLPs, +though identified on a specific dataset, exhibit transferability across +different datasets and even distinct tasks. This insight prompted us to +investigate the potential benefits of selectively fine-tuning these essential +heads/MLPs to boost the LLMs' computational performance. We empirically find +that such precise tuning can yield notable enhancements on mathematical +prowess, without compromising the performance on non-mathematical tasks. Our +work serves as a preliminary exploration into the arithmetic calculation +abilities inherent in LLMs, laying a solid foundation to reveal more intricate +mathematical tasks. + +
+
+ comment: Accepted by ICML 2024 (oral) +
+
+
+
+
+ + ☆ From Yes-Men to Truth-Tellers: Addressing Sycophancy in Large Language + Models with Pinpoint Tuning ICML 2024 + + +
+ Large Language Models (LLMs) tend to prioritize adherence to user prompts
+over providing veracious responses, leading to the sycophancy issue. When
+challenged by users, LLMs tend to admit mistakes and provide inaccurate
+responses even if they initially provided the correct answer. Recent works
+propose to employ supervised fine-tuning (SFT) to mitigate the sycophancy
+issue, but it typically leads to the degeneration of LLMs' general capability.
+To address the challenge, we propose a novel supervised pinpoint tuning (SPT),
+where the region-of-interest modules are tuned for a given objective.
+Specifically, SPT first reveals and verifies a small percentage (<5%) of the
+basic modules which significantly affect a particular behavior of LLMs, i.e.,
+sycophancy. Subsequently, SPT merely fine-tunes these identified modules while
+freezing the rest. To verify the effectiveness of the proposed SPT, we conduct
+comprehensive experiments, demonstrating that SPT significantly mitigates the
+sycophancy issue of LLMs (even better than SFT). Moreover, SPT introduces
+limited or even no side effects on the general capability of LLMs. Our results
+shed light on how to precisely, effectively, and efficiently explain and
+improve the targeted ability of LLMs.
+ 
+
+ comment: Accepted by ICML 2024 +
+
+
+
+
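+ A minimal PyTorch-style sketch of the tuning step: freeze every parameter
+except those inside a given set of pinpointed modules. The module names in the
+commented example are placeholders; how SPT identifies which modules to
+pinpoint is not shown here.
+ 
+ import torch.nn as nn
+ 
+ def pinpoint_tune(model: nn.Module, target_prefixes) -> None:
+     """Keep gradients only for parameters under the named target modules."""
+     for name, param in model.named_parameters():
+         param.requires_grad = any(name.startswith(p) for p in target_prefixes)
+ 
+ # Example with placeholder module names (architecture-dependent):
+ # pinpoint_tune(model, {"model.layers.12.self_attn", "model.layers.18.mlp"})
+ 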
+ + ☆ CTG-KrEW: Generating Synthetic Structured Contextually Correlated + Content by Conditional Tabular GAN with K-Means Clustering and Efficient Word + Embedding + + +
+ Conditional Tabular Generative Adversarial Networks (CTGAN) and their various
+derivatives are attractive for their ability to efficiently and flexibly create
+synthetic tabular data, showcasing strong performance and adaptability.
+However, there are certain critical limitations to such models. The first is
+their inability to preserve the semantic integrity of contextually correlated
+words or phrases. For instance, the skillset in freelancer profiles is one such
+attribute where individual skills are semantically interconnected and
+indicative of specific domain interests or qualifications. The second challenge
+of traditional approaches is that, when applied to generate contextually
+correlated tabular content, besides generating semantically shallow content,
+they consume huge memory resources and CPU time during the training stage. To
+address these problems, we introduce a novel framework, CTGKrEW (Conditional
+Tabular GAN with KMeans Clustering and Word Embedding), which is adept at
+generating realistic synthetic tabular data where attributes are collections of
+semantically and contextually coherent words. CTGKrEW is trained and evaluated
+using a dataset from Upwork, a real-world freelancing platform. Comprehensive
+experiments were conducted to analyze the variability, contextual similarity,
+frequency distribution, and associativity of the generated data, along with
+testing the framework's system feasibility. CTGKrEW also takes around 99\% less
+CPU time and has a 33\% smaller memory footprint than the conventional
+approach. Furthermore, we developed KrEW, a web application to facilitate the
+generation of realistic data containing skill-related information. This
+application, available at https://riyasamanta.github.io/krew.html, is freely
+accessible to both the general public and the research community.
+ 
+
+
+
+
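+ The K-Means-over-word-embeddings component can be illustrated as follows:
+embed each skill phrase, cluster the embeddings, and use the cluster IDs as
+compact categorical codes for the tabular generator. The random embeddings are
+a stand-in for the embedding model actually used by CTG-KrEW.
+ 
+ import numpy as np
+ from sklearn.cluster import KMeans
+ 
+ skills = ["python", "data analysis", "react", "copywriting", "seo", "pandas"]
+ rng = np.random.default_rng(0)
+ embeddings = rng.normal(size=(len(skills), 50))   # stand-in for real word embeddings
+ 
+ kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(embeddings)
+ skill_to_cluster = dict(zip(skills, kmeans.labels_))   # categorical codes for the GAN
+ 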
+ + ☆ Booster: Tackling Harmful Fine-tuing for Large Language Models via + Attenuating Harmful Perturbation + + +
+ The harmful fine-tuning issue \citep{qi2023fine} poses serious safety
+concerns for large language models' fine-tuning-as-a-service. While existing
+defenses \citep{huang2024vaccine,rosati2024representation} have been proposed
+to mitigate the issue, their performance is still far from satisfactory, and
+the root cause of the problem has not been fully uncovered. For the first time
+in the literature, we show in this paper that \textit{harmful perturbation}
+over the model weights is likely the root cause of the alignment being broken
+by harmful fine-tuning. In order to attenuate the negative impact of harmful
+perturbation, we propose an alignment-stage solution, dubbed Booster.
+Technically, along with the original alignment loss, we append a loss
+regularizer in the alignment stage's optimization. The regularizer ensures that
+the model's harmful loss reduction before/after simulated harmful perturbation
+is attenuated, thereby mitigating the subsequent fine-tuning risk. Empirical
+results show that Booster can effectively reduce the harmful score of the
+fine-tuned models while maintaining the performance of downstream tasks. Our
+code is available at \url{https://github.com/git-disl/Booster}.
+ 
+
+
+
+
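+ Read schematically, the augmented alignment objective described above can be
+written as follows, where $h$ is the harmful loss, $\alpha$ the step size of a
+simulated harmful perturbation, and $\lambda$ the regularizer weight. This is
+an interpretation of the abstract's wording, not the paper's exact
+formulation.
+ 
+ \min_{\theta}\; \mathcal{L}_{\mathrm{align}}(\theta)
+ \;+\; \lambda \left( h(\theta) - h\!\left(\theta - \alpha \nabla_{\theta} h(\theta)\right) \right)
+ 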
+ + ☆ Towards Cross-Lingual Explanation of Artwork in Large-scale Vision + Language Models + + +
+ As the performance of Large-scale Vision Language Models (LVLMs) improves,
+they are increasingly capable of responding in multiple languages, and there is
+an expectation that the demand for explanations generated by LVLMs will grow.
+However, the pre-training of the Vision Encoder and the integrated training of
+LLMs with the Vision Encoder are mainly conducted using English training data,
+leaving it uncertain whether LVLMs can fully realize their potential when
+generating explanations in languages other than English. In addition,
+multilingual QA benchmarks that create datasets using machine translation
+suffer from cultural differences and biases, which remain issues for their use
+as evaluation tasks. To address these challenges, this study created an
+extended dataset in multiple languages without relying on machine translation.
+This dataset, which takes into account nuances and country-specific phrases,
+was then used to evaluate the explanation generation abilities of LVLMs.
+Furthermore, this study examined whether Instruction-Tuning in resource-rich
+English improves performance in other languages. Our findings indicate that
+LVLMs perform worse in languages other than English compared to English. In
+addition, we observed that LVLMs struggle to effectively manage the knowledge
+learned from English data.
+ 
+
+
+
+
+ + ☆ AdaComp: Extractive Context Compression with Adaptive Predictor for + Retrieval-Augmented Large Language Models + + +
+ Retrieved documents containing noise will hinder RAG from detecting answer
+clues and make the inference process slow and expensive. Therefore, context
+compression is necessary to enhance its accuracy and efficiency. Existing
+context compression methods use extractive or generative models to retain the
+most query-relevant sentences or apply the information bottleneck theory to
+preserve sufficient information. However, these methods may face issues such as
+over-compression or high computational costs. We observe that the retriever
+often ranks relevant documents at the top, but the exact number of documents
+needed to answer the query is uncertain due to the impact of query complexity
+and retrieval quality: complex queries like multi-hop questions may require
+retaining more documents than simpler queries, and a low-quality retrieval may
+need to rely on more documents to generate accurate outputs. Therefore,
+determining the minimum number of required documents (compression rate) is
+still a challenge for RAG. In this paper, we introduce AdaComp, a low-cost
+extractive context compression method that adaptively determines the
+compression rate based on both query complexity and retrieval quality.
+Specifically, we first annotate the minimum top-k documents necessary for the
+RAG system to answer the current query as the compression rate and then
+construct triplets of the query, retrieved documents, and its compression rate.
+Then, we use this triplet dataset to train a compression-rate predictor.
+Experiments on three QA datasets and one conversational multi-doc QA dataset
+show that AdaComp significantly reduces inference costs while maintaining
+performance nearly identical to uncompressed models, achieving a balance
+between efficiency and performance.
+ 
+
+ comment: 8 pages, 5 figures, code available at + https://anonymous.4open.science/r/AdaComp-8C0C/ +
+
+
+
+
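+ At inference time the adaptive step reduces to: predict a compression rate k
+for the current query and retrieved list, then keep only the top-k documents.
+The `predict_k` callable below is a placeholder for the trained
+compression-rate predictor.
+ 
+ def adaptive_compress(query, retrieved_docs, predict_k):
+     """Keep only the top-k retrieved documents, with k predicted per query."""
+     k = max(1, min(predict_k(query, retrieved_docs), len(retrieved_docs)))
+     return retrieved_docs[:k]   # retriever already ranks relevant documents near the top
+ 
+ docs = ["passage about topic A", "passage about topic B", "unrelated passage"]
+ kept = adaptive_compress("multi-hop question ...", docs, lambda q, d: 2)
+ 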
+ + ☆ An Implementation of Werewolf Agent That does not Truly Trust LLMs + + +
+ Werewolf is an incomplete-information game, and creating a computer agent as
+a player poses several challenges, given the lack of situational understanding
+and of individuality in utterances (e.g., computer agents are not capable of
+characterful utterances or situational lying). We propose a werewolf agent that
+solves some of those difficulties by combining a Large Language Model (LLM) and
+a rule-based algorithm. In particular, our agent uses a rule-based algorithm to
+select an output either from an LLM or a template prepared beforehand, based on
+the results of analyzing conversation history using an LLM. It allows the agent
+to refute in specific situations, identify when to end the conversation, and
+behave with a persona. This approach mitigated conversational inconsistencies
+and facilitated logical utterances as a result. We also conducted a qualitative
+evaluation, which resulted in our agent being perceived as more human-like
+compared to an unmodified LLM. The agent is freely available to help advance
+research in the field of the Werewolf game.
+ 
+
+
+
+
+ + ☆ Benchmarking Cognitive Domains for LLMs: Insights from Taiwanese Hakka + Culture + + +
+ This study introduces a comprehensive benchmark designed to evaluate the +performance of large language models (LLMs) in understanding and processing +cultural knowledge, with a specific focus on Hakka culture as a case study. +Leveraging Bloom's Taxonomy, the study develops a multi-dimensional framework +that systematically assesses LLMs across six cognitive domains: Remembering, +Understanding, Applying, Analyzing, Evaluating, and Creating. This benchmark +extends beyond traditional single-dimensional evaluations by providing a deeper +analysis of LLMs' abilities to handle culturally specific content, ranging from +basic recall of facts to higher-order cognitive tasks such as creative +synthesis. Additionally, the study integrates Retrieval-Augmented Generation +(RAG) technology to address the challenges of minority cultural knowledge +representation in LLMs, demonstrating how RAG enhances the models' performance +by dynamically incorporating relevant external information. The results +highlight the effectiveness of RAG in improving accuracy across all cognitive +domains, particularly in tasks requiring precise retrieval and application of +cultural knowledge. However, the findings also reveal the limitations of RAG in +creative tasks, underscoring the need for further optimization. This benchmark +provides a robust tool for evaluating and comparing LLMs in culturally diverse +contexts, offering valuable insights for future research and development in +AI-driven cultural knowledge preservation and dissemination. + +
+
+ comment: Submitted to O-COCOSDA 2024 +
+
+
+
+
+ + ☆ Self-Instructed Derived Prompt Generation Meets In-Context Learning: + Unlocking New Potential of Black-Box LLMs + + +
+ Large language models (LLMs) have shown success in generating high-quality
+responses. In order to better align LLMs with human preferences, various works
+have been proposed based on specific optimization processes, which, however,
+are not suitable for black-box LLMs like GPT-4, whose parameters are
+inaccessible. In the black-box case, performance is highly dependent on the
+quality of the provided prompts. Existing methods to enhance response quality
+often involve a prompt refinement model, yet these approaches potentially
+suffer from semantic inconsistencies between the refined and original prompts,
+and typically overlook the relationship between them. To address these
+challenges, we introduce a self-instructed in-context learning framework that
+empowers LLMs to deliver more effective responses by generating reliable
+derived prompts to construct informative contextual environments. Our approach
+incorporates a self-instructed reinforcement learning mechanism, enabling
+direct interaction with the response model during derived prompt generation for
+better alignment. We then formulate querying as an in-context learning task,
+using responses from LLMs combined with the derived prompts to establish a
+contextual demonstration for the original prompt. This strategy ensures
+alignment with the original query, reduces discrepancies from refined prompts,
+and maximizes the LLMs' in-context learning capability. Extensive experiments
+demonstrate that the proposed method not only generates more reliable derived
+prompts but also significantly enhances LLMs' ability to deliver more effective
+responses, including black-box models such as GPT-4.
+ 
+
+
+
+
+ + ☆ VoxHakka: A Dialectally Diverse Multi-speaker Text-to-Speech System for + Taiwanese Hakka + + +
+ This paper introduces VoxHakka, a text-to-speech (TTS) system designed for +Taiwanese Hakka, a critically under-resourced language spoken in Taiwan. +Leveraging the YourTTS framework, VoxHakka achieves high naturalness and +accuracy and low real-time factor in speech synthesis while supporting six +distinct Hakka dialects. This is achieved by training the model with +dialect-specific data, allowing for the generation of speaker-aware Hakka +speech. To address the scarcity of publicly available Hakka speech corpora, we +employed a cost-effective approach utilizing a web scraping pipeline coupled +with automatic speech recognition (ASR)-based data cleaning techniques. This +process ensured the acquisition of a high-quality, multi-speaker, multi-dialect +dataset suitable for TTS training. Subjective listening tests conducted using +comparative mean opinion scores (CMOS) demonstrate that VoxHakka significantly +outperforms existing publicly available Hakka TTS systems in terms of +pronunciation accuracy, tone correctness, and overall naturalness. This work +represents a significant advancement in Hakka language technology and provides +a valuable resource for language preservation and revitalization efforts. + +
+
+ comment: Submitted to O-COCOSDA 2024 +
+
+
+
+
+ + ☆ Effective Noise-aware Data Simulation for Domain-adaptive Speech + Enhancement Leveraging Dynamic Stochastic Perturbation + + +
+ Cross-domain speech enhancement (SE) is often faced with severe challenges +due to the scarcity of noise and background information in an unseen target +domain, leading to a mismatch between training and test conditions. This study +puts forward a novel data simulation method to address this issue, leveraging +noise-extractive techniques and generative adversarial networks (GANs) with +only limited target noisy speech data. Notably, our method employs a noise +encoder to extract noise embeddings from target-domain data. These embeddings +aptly guide the generator to synthesize utterances acoustically fitted to the +target domain while authentically preserving the phonetic content of the input +clean speech. Furthermore, we introduce the notion of dynamic stochastic +perturbation, which can inject controlled perturbations into the noise +embeddings during inference, thereby enabling the model to generalize well to +unseen noise conditions. Experiments on the VoiceBank-DEMAND benchmark dataset +demonstrate that our domain-adaptive SE method outperforms an existing strong +baseline based on data simulation. + +
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ☆ It is Time to Develop an Auditing Framework to Promote Value Aware + Chatbots + + +
+ The launch of ChatGPT in November 2022 marked the beginning of a new era in
+AI, the availability of generative AI tools for everyone to use. ChatGPT and
+other similar chatbots boast a wide range of capabilities, from answering
+student homework questions to creating music and art. Given the large amounts
+of human data chatbots are built on, it is inevitable that they will inherit
+human errors and biases. These biases have the potential to inflict significant
+harm on, or increase inequity for, different subpopulations. Because chatbots
+do not have an inherent understanding of societal values, they may create new
+content that is contrary to established norms. Examples of concerning generated
+content include child pornography, inaccurate facts, and discriminatory posts.
+In this position paper, we argue that the speed of advancement of this
+technology requires us, as computer and data scientists, to mobilize and
+develop a values-based auditing framework containing a community-established
+standard set of measurements to monitor the health of different chatbots and
+LLMs. To support our argument, we use a simple audit template to share the
+results of basic audits we conduct that are focused on measuring potential bias
+in search engine style tasks, code generation, and story generation. We
+identify responses from GPT 3.5 and GPT 4 that are both consistent and
+inconsistent with values derived from existing law. While the findings come as
+no surprise, they do underscore the urgency of developing a robust auditing
+framework for openly sharing results in a consistent way so that mitigation
+strategies can be developed by the academic community, government agencies, and
+companies when our values are not being adhered to. We conclude this paper with
+recommendations for value-based strategies for improving the technologies.
+ 
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2306.07500 +
+
+
+
+
+ + ☆ S$^3$c-Math: Spontaneous Step-level Self-correction Makes Large Language + Models Better Mathematical Reasoners + + +
+ Self-correction is a novel method that can stimulate the potential reasoning
+abilities of large language models (LLMs). It involves detecting and correcting
+errors during the inference process when LLMs solve reasoning problems.
+However, recent works do not regard self-correction as a spontaneous and
+intrinsic capability of LLMs. Instead, such correction is achieved through
+post-hoc generation, external knowledge introduction, multi-model
+collaboration, and similar techniques. In this paper, we propose a series of
+mathematical LLMs called S$^3$c-Math, which are able to perform Spontaneous
+Step-level Self-correction for Mathematical reasoning. This capability helps
+LLMs to recognize whether their ongoing inference tends to contain errors and
+simultaneously correct these errors to produce a more reliable response. We
+propose a method that employs a step-level sampling approach to construct
+step-wise self-correction data for achieving this ability. Additionally, we
+implement a training strategy that uses the constructed data to equip LLMs
+with spontaneous step-level self-correction capacities. Our data and methods
+have been demonstrated to be effective across various foundation LLMs,
+consistently showing significant progress in evaluations on GSM8K, MATH, and
+other mathematical benchmarks. To the best of our knowledge, we are the first
+to introduce the spontaneous step-level self-correction ability of LLMs in
+mathematical reasoning.
+ 
+
+
+
+
+ + ♻ ☆ Improving Rare Word Translation With Dictionaries and Attention Masking + + +
+ In machine translation, rare words continue to be a problem for the dominant +encoder-decoder architecture, especially in low-resource and out-of-domain +translation settings. Human translators solve this problem with monolingual or +bilingual dictionaries. In this paper, we propose appending definitions from a +bilingual dictionary to source sentences and using attention masking to link +together rare words with their definitions. We find that including definitions +for rare words improves performance by up to 1.0 BLEU and 1.6 MacroF1. + +
+
+ comment: 11 pages, 3 figures, 3 tables. Accepted at AMTA 2024 +
+
+
+
+
+ + ♻ ☆ Low-Rank Quantization-Aware Training for LLMs + + +
+ Large language models (LLMs) are omnipresent, however their practical +deployment is challenging due to their ever increasing computational and memory +demands. Quantization is one of the most effective ways to make them more +compute and memory efficient. Quantization-aware training (QAT) methods, +generally produce the best quantized performance, however it comes at the cost +of potentially long training time and excessive memory usage, making it +impractical when applying for LLMs. Inspired by parameter-efficient fine-tuning +(PEFT) and low-rank adaptation (LoRA) literature, we propose LR-QAT -- a +lightweight and memory-efficient QAT algorithm for LLMs. LR-QAT employs several +components to save memory without sacrificing predictive performance: (a) +low-rank auxiliary weights that are aware of the quantization grid; (b) a +downcasting operator using fixed-point or double-packed integers and (c) +checkpointing. Unlike most related work, our method (i) is inference-efficient, +leading to no additional overhead compared to traditional PTQ; (ii) can be seen +as a general extended pretraining framework, meaning that the resulting model +can still be utilized for any downstream task afterwards; (iii) can be applied +across a wide range of quantization settings, such as different choices +quantization granularity, activation quantization, and seamlessly combined with +many PTQ techniques. We apply LR-QAT to LLaMA-1/2/3 and Mistral model families +and validate its effectiveness on several downstream tasks. Our method +outperforms common post-training quantization (PTQ) approaches and reaches the +same model performance as full-model QAT at the fraction of its memory usage. +Specifically, we can train a 7B LLM on a single consumer grade GPU with 24GB of +memory. Our source code is available at +https://github.com/qualcomm-ai-research/LR-QAT + +
+
+
+
+
+ + ♻ ☆ Foundation Models for Music: A Survey + + +
+ In recent years, foundation models (FMs) such as large language models (LLMs) +and latent diffusion models (LDMs) have profoundly impacted diverse sectors, +including music. This comprehensive review examines state-of-the-art (SOTA) +pre-trained models and foundation models in music, spanning from representation +learning, generative learning and multimodal learning. We first contextualise +the significance of music in various industries and trace the evolution of AI +in music. By delineating the modalities targeted by foundation models, we +discover many of the music representations are underexplored in FM development. +Then, emphasis is placed on the lack of versatility of previous methods on +diverse music applications, along with the potential of FMs in music +understanding, generation and medical application. By comprehensively exploring +the details of the model pre-training paradigm, architectural choices, +tokenisation, finetuning methodologies and controllability, we emphasise the +important topics that should have been well explored, like instruction tuning +and in-context learning, scaling law and emergent ability, as well as +long-sequence modelling etc. A dedicated section presents insights into music +agents, accompanied by a thorough analysis of datasets and evaluations +essential for pre-training and downstream tasks. Finally, by underscoring the +vital importance of ethical considerations, we advocate that following research +on FM for music should focus more on such issues as interpretability, +transparency, human responsibility, and copyright issues. The paper offers +insights into future challenges and trends on FMs for music, aiming to shape +the trajectory of human-AI collaboration in the music realm. + +
+
+
+
+
+ + ♻ ☆ InkubaLM: A small language model for low-resource African languages + + +
+ High-resource language models often fall short in the African context, where +there is a critical need for models that are efficient, accessible, and locally +relevant, even amidst significant computing and data constraints. This paper +introduces InkubaLM, a small language model with 0.4 billion parameters, which +achieves performance comparable to models with significantly larger parameter +counts and more extensive training data on tasks such as machine translation, +question-answering, AfriMMLU, and the AfriXnli task. Notably, InkubaLM +outperforms many larger models in sentiment analysis and demonstrates +remarkable consistency across multiple languages. This work represents a +pivotal advancement in challenging the conventional paradigm that effective +language models must rely on substantial resources. Our model and datasets are +publicly available at https://huggingface.co/lelapa to encourage research and +development on low-resource languages. + +
+
+
+
+
+ + ♻ ☆ OceanGPT: A Large Language Model for Ocean Science Tasks ACL2024 + + +
+ Ocean science, which delves into the oceans that are reservoirs of life and
+biodiversity, is of great significance given that oceans cover over 70% of our
+planet's surface. Recently, advances in Large Language Models (LLMs) have
+transformed the paradigm in science. Despite the success in other domains,
+current LLMs often fall short in catering to the needs of domain experts like
+oceanographers, and the potential of LLMs for ocean science is under-explored.
+The intrinsic reasons are the immense and intricate nature of ocean data as
+well as the necessity for higher granularity and richness in knowledge. To
+alleviate these issues, we introduce OceanGPT, the first-ever large language
+model in the ocean domain, which is an expert in various ocean science tasks.
+We also propose a novel framework to automatically obtain a large volume of
+ocean domain instruction data, which generates instructions based on
+multi-agent collaboration. Additionally, we construct the first oceanography
+benchmark, OceanBench, to evaluate the capabilities of LLMs in the ocean
+domain. Through comprehensive experiments, OceanGPT not only shows a higher
+level of knowledge expertise for ocean science tasks but also gains
+preliminary embodied intelligence capabilities in ocean technology.
+ 
+
+ comment: ACL2024. Project Website: http://oceangpt.zjukg.cn/ +
+
+
+
+
+ + ♻ ☆ A Survey on Stability of Learning with Limited Labelled Data and its + Sensitivity to the Effects of Randomness + + +
+ Learning with limited labelled data, such as prompting, in-context learning, +fine-tuning, meta-learning or few-shot learning, aims to effectively train a +model using only a small amount of labelled samples. However, these approaches +have been observed to be excessively sensitive to the effects of uncontrolled +randomness caused by non-determinism in the training process. The randomness +negatively affects the stability of the models, leading to large variances in +results across training runs. When such sensitivity is disregarded, it can +unintentionally, but unfortunately also intentionally, create an imaginary +perception of research progress. Recently, this area started to attract +research attention and the number of relevant studies is continuously growing. +In this survey, we provide a comprehensive overview of 415 papers addressing +the effects of randomness on the stability of learning with limited labelled +data. We distinguish between four main tasks addressed in the papers +(investigate/evaluate; determine; mitigate; benchmark/compare/report randomness +effects), providing findings for each one. Furthermore, we identify and discuss +seven challenges and open problems together with possible directions to +facilitate further research. The ultimate goal of this survey is to emphasise +the importance of this growing research area, which so far has not received an +appropriate level of attention, and reveal impactful directions for future +research. + +
+
+ comment: Accepted to ACM Comput. Surv. 2024 +
+
+
+
+
+ + ♻ ☆ Towards Scalable Automated Alignment of LLMs: A Survey + + +
+ Alignment is the most critical step in building large language models (LLMs) +that meet human needs. With the rapid development of LLMs gradually surpassing +human capabilities, traditional alignment methods based on human-annotation are +increasingly unable to meet the scalability demands. Therefore, there is an +urgent need to explore new sources of automated alignment signals and technical +approaches. In this paper, we systematically review the recently emerging +methods of automated alignment, attempting to explore how to achieve effective, +scalable, automated alignment once the capabilities of LLMs exceed those of +humans. Specifically, we categorize existing automated alignment methods into 4 +major categories based on the sources of alignment signals and discuss the +current status and potential development of each category. Additionally, we +explore the underlying mechanisms that enable automated alignment and discuss +the essential factors that make automated alignment technologies feasible and +effective from the fundamental role of alignment. + +
+
+ comment: Paper List: https://github.com/cascip/awesome-auto-alignment +
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: Accepted at Journal of Machine Learning Research. This paper + integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete + story. In this paper, we improve the writing and organization, and also add + conceptual, empirical, and theoretical improvements over the previous work. + V2: small typo fixes and formatting improvements. V3: improvements from + journal revisions +
+
+
+
+
+ + ♻ ☆ A Fundamental Trade-off in Aligned Language Models and its Relation to + Sampling Adaptors + + +
+ The relationship between the quality of a string, as judged by a human
+reader, and its probability $p(\boldsymbol{y})$ under a language model
+undergirds the development of better language models. For example, many popular
+algorithms for sampling from a language model have been conceived with the goal
+of manipulating $p(\boldsymbol{y})$ to place higher probability on strings that
+humans deem of high quality. In this article, we examine the
+probability--quality relationship in language models explicitly aligned to
+human preferences, e.g., through reinforcement learning from human feedback.
+We show that, when sampling corpora from an aligned language model, there
+exists a trade-off between the strings' average reward and average
+log-likelihood under the prior language model, i.e., the same model before
+alignment with human preferences. We provide a formal treatment of this
+phenomenon and demonstrate how a choice of sampling adaptor allows for a
+selection of how much likelihood we exchange for the reward.
+ 
+
+
+
+
+ + ♻ ☆ Correcting misinformation on social media with a large language model + + +
+ Real-world misinformation, often multimodal, can be partially or fully +factual but misleading using diverse tactics like conflating correlation with +causation. Such misinformation is severely understudied, challenging to +address, and harms various social domains, particularly on social media, where +it can spread rapidly. High-quality and timely correction of misinformation +that identifies and explains its (in)accuracies effectively reduces false +beliefs. Despite the wide acceptance of manual correction, it is difficult to +be timely and scalable. While LLMs have versatile capabilities that could +accelerate misinformation correction, they struggle due to a lack of recent +information, a tendency to produce false content, and limitations in addressing +multimodal information. We propose MUSE, an LLM augmented with access to and +credibility evaluation of up-to-date information. By retrieving evidence as +refutations or supporting context, MUSE identifies and explains content +(in)accuracies with references. It conducts multimodal retrieval and interprets +visual content to verify and correct multimodal content. Given the absence of a +comprehensive evaluation approach, we propose 13 dimensions of misinformation +correction quality. Then, fact-checking experts evaluate responses to social +media content that are not presupposed to be misinformation but broadly include +(partially) incorrect and correct posts that may (not) be misleading. Results +demonstrate MUSE's ability to write high-quality responses to potential +misinformation--across modalities, tactics, domains, political leanings, and +for information that has not previously been fact-checked online--within +minutes of its appearance on social media. Overall, MUSE outperforms GPT-4 by +37% and even high-quality responses from laypeople by 29%. Our work provides a +general methodological and evaluative framework to correct misinformation at +scale. + +
+
+ comment: 50 pages +
+
+
+
+
+ + ♻ ☆ NeMo-Aligner: Scalable Toolkit for Efficient Model Alignment + + +
+ Aligning Large Language Models (LLMs) with human values and preferences is
+essential for making them helpful and safe. However, building efficient tools
+to perform alignment can be challenging, especially for the largest and most
+competent LLMs, which often contain tens or hundreds of billions of parameters.
+We create NeMo-Aligner, a toolkit for model alignment that can efficiently
+scale to a thousand GPUs for training the largest open-source LLMs such as
+Nemotron 4 340B and Llama 3.1 405B. NeMo-Aligner comes with highly optimized
+and scalable implementations for major paradigms of model alignment, such as
+Reinforcement Learning from Human Feedback (RLHF), Direct Preference
+Optimization (DPO), SteerLM, and Self-Play Fine-Tuning (SPIN). Additionally,
+our toolkit supports running most of the alignment techniques in a Parameter
+Efficient Fine-Tuning (PEFT) setting. NeMo-Aligner is designed for
+extensibility, allowing support for other alignment techniques with minimal
+effort. It is open-sourced under the Apache 2.0 License, and we invite community
+contributions at https://github.com/NVIDIA/NeMo-Aligner
+
+
+ comment: 16 pages, 4 figures, Accepted to COLM 2024 +
+
+
+
+
+ + ♻ ☆ Squid: Long Context as a New Modality for Energy-Efficient On-Device + Language Models + + +
+ This paper presents Dolphin, a novel decoder-decoder architecture for +energy-efficient processing of long contexts in language models. Our approach +addresses the significant energy consumption and latency challenges inherent in +on-device models. Dolphin employs a compact 0.5B parameter decoder to distill +extensive contextual information into a memory embedding, substantially +reducing the input length for the primary 7B parameter decoder model. Inspired +by vision-language models, we repurpose the image embedding projector to encode +long textual contexts, effectively treating extended context as a distinct +modality. This innovative method enables processing of substantially longer +contexts without the typical computational overhead associated with extended +input sequences. Empirical evaluations demonstrate a 10-fold improvement in +energy efficiency and a 5-fold reduction in latency compared to conventional +full-length context processing methods without losing quality of the response. +Our work contributes to the development of more sustainable and scalable +language models for on-device applications, addressing the critical need for +energy-efficient and responsive AI technologies in resource-constrained +environments while maintaining the accuracy to understand long contexts. This +research has implications for the broader field of natural language processing, +particularly in the domain of efficient model design for resource-limited +settings. By enabling more sophisticated AI capabilities on edge devices, +Dolphin paves the way for advanced language processing in a wide range of +applications where computational resources are at a premium. The Dolphin model +is publicly available at https://huggingface.co/NexaAIDev/Dolphin. + +
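+ A minimal sketch of the decoder-decoder idea (module names, sizes, and the
+pooling scheme below are assumptions for illustration, not the released
+implementation): a small encoder pools the long context into a fixed number of
+memory embeddings, and a projector maps them into the larger decoder's
+embedding space, where they can be prepended to the user query.
+
+   import torch
+   import torch.nn as nn
+
+   class ContextCompressor(nn.Module):
+       """Compress a long context into k memory embeddings for a larger decoder."""
+       def __init__(self, d_small=512, d_large=4096, k=32):
+           super().__init__()
+           layer = nn.TransformerEncoderLayer(d_model=d_small, nhead=8, batch_first=True)
+           self.encoder = nn.TransformerEncoder(layer, num_layers=2)  # stand-in for the small decoder
+           self.queries = nn.Parameter(torch.randn(k, d_small))       # learned slots that pool the context
+           self.projector = nn.Linear(d_small, d_large)               # maps into the large model's embed space
+
+       def forward(self, ctx_emb):                          # ctx_emb: (B, T_long, d_small)
+           h = self.encoder(ctx_emb)
+           attn = torch.softmax(self.queries @ h.transpose(1, 2), dim=-1)  # (B, k, T_long)
+           memory = attn @ h                                               # (B, k, d_small)
+           return self.projector(memory)                                   # (B, k, d_large)
+
+   compressor = ContextCompressor()
+   long_ctx = torch.randn(1, 2048, 512)
+   memory_tokens = compressor(long_ctx)   # prepend these to the main decoder's input embeddings
+   print(memory_tokens.shape)             # torch.Size([1, 32, 4096])
+
+ The key design choice is that the large decoder only ever sees a handful of
+memory tokens instead of the full long context, which is where the claimed
+energy and latency savings come from.
+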
+
+
+
+
+ + ♻ ☆ OccamLLM: Fast and Exact Language Model Arithmetic in a Single Step + + +
+ Despite significant advancements in text generation and reasoning, Large
+Language Models (LLMs) still face challenges in accurately performing complex
+arithmetic operations. Language model systems often enable LLMs to generate
+code for arithmetic operations to achieve accurate calculations. However, this
+approach compromises speed and security, and fine-tuning risks the language
+model losing prior capabilities. We propose a framework that enables exact
+arithmetic in a single autoregressive step, providing faster, more secure, and
+more interpretable LLM systems with arithmetic capabilities. We use the hidden
+states of an LLM to control a symbolic architecture that performs arithmetic.
+Our implementation using Llama 3 with OccamNet as a symbolic model (OccamLlama)
+achieves 100\% accuracy on single arithmetic operations
+($+,-,\times,\div,\sin{},\cos{},\log{},\exp{},\sqrt{}$), outperforming GPT 4o
+with and without a code interpreter. Furthermore, OccamLlama outperforms GPT 4o
+with and without a code interpreter on average across a range of mathematical
+problem solving benchmarks, demonstrating that OccamLLMs can excel in
+arithmetic tasks, even surpassing much larger models. We will make our code
+public shortly.
+
+
+
+
+
+ + ♻ ☆ Flood of Techniques and Drought of Theories: Emotion Mining in Disasters + + +
+ Emotion mining has become a crucial tool for understanding human emotions
+during disasters, leveraging the extensive data generated on social media
+platforms. This paper aims to summarize existing research on emotion mining
+within disaster contexts, highlighting both significant discoveries and
+persistent issues. On the one hand, emotion mining techniques have achieved
+acceptable accuracy, enabling applications such as rapid damage assessment and
+mental health surveillance. On the other hand, with many studies adopting
+data-driven approaches, several methodological issues remain. These include
+arbitrary emotion classification, ignoring biases inherent in data collection
+from social media, such as the overrepresentation of individuals of higher
+socioeconomic status on Twitter, and the lack of application of theoretical
+frameworks like cross-cultural comparisons. These problems can be summarized as
+a notable lack of theory-driven research and a neglect of insights from the
+social and behavioral sciences. This paper underscores the need for
+interdisciplinary collaboration between computer scientists and social
+scientists to develop more robust and theoretically grounded approaches in
+emotion mining. By addressing these gaps, we aim to enhance the effectiveness
+and reliability of emotion mining methodologies, ultimately contributing to
+improved disaster preparedness, response, and recovery.
+ Keywords: emotion mining, sentiment analysis, natural disasters, psychology,
+technological disasters
+
+
+
+
+
+ + ♻ ☆ How Far Are We on the Decision-Making of LLMs? Evaluating LLMs' Gaming + Ability in Multi-Agent Environments + + +
+ Decision-making, a complicated task requiring various types of abilities,
+presents an excellent framework for assessing Large Language Models (LLMs). Our
+research investigates the decision-making capabilities of LLMs through the lens
+of Game Theory. We focus specifically on games that support the simultaneous
+participation of more than two agents. We introduce GAMA($\gamma$)-Bench, which
+evaluates LLMs' Gaming Ability in Multi-Agent environments. $\gamma$-Bench
+includes eight classical multi-agent games and a scoring scheme specially
+designed to quantitatively assess LLMs' performance. Leveraging $\gamma$-Bench,
+we investigate LLMs' robustness, generalizability, and strategies for
+enhancement. Results reveal that while GPT-3.5 shows satisfactory robustness,
+its generalizability is relatively limited. However, its performance can be
+improved through approaches such as Chain-of-Thought. Additionally, we evaluate
+twelve model versions from six families, including GPT-3.5, GPT-4, Gemini,
+LLaMA-3.1, Mixtral, and Qwen-2. We find that Gemini-1.5-Pro outperforms other
+models with a score of $63.8$ out of $100$, followed by LLaMA-3.1-70B and GPT-4
+with scores of $60.9$ and $60.5$, respectively. The code and experimental
+results are made publicly available via https://github.com/CUHK-ARISE/GAMABench.
+
+
+ comment: 11 pages of main text. 20 pages of appendices. 12 figures, 9 tables. + Added models: Gemini-1.5-Pro, LLaMA-3.1-{7, 70, 405}B, Mixtral-8x{7, 22}B, + Qwen-2-72B +
+
+
+
+
+ + ♻ ☆ The Responsible Foundation Model Development Cheatsheet: A Review of + Tools & Resources + + +
+ Foundation model development attracts a rapidly expanding body of
+contributors, scientists, and applications. To help shape responsible
+development practices, we introduce the Foundation Model Development
+Cheatsheet: a growing collection of 250+ tools and resources spanning text,
+vision, and speech modalities. We draw on a large body of prior work to survey
+resources (e.g. software, documentation, frameworks, guides, and practical
+tools) that support informed data selection, processing, and understanding,
+precise and limitation-aware artifact documentation, efficient model training,
+advance awareness of the environmental impact from training, careful model
+evaluation of capabilities, risks, and claims, as well as responsible model
+release, licensing and deployment practices. We hope this curated collection of
+resources helps guide more responsible development. The process of curating
+this list enabled us to review the AI development ecosystem, revealing what
+tools are critically missing, misused, or over-used in existing practices. We
+find that (i) tools for data sourcing, model evaluation, and monitoring are
+critically under-serving ethical and real-world needs, (ii) evaluations for
+model safety, capabilities, and environmental impact all lack reproducibility
+and transparency, (iii) text and particularly English-centric analyses continue
+to dominate over multilingual and multi-modal analyses, and (iv) evaluation of
+systems, rather than just models, is needed so that capabilities and impact are
+assessed in context.
+
+
+
+
+
+ + ♻ ☆ COFFEE: A Contrastive Oracle-Free Framework for Event Extraction ATC + + +
+ Event extraction is a complex information extraction task that involves +extracting events from unstructured text. Prior classification-based methods +require comprehensive entity annotations for joint training, while newer +generation-based methods rely on heuristic templates containing oracle +information such as event type, which is often unavailable in real-world +scenarios. In this study, we consider a more realistic setting of this task, +namely the Oracle-Free Event Extraction (OFEE) task, where only the input +context is given without any oracle information, including event type, event +ontology and trigger word. To solve this task, we propose a new framework, +called COFFEE, which extracts the events solely based on the document context +without referring to any oracle information. In particular, a contrastive +selection model is introduced in COFFEE to rectify the generated triggers and +handle multi-event instances. The proposed COFFEE outperforms state-of-the-art +approaches under the oracle-free setting of the event extraction task, as +evaluated on a public event extraction benchmark ACE05. + +
+
+ comment: Accepted to MATCHING Workshop at ACL 2023 +
+
+
+
+
+ + ♻ ☆ Persian Slang Text Conversion to Formal and Deep Learning of Persian + Short Texts on Social Media for Sentiment Classification + + +
+ The lack of a suitable tool for the analysis of conversational texts in the
+Persian language has made various analyses of these texts, including Sentiment
+Analysis, difficult. In this research, we make these texts easier for machines
+to understand by providing PSC, the Persian Slang Converter, a tool for
+converting conversational texts into formal ones, and by combining PSC with
+state-of-the-art deep learning methods to improve sentiment classification of
+short Persian texts. More than 10 million unlabeled texts from various social
+networks and movie subtitles (as conversational texts) and about 10 million
+news texts (as formal texts) have been used for training unsupervised models
+and the formal implementation of the tool. 60,000 texts from the comments of
+Instagram users with positive, negative, and neutral labels are used as
+supervised data for training the sentiment classification model for short
+texts. Using the converter, 57% of the words in the conversational corpus were
+converted to their formal forms. Finally, by using the formalizer, a FastText
+model, and a deep LSTM network, an accuracy of 81.91% was obtained on the test
+data.
+
+
+ comment: 16 pages, 4 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Preference Learning Algorithms Do Not Learn Preference Rankings + + +
+ Preference learning algorithms (e.g., RLHF and DPO) are frequently used to +steer LLMs to produce generations that are more preferred by humans, but our +understanding of their inner workings is still limited. In this work, we study +the conventional wisdom that preference learning trains models to assign higher +likelihoods to more preferred outputs than less preferred outputs, measured via +$\textit{ranking accuracy}$. Surprisingly, we find that most state-of-the-art +preference-tuned models achieve a ranking accuracy of less than 60% on common +preference datasets. We furthermore derive the $\textit{idealized ranking +accuracy}$ that a preference-tuned LLM would achieve if it optimized the DPO or +RLHF objective perfectly. We demonstrate that existing models exhibit a +significant $\textit{alignment gap}$ -- $\textit{i.e.}$, a gap between the +observed and idealized ranking accuracies. We attribute this discrepancy to the +DPO objective, which is empirically and theoretically ill-suited to fix even +mild ranking errors in the reference model, and derive a simple and efficient +formula for quantifying the difficulty of learning a given preference +datapoint. Finally, we demonstrate that ranking accuracy strongly correlates +with the empirically popular win rate metric when the model is close to the +reference model used in the objective, shedding further light on the +differences between on-policy (e.g., RLHF) and off-policy (e.g., DPO) +preference learning algorithms. + +
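+ Ranking accuracy as defined here is simple to compute from per-pair sequence
+log-likelihoods; a minimal sketch with toy numbers (variable names are
+hypothetical):
+
+   import numpy as np
+
+   def ranking_accuracy(logp_chosen, logp_rejected):
+       """Fraction of preference pairs where the model assigns higher
+       log-likelihood to the human-preferred completion."""
+       logp_chosen = np.asarray(logp_chosen)
+       logp_rejected = np.asarray(logp_rejected)
+       return float(np.mean(logp_chosen > logp_rejected))
+
+   # toy usage: per-pair sequence log-likelihoods from a preference-tuned model
+   print(ranking_accuracy([-12.3, -8.1, -20.4], [-11.9, -9.0, -25.0]))  # ~0.667
+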
+
+
+
+
+ + ♻ ☆ A Voter-Based Stochastic Rejection-Method Framework for Asymptotically + Safe Language Model Outputs + + +
+ This paper proposes a new method for preventing unsafe or otherwise low +quality large language model (LLM) outputs, by leveraging the stochasticity of +LLMs. We propose a system whereby LLM checkers vote on the acceptability of a +generated output, regenerating it if a threshold of disapproval is reached, +until sufficient checkers approve. We further propose estimators for cost and +failure rate, and based on those estimators and experimental data tailored to +the application, we propose an algorithm that achieves a desired failure rate +at the least possible cost. We demonstrate that, under these models, failure +rate decreases exponentially as a function of cost when voter count and +threshold are chosen according to the algorithm, and that the models reasonably +estimate the actual performance of such a system in action, even with limited +data. + +
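+ The voting scheme can be sketched as follows (`generate` and
+`checker_approves` are placeholders for calls to a generator LLM and a checker
+LLM; the voter count and threshold are illustrative):
+
+   import random
+
+   def voted_generation(generate, checker_approves, n_voters=5,
+                        max_disapprovals=2, max_rounds=10):
+       """Regenerate until fewer than `max_disapprovals` checkers reject the output."""
+       for _ in range(max_rounds):
+           candidate = generate()
+           disapprovals = sum(not checker_approves(candidate) for _ in range(n_voters))
+           if disapprovals < max_disapprovals:
+               return candidate
+       return None  # give up: counted as a failure in the cost/failure-rate model
+
+   # toy demo: each checker independently rejects a draft with probability 0.3
+   random.seed(0)
+   result = voted_generation(generate=lambda: "draft",
+                             checker_approves=lambda text: random.random() > 0.3)
+   print(result)
+
+ Under the simplifying assumption of independent checkers with a fixed
+per-round acceptance probability, the failure probability of the whole loop
+decays exponentially with the number of rounds, i.e., with cost, matching the
+qualitative claim above.
+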
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ ARN: Analogical Reasoning on Narratives + + +
+ As a core cognitive skill that enables the transferability of information
+across domains, analogical reasoning has been extensively studied for both
+humans and computational models. However, while cognitive theories of analogy
+often focus on narratives and study the distinction between surface,
+relational, and system similarities, existing work in natural language
+processing has a narrower focus, largely limited to relational analogies
+between word pairs. This gap raises a natural question: can state-of-the-art
+large language models (LLMs) detect system analogies between narratives? To
+gain insight into this question and extend word-based relational analogies to
+relational system analogies, we devise a comprehensive computational framework
+that operationalizes dominant theories of analogy, using narrative elements to
+create surface and system mappings. Leveraging the interplay between these
+mappings, we create a binary task and benchmark for Analogical Reasoning on
+Narratives (ARN), covering four categories of far (cross-domain)/near
+(within-domain) analogies and disanalogies. We show that while all LLMs can
+largely recognize near analogies, even the largest ones struggle with far
+analogies in a zero-shot setting, with GPT4.0 scoring below random. Guiding the
+models through solved examples and chain-of-thought reasoning enhances their
+analogical reasoning ability. Yet, since even in the few-shot setting, the best
+model only performs halfway between random and humans, ARN opens exciting
+directions for computational analogical reasoners.
+
+
+
+
+
+ + ♻ ☆ Zyda: A 1.3T Dataset for Open Language Modeling + + +
+ The size of large language models (LLMs) has scaled dramatically in recent +years and their computational and data requirements have surged +correspondingly. State-of-the-art language models, even at relatively smaller +sizes, typically require training on at least a trillion tokens. This rapid +advancement has eclipsed the growth of open-source datasets available for +large-scale LLM pretraining. In this paper, we introduce Zyda (Zyphra Dataset), +a dataset under a permissive license comprising 1.3 trillion tokens, assembled +by integrating several major respected open-source datasets into a single, +high-quality corpus. We apply rigorous filtering and deduplication processes, +both within and across datasets, to maintain and enhance the quality derived +from the original datasets. Our evaluations show that Zyda not only competes +favorably with other open datasets like Dolma, FineWeb, and RefinedWeb, but +also substantially improves the performance of comparable models from the +Pythia suite. Our rigorous data processing methods significantly enhance Zyda's +effectiveness, outperforming even the best of its constituent datasets when +used independently. + +
+
+
+
+
+ + ♻ ☆ Correction with Backtracking Reduces Hallucination in Summarization + + +
+ Abstractive summarization aims at generating natural language summaries of a
+source document that are succinct while preserving the important elements.
+Despite recent advances, neural text summarization models are known to be
+susceptible to hallucination (or, more accurately, confabulation), that is, to
+producing summaries with details that are not grounded in the source document.
+In this paper, we introduce a simple yet efficient technique, CoBa, to reduce
+hallucination in abstractive summarization. The approach is based on two steps:
+hallucination detection and mitigation. We show that the former can be achieved
+through measuring simple statistics about conditional word probabilities and
+distance to context words. Further, we demonstrate that straightforward
+backtracking is surprisingly effective at mitigation. We thoroughly evaluate
+the proposed method against prior art on three benchmark datasets for text
+summarization. The results show that CoBa is effective and efficient in
+reducing hallucination, and offers great adaptability and flexibility. Code can
+be found at https://github.com/zhenzhel/CoBa.
+
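+ A rough sketch of the detect-then-backtrack loop (the probability threshold
+and the vocabulary-membership test below are crude stand-ins for the paper's
+conditional-probability and context-distance statistics):
+
+   def detect_and_backtrack(tokens, cond_probs, context_vocab, prob_thresh=0.05):
+       """Flag a token as a likely hallucination if it is both low-probability
+       under the model and absent from the source-document vocabulary, then
+       backtrack to just before the first flagged token."""
+       for i, (tok, p) in enumerate(zip(tokens, cond_probs)):
+           if p < prob_thresh and tok.lower() not in context_vocab:
+               return tokens[:i]   # resume decoding here with the flagged token banned
+       return tokens
+
+   summary = ["the", "report", "was", "written", "in", "1987"]
+   probs   = [0.40,  0.22,     0.35,  0.18,      0.30, 0.02]
+   source_vocab = {"the", "report", "was", "written", "in", "2019"}
+   print(detect_and_backtrack(summary, probs, source_vocab))
+   # ['the', 'report', 'was', 'written', 'in']
+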
+
+
+
+
+ + ♻ ☆ Investigating the Robustness of LLMs on Math Word Problems + + +
+ Large Language Models (LLMs) excel at various tasks, including solving math +word problems (MWPs), but struggle with real-world problems containing +irrelevant information. To address this, we propose a prompting framework that +generates adversarial variants of MWPs by adding irrelevant variables. We +introduce a dataset, ProbleMATHIC, containing both adversarial and +non-adversarial MWPs. Our experiments reveal that LLMs are susceptible to +distraction by numerical noise, resulting in an average relative performance +drop of ~26% on adversarial MWPs. To mitigate this, we fine-tune LLMs (Llama-2, +Mistral) on the adversarial samples from our dataset. Fine-tuning on +adversarial training instances improves performance on adversarial MWPs by ~8%, +indicating increased robustness to noise and better ability to identify +relevant data for reasoning. Finally, to assess the generalizability of our +prompting framework, we introduce GSM-8K-Adv, an adversarial variant of the +GSM-8K benchmark. LLMs continue to struggle when faced with adversarial +information, reducing performance by up to ~6%. + +
+
+
+
+
+ + ♻ ☆ A Survey on Responsible Generative AI: What to Generate and What Not + + +
+ In recent years, generative AI (GenAI), like large language models and
+text-to-image models, has received significant attention across various
+domains. However, ensuring the responsible generation of content by these
+models is crucial for their real-world applicability. This raises an
+interesting question: What should responsible GenAI generate, and what should
+it not? To answer the question, this paper investigates the practical
+responsible requirements of both textual and visual generative models,
+outlining five key considerations: generating truthful content, avoiding toxic
+content, refusing harmful instructions, not leaking training data-related
+content, and ensuring that generated content is identifiable. Specifically, we
+review recent advancements and challenges in addressing these requirements. In
+addition, we discuss and emphasize the importance of responsible GenAI across
+healthcare, education, finance, and artificial general intelligence domains.
+Through a unified perspective on both textual and visual generative models,
+this paper aims to provide insights into practical safety-related issues and
+further benefit the community in building responsible GenAI.
+
+
+ comment: 77 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ GPT has become financially literate: Insights from financial literacy + tests of GPT and a preliminary test of how people use it as a source of + advice + + +
+ We assess the ability of GPT -- a large language model -- to serve as a +financial robo-advisor for the masses, by using a financial literacy test. +Davinci and ChatGPT based on GPT-3.5 score 66% and 65% on the financial +literacy test, respectively, compared to a baseline of 33%. However, ChatGPT +based on GPT-4 achieves a near-perfect 99% score, pointing to financial +literacy becoming an emergent ability of state-of-the-art models. We use the +Judge-Advisor System and a savings dilemma to illustrate how researchers might +assess advice-utilization from large language models. We also present a number +of directions for future research. + +
+
+ comment: 43 pages, 2 figures and 2 tables in main text; in V2 added + information that this is the Author Accepted Manuscript version +
+
+
+
+
+ + ♻ ☆ Interpretation of Intracardiac Electrograms Through Textual + Representations + + +
+ Understanding the irregular electrical activity of atrial fibrillation (AFib)
+has been a key challenge in electrocardiography. For serious cases of AFib,
+catheter ablations are performed to collect intracardiac electrograms (EGMs).
+EGMs offer intricately detailed and localized electrical activity of the heart
+and are an ideal modality for interpretable cardiac studies. Recent
+advancements in artificial intelligence (AI) have allowed some works to utilize
+deep learning frameworks to interpret EGMs during AFib. Additionally, language
+models (LMs) have shown exceptional performance in being able to generalize to
+unseen domains, especially in healthcare. In this study, we are the first to
+leverage pretrained LMs, finetuning them for EGM interpolation and AFib
+classification via masked language modeling. We formulate the EGM as a textual
+sequence and present competitive performance on AFib classification compared
+against other representations. Lastly, we provide a comprehensive
+interpretability study to provide a multi-perspective intuition of the model's
+behavior, which could greatly benefit clinical use.
+
+
+ comment: 17 pages, 7 figures; Accepted to CHIL 2024 +
+
+
+
+
+ + ♻ ☆ RLAIF vs. RLHF: Scaling Reinforcement Learning from Human Feedback with + AI Feedback ICML 2024 + + +
+ Reinforcement learning from human feedback (RLHF) has proven effective in +aligning large language models (LLMs) with human preferences, but gathering +high-quality preference labels is expensive. RL from AI Feedback (RLAIF), +introduced in Bai et al., offers a promising alternative that trains the reward +model (RM) on preferences generated by an off-the-shelf LLM. Across the tasks +of summarization, helpful dialogue generation, and harmless dialogue +generation, we show that RLAIF achieves comparable performance to RLHF. +Furthermore, we take a step towards "self-improvement" by demonstrating that +RLAIF can outperform a supervised fine-tuned baseline even when the AI labeler +is the same size as the policy, or even the exact same checkpoint as the +initial policy. Finally, we introduce direct-RLAIF (d-RLAIF) - a technique that +circumvents RM training by obtaining rewards directly from an off-the-shelf LLM +during RL, which achieves superior performance to canonical RLAIF. Our results +suggest that RLAIF can achieve performance on-par with using human feedback, +offering a potential solution to the scalability limitations of RLHF. + +
+
+ comment: Presented at ICML 2024 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 67 + +
+
+
+ + ☆ Coaching a Robotic Sonographer: Learning Robotic Ultrasound with Sparse + Expert's Feedback + + +
+ Ultrasound is widely employed for clinical intervention and diagnosis, due to
+its advantages of offering non-invasive, radiation-free, and real-time imaging.
+However, the accessibility of this dexterous procedure is limited due to the
+substantial training and expertise required of operators. Robotic ultrasound
+(RUS) offers a viable solution to address this limitation; nonetheless,
+achieving human-level proficiency remains challenging. Learning from
+demonstrations (LfD) methods have been explored in RUS; they learn the policy
+prior from a dataset of offline demonstrations to encode the mental model of
+the expert sonographer. However, active engagement of experts, i.e., coaching,
+during the training of RUS has not been explored thus far. Coaching is known
+for enhancing efficiency and performance in human training. This paper proposes
+a coaching framework for RUS to amplify its performance. The framework combines
+DRL (self-supervised practice) with sparse expert feedback through coaching.
+The DRL employs an off-policy Soft Actor-Critic (SAC) network, with a reward
+based on image quality rating. The coaching by experts is modeled as a
+Partially Observable Markov Decision Process (POMDP), which updates the policy
+parameters based on the correction by the expert. The validation study on
+phantoms showed that coaching increases the learning rate by $25\%$ and the
+number of high-quality image acquisitions by $74.5\%$.
+
+
+ comment: Accepted in IEEE Transactions on Medical Robotics and Bionics (TMRB) + 2024 +
+
+
+
+
+ + ☆ What Do You See in Common? Learning Hierarchical Prototypes over + Tree-of-Life to Discover Evolutionary Traits + + +
+ A grand challenge in biology is to discover evolutionary traits - features of +organisms common to a group of species with a shared ancestor in the tree of +life (also referred to as phylogenetic tree). With the growing availability of +image repositories in biology, there is a tremendous opportunity to discover +evolutionary traits directly from images in the form of a hierarchy of +prototypes. However, current prototype-based methods are mostly designed to +operate over a flat structure of classes and face several challenges in +discovering hierarchical prototypes, including the issue of learning +over-specific features at internal nodes. To overcome these challenges, we +introduce the framework of Hierarchy aligned Commonality through Prototypical +Networks (HComP-Net). We empirically show that HComP-Net learns prototypes that +are accurate, semantically consistent, and generalizable to unseen species in +comparison to baselines on birds, butterflies, and fishes datasets. The code +and datasets are available at https://github.com/Imageomics/HComPNet. + +
+
+ comment: 34 pages, 27 figures +
+
+
+
+
+ + ☆ YoloTag: Vision-based Robust UAV Navigation with Fiducial Markers + + +
+ By harnessing fiducial markers as visual landmarks in the environment,
+Unmanned Aerial Vehicles (UAVs) can rapidly build precise maps and navigate
+spaces safely and efficiently, unlocking their potential for fluent
+collaboration and coexistence with humans. Existing fiducial marker methods
+rely on handcrafted feature extraction, which sacrifices accuracy. On the other
+hand, deep learning pipelines for marker detection fail to meet real-time
+runtime constraints crucial for navigation applications. In this work, we
+propose YoloTag, a real-time fiducial marker-based localization system. YoloTag
+uses a lightweight YOLO v8 object detector to accurately detect fiducial
+markers in images while meeting the runtime constraints needed for navigation.
+The detected markers are then used by an efficient perspective-n-point
+algorithm to estimate UAV states. However, this localization system introduces
+noise, causing instability in trajectory tracking. To suppress noise, we design
+a higher-order Butterworth filter that effectively eliminates noise through
+frequency domain analysis. We evaluate our algorithm through real-robot
+experiments in an indoor environment, comparing the trajectory tracking
+performance of our method against other approaches in terms of several distance
+metrics.
+
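+ The noise-suppression step can be sketched with SciPy's standard Butterworth
+design (the sampling rate, cutoff, and filter order below are illustrative
+placeholders, not YoloTag's tuned values):
+
+   import numpy as np
+   from scipy.signal import butter, filtfilt
+
+   def smooth_trajectory(xyz, fs=30.0, cutoff_hz=2.0, order=4):
+       """Zero-phase low-pass filtering of per-axis UAV position estimates."""
+       b, a = butter(order, cutoff_hz / (fs / 2.0), btype="low")
+       return filtfilt(b, a, xyz, axis=0)
+
+   t = np.linspace(0, 10, 300)
+   noisy = np.stack([np.sin(t), np.cos(t), 0.1 * t], axis=1) \
+           + 0.05 * np.random.randn(300, 3)
+   smoothed = smooth_trajectory(noisy)
+   print(smoothed.shape)  # (300, 3)
+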
+
+
+
+
+ + ☆ Visual Servoing for Robotic On-Orbit Servicing: A Survey + + +
+ On-orbit servicing (OOS) activities will power the next big step for +sustainable exploration and commercialization of space. Developing robotic +capabilities for autonomous OOS operations is a priority for the space +industry. Visual Servoing (VS) enables robots to achieve the precise manoeuvres +needed for critical OOS missions by utilizing visual information for motion +control. This article presents an overview of existing VS approaches for +autonomous OOS operations with space manipulator systems (SMS). We divide the +approaches according to their contribution to the typical phases of a robotic +OOS mission: a) Recognition, b) Approach, and c) Contact. We also present a +discussion on the reviewed VS approaches, identifying current trends. Finally, +we highlight the challenges and areas for future research on VS techniques for +robotic OOS. + +
+
+ comment: Accepted for publication at the 2024 International Conference on + Space Robotics (iSpaRo) +
+
+
+
+
+ + ☆ Geometry-aware Feature Matching for Large-Scale Structure from Motion + + +
+ Establishing consistent and dense correspondences across multiple images is +crucial for Structure from Motion (SfM) systems. Significant view changes, such +as air-to-ground with very sparse view overlap, pose an even greater challenge +to the correspondence solvers. We present a novel optimization-based approach +that significantly enhances existing feature matching methods by introducing +geometry cues in addition to color cues. This helps fill gaps when there is +less overlap in large-scale scenarios. Our method formulates geometric +verification as an optimization problem, guiding feature matching within +detector-free methods and using sparse correspondences from detector-based +methods as anchor points. By enforcing geometric constraints via the Sampson +Distance, our approach ensures that the denser correspondences from +detector-free methods are geometrically consistent and more accurate. This +hybrid strategy significantly improves correspondence density and accuracy, +mitigates multi-view inconsistencies, and leads to notable advancements in +camera pose accuracy and point cloud density. It outperforms state-of-the-art +feature matching methods on benchmark datasets and enables feature matching in +challenging extreme large-scale settings. + +
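+ The Sampson distance used for the geometric constraint has a standard closed
+form; a minimal sketch, assuming a known fundamental matrix F and homogeneous
+pixel coordinates (this is the generic formula, not the paper's full
+optimization):
+
+   import numpy as np
+
+   def sampson_distance(F, x1, x2):
+       """First-order approximation of the reprojection error for correspondences
+       x1 <-> x2 (Nx3 homogeneous) under fundamental matrix F (3x3)."""
+       Fx1 = x1 @ F.T       # epipolar lines in image 2
+       Ftx2 = x2 @ F        # epipolar lines in image 1
+       num = np.sum(x2 * Fx1, axis=1) ** 2   # (x2^T F x1)^2
+       den = Fx1[:, 0]**2 + Fx1[:, 1]**2 + Ftx2[:, 0]**2 + Ftx2[:, 1]**2
+       return num / den
+
+   F = np.array([[0, -1e-4, 0.01], [1e-4, 0, -0.02], [-0.01, 0.02, 1.0]])
+   x1 = np.array([[100.0, 120.0, 1.0]])
+   x2 = np.array([[105.0, 118.0, 1.0]])
+   print(sampson_distance(F, x1, x2))
+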
+
+
+
+
+ + ☆ QID$^2$: An Image-Conditioned Diffusion Model for Q-space Up-sampling of + DWI Data MICCAI 2024 + + +
+ We propose an image-conditioned diffusion model to estimate high angular +resolution diffusion weighted imaging (DWI) from a low angular resolution +acquisition. Our model, which we call QID$^2$, takes as input a set of low +angular resolution DWI data and uses this information to estimate the DWI data +associated with a target gradient direction. We leverage a U-Net architecture +with cross-attention to preserve the positional information of the reference +images, further guiding the target image generation. We train and evaluate +QID$^2$ on single-shell DWI samples curated from the Human Connectome Project +(HCP) dataset. Specifically, we sub-sample the HCP gradient directions to +produce low angular resolution DWI data and train QID$^2$ to reconstruct the +missing high angular resolution samples. We compare QID$^2$ with two +state-of-the-art GAN models. Our results demonstrate that QID$^2$ not only +achieves higher-quality generated images, but it consistently outperforms the +GAN models in downstream tensor estimation across multiple metrics. Taken +together, this study highlights the potential of diffusion models, and QID$^2$ +in particular, for q-space up-sampling, thus offering a promising toolkit for +clinical and research applications. + +
+
+ comment: Accepted at MICCAI 2024 International Workshop on Computational + Diffusion MRI. Zijian Chen and Jueqi Wang contributed equally to this work +
+
+
+
+
+ + ☆ Unsupervised Welding Defect Detection Using Audio And Video + + +
+ In this work we explore the application of AI to robotic welding. Robotic +welding is a widely used technology in many industries, but robots currently do +not have the capability to detect welding defects which get introduced due to +various reasons in the welding process. We describe how deep-learning methods +can be applied to detect weld defects in real-time by recording the welding +process with microphones and a camera. Our findings are based on a large +database with more than 4000 welding samples we collected which covers +different weld types, materials and various defect categories. All deep +learning models are trained in an unsupervised fashion because the space of +possible defects is large and the defects in our data may contain biases. We +demonstrate that a reliable real-time detection of most categories of weld +defects is feasible both from audio and video, with improvements achieved by +combining both modalities. Specifically, the multi-modal approach achieves an +average Area-under-ROC-Curve (AUC) of 0.92 over all eleven defect types in our +data. We conclude the paper with an analysis of the results by defect type and +a discussion of future work. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ Biochemical Prostate Cancer Recurrence Prediction: Thinking Fast & Slow + + +
+ Time to biochemical recurrence in prostate cancer is essential for prognostic +monitoring of the progression of patients after prostatectomy, which assesses +the efficacy of the surgery. In this work, we proposed to leverage multiple +instance learning through a two-stage ``thinking fast \& slow'' strategy for +the time to recurrence (TTR) prediction. The first (``thinking fast'') stage +finds the most relevant WSI area for biochemical recurrence and the second +(``thinking slow'') stage leverages higher resolution patches to predict TTR. +Our approach reveals a mean C-index ($Ci$) of 0.733 ($\theta=0.059$) on our +internal validation and $Ci=0.603$ on the LEOPARD challenge validation set. +Post hoc attention visualization shows that the most attentive area contributes +to the TTR prediction. + +
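+ The reported concordance index ($Ci$) is the fraction of comparable patient
+pairs whose predicted risks are ordered consistently with their observed
+recurrence times; a minimal sketch with hypothetical data (censoring and ties
+are handled only crudely here):
+
+   def concordance_index(times, events, risks):
+       """times: follow-up times; events: 1 if recurrence observed; risks: predicted scores.
+       A pair (i, j) is comparable if the earlier time corresponds to an observed event."""
+       concordant, comparable = 0.0, 0
+       n = len(times)
+       for i in range(n):
+           for j in range(n):
+               if events[i] == 1 and times[i] < times[j]:
+                   comparable += 1
+                   if risks[i] > risks[j]:
+                       concordant += 1
+                   elif risks[i] == risks[j]:
+                       concordant += 0.5
+       return concordant / comparable
+
+   print(concordance_index(times=[12, 30, 24, 48], events=[1, 0, 1, 0],
+                           risks=[0.9, 0.2, 0.6, 0.3]))  # 1.0 on this toy example
+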
+
+ comment: 8 pages, 3 figures, methodology paper for LEOPRARD Challenge +
+
+
+
+
+ + ☆ K-Origins: Better Colour Quantification for Neural Networks + + +
+ K-Origins is a neural network layer designed to improve image-based network +performances when learning colour, or intensities, is beneficial. Over 250 +encoder-decoder convolutional networks are trained and tested on 16-bit +synthetic data, demonstrating that K-Origins improves semantic segmentation +accuracy in two scenarios: object detection with low signal-to-noise ratios, +and segmenting multiple objects that are identical in shape but vary in colour. +K-Origins generates output features from the input features, $\textbf{X}$, by +the equation $\textbf{Y}_k = \textbf{X}-\textbf{J}\cdot w_k$ for each trainable +parameter $w_k$, where $\textbf{J}$ is a matrix of ones. Additionally, networks +with varying receptive fields were trained to determine optimal network depths +based on the dimensions of target classes, suggesting that receptive field +lengths should exceed object sizes. By ensuring a sufficient receptive field +length and incorporating K-Origins, we can achieve better semantic network +performance. + +
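+ Since the layer is given explicitly above, a minimal PyTorch rendering of
+$\textbf{Y}_k = \textbf{X}-\textbf{J}\cdot w_k$ is easy to sketch (the number
+of origins and their initialisation are illustrative assumptions):
+
+   import torch
+   import torch.nn as nn
+
+   class KOrigins(nn.Module):
+       """For each trainable origin w_k, emit Y_k = X - J*w_k, i.e. shift every
+       input intensity by a learned reference value, and stack the results."""
+       def __init__(self, num_origins=8):
+           super().__init__()
+           self.w = nn.Parameter(torch.linspace(0.0, 1.0, num_origins))
+
+       def forward(self, x):                      # x: (B, C, H, W)
+           shifts = self.w.view(1, -1, 1, 1, 1)   # one shift per origin
+           y = x.unsqueeze(1) - shifts            # (B, K, C, H, W)
+           return y.flatten(1, 2)                 # concatenate along the channel axis
+
+   layer = KOrigins(num_origins=4)
+   img = torch.rand(2, 1, 32, 32)
+   print(layer(img).shape)                        # torch.Size([2, 4, 32, 32])
+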
+
+ comment: 16 pages, 13 figures, 1 table +
+
+
+
+
+ + ☆ Evaluation and Comparison of Visual Language Models for Transportation + Engineering Problems + + +
+ Recent developments in vision language models (VLM) have shown great
+potential for diverse applications related to image understanding. In this
+study, we have explored state-of-the-art VLM models for vision-based
+transportation engineering tasks such as image classification and object
+detection. The image classification task involves congestion detection and
+crack identification, whereas, for object detection, helmet violations were
+identified. We have applied open-source models such as CLIP, BLIP, OWL-ViT,
+Llava-Next, and the closed-source GPT-4o to evaluate the performance of these
+state-of-the-art VLM models to harness the capabilities of language
+understanding for vision-based transportation tasks. These tasks were performed
+by applying zero-shot prompting to the VLM models, as zero-shot prompting
+involves performing tasks without any training on those tasks. It eliminates
+the need for annotated datasets or fine-tuning for specific tasks. Although
+these models gave results comparable to benchmark Convolutional Neural Network
+(CNN) models on the image classification tasks, they still need improvement on
+object localization tasks. Therefore, this study provides a comprehensive
+evaluation of the state-of-the-art VLM models, highlighting the advantages and
+limitations of the models, which can be taken as the baseline for future
+improvement and wide-scale implementation.
+
+
+
+
+
+ + ☆ ADHD diagnosis based on action characteristics recorded in videos using + machine learning + + +
+ Demand for ADHD diagnosis and treatment is increasing significantly and the +existing services are unable to meet the demand in a timely manner. In this +work, we introduce a novel action recognition method for ADHD diagnosis by +identifying and analysing raw video recordings. Our main contributions include +1) designing and implementing a test focusing on the attention and +hyperactivity/impulsivity of participants, recorded through three cameras; 2) +implementing a novel machine learning ADHD diagnosis system based on action +recognition neural networks for the first time; 3) proposing classification +criteria to provide diagnosis results and analysis of ADHD action +characteristics. + +
+
+ comment: Neuroscience Applied +
+
+
+
+
+ + ☆ Action-Based ADHD Diagnosis in Video + + +
+ Attention Deficit Hyperactivity Disorder (ADHD) causes significant impairment
+in various domains. Early diagnosis and treatment of ADHD could significantly
+improve quality of life and functioning. Recently, machine learning methods
+have improved the accuracy and efficiency of the ADHD diagnosis process.
+However, the costs of the equipment and trained staff required by existing
+methods are generally high. Therefore, we introduce the video-based frame-level
+action recognition network to ADHD diagnosis for the first time. We also record
+a real multi-modal ADHD dataset and extract three action classes from the video
+modality for ADHD diagnosis. The complete process data have been reported to
+the CNTW-NHS Foundation Trust; they will be reviewed by medical
+consultants/professionals and made public in due course.
+
+
+ comment: 31st European Symposium on Artificial Neural Networks +
+
+
+
+
+ + ☆ Optimal L-Systems for Stochastic L-system Inference Problems + + +
+ This paper presents two novel theorems that address two open problems in
+stochastic Lindenmayer-system (L-system) inference, specifically focusing on
+the construction of an optimal stochastic L-system capable of generating a
+given sequence of strings. The first theorem delineates a method for crafting a
+stochastic L-system that maximizes the likelihood of producing a given sequence
+of words through a single derivation. Furthermore, the second theorem
+determines the stochastic L-systems with the highest probability of producing a
+given sequence of words with multiple possible derivations. From these, we
+introduce an algorithm to infer an optimal stochastic L-system from a given
+sequence. This algorithm incorporates sophisticated optimization techniques,
+such as interior point methods, ensuring production of an optimal stochastic
+L-system suitable for generating the given sequence. This allows stochastic
+L-systems to be used as models for machine learning with only positive data for
+training.
+
+
+
+
+
+ + ☆ How to Determine the Preferred Image Distribution of a Black-Box + Vision-Language Model? + + +
+ Large foundation models have revolutionized the field, yet challenges remain +in optimizing multi-modal models for specialized visual tasks. We propose a +novel, generalizable methodology to identify preferred image distributions for +black-box Vision-Language Models (VLMs) by measuring output consistency across +varied input prompts. Applying this to different rendering types of 3D objects, +we demonstrate its efficacy across various domains requiring precise +interpretation of complex structures, with a focus on Computer-Aided Design +(CAD) as an exemplar field. We further refine VLM outputs using in-context +learning with human feedback, significantly enhancing explanation quality. To +address the lack of benchmarks in specialized domains, we introduce CAD-VQA, a +new dataset for evaluating VLMs on CAD-related visual question answering tasks. +Our evaluation of state-of-the-art VLMs on CAD-VQA establishes baseline +performance levels, providing a framework for advancing VLM capabilities in +complex visual reasoning tasks across various fields requiring expert-level +visual interpretation. We release the dataset and evaluation codes at +\url{https://github.com/asgsaeid/cad_vqa}. + +
+
+
+
+
+ + ☆ NoiseAttack: An Evasive Sample-Specific Multi-Targeted Backdoor Attack + Through White Gaussian Noise + + +
+ Backdoor attacks pose a significant threat when using third-party data for
+deep learning development. In these attacks, data can be manipulated to cause a
+trained model to behave improperly when a specific trigger pattern is applied,
+providing the adversary with unauthorized advantages. While most existing works
+focus on designing trigger patterns, both visible and invisible, to poison the
+victim class, they typically result in a single targeted class upon the success
+of the backdoor attack, meaning that the victim class can only be converted to
+another class based on the adversary's predefined value. In this paper, we
+address this issue by introducing a novel sample-specific multi-targeted
+backdoor attack, namely NoiseAttack. Specifically, we adopt White Gaussian
+Noise (WGN) with various Power Spectral Densities (PSD) as our underlying
+triggers, coupled with a unique training strategy to execute the backdoor
+attack. This work is the first of its kind to launch a vision backdoor attack
+with the intent to generate multiple targeted classes with minimal input
+configuration. Furthermore, our extensive experimental results demonstrate that
+NoiseAttack can achieve a high attack success rate against popular network
+architectures and datasets, as well as bypass state-of-the-art backdoor
+detection methods. Our source code and experiments are available at
+https://github.com/SiSL-URI/NoiseAttack/tree/main.
+
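+ The trigger itself is plain white Gaussian noise whose power level selects
+the target class; a rough sketch of poisoning an image this way (the PSD
+values and additive blending are illustrative, not the paper's exact recipe):
+
+   import numpy as np
+
+   def add_wgn_trigger(image, psd_level, seed=0):
+       """Blend zero-mean white Gaussian noise of a chosen power into an image
+       in [0, 1]; different psd_level values act as different backdoor triggers
+       (for white noise the flat PSD is proportional to the variance)."""
+       rng = np.random.default_rng(seed)
+       noise = rng.normal(loc=0.0, scale=np.sqrt(psd_level), size=image.shape)
+       return np.clip(image + noise, 0.0, 1.0)
+
+   clean = np.random.rand(32, 32, 3)
+   poisoned_a = add_wgn_trigger(clean, psd_level=0.01)  # low-power trigger -> target class A
+   poisoned_b = add_wgn_trigger(clean, psd_level=0.05)  # higher-power trigger -> target class B
+   print(poisoned_a.shape, poisoned_b.shape)
+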
+
+
+
+
+ + ☆ A Novel Audio-Visual Information Fusion System for Mental Disorders + Detection + + +
+ Mental disorders are among the foremost contributors to the global healthcare
+challenge. Research indicates that timely diagnosis and intervention are vital
+in treating various mental disorders. However, the early somatization symptoms
+of certain mental disorders may not be immediately evident, often resulting in
+their oversight and misdiagnosis. Additionally, traditional diagnostic methods
+are time-consuming and costly. Deep learning methods based on fMRI and EEG have
+improved the efficiency of the mental disorder detection process. However, the
+costs of the equipment and trained staff are generally high. Moreover, most
+systems are only trained for a specific mental disorder and are not
+general-purpose. Recently, physiological studies have shown that there are some
+speech- and facial-related symptoms in a few mental disorders (e.g., depression
+and ADHD). In this paper, we focus on the emotional expression features of
+mental disorders and introduce a multimodal mental disorder diagnosis system
+based on audio-visual information input. Our proposed system is based on
+spatial-temporal attention networks and innovatively uses a less
+computationally intensive pre-trained audio recognition network to fine-tune
+the video recognition module for better results. We also apply the unified
+system to multiple mental disorders (ADHD and depression) for the first time.
+The proposed system achieves over 80\% accuracy on the real multimodal ADHD
+dataset and achieves state-of-the-art results on the depression dataset AVEC
+2014.
+
+
+ comment: 27th International Conference on Information (FUSION) +
+
+
+
+
+ + ♻ ☆ What makes a face look like a hat: Decoupling low-level and high-level + Visual Properties with Image Triplets ECCV2024 + +
+ In visual decision making, high-level features, such as object categories, +have a strong influence on choice. However, the impact of low-level features on +behavior is less understood partly due to the high correlation between high- +and low-level features in the stimuli presented (e.g., objects of the same +category are more likely to share low-level features). To disentangle these +effects, we propose a method that de-correlates low- and high-level visual +properties in a novel set of stimuli. Our method uses two Convolutional Neural +Networks (CNNs) as candidate models of the ventral visual stream: the CORnet-S +that has high neural predictivity in high-level, IT-like responses and the +VGG-16 that has high neural predictivity in low-level responses. Triplets +(root, image1, image2) of stimuli are parametrized by the level of low- and +high-level similarity of images extracted from the different layers. These +stimuli are then used in a decision-making task where participants are tasked +to choose the most similar-to-the-root image. We found that different networks +show differing abilities to predict the effects of low-versus-high-level +similarity: while CORnet-S outperforms VGG-16 in explaining human choices based +on high-level similarity, VGG-16 outperforms CORnet-S in explaining human +choices based on low-level similarity. Using Brain-Score, we observed that the +behavioral prediction abilities of different layers of these networks +qualitatively corresponded to their ability to explain neural activity at +different levels of the visual hierarchy. In summary, our algorithm for +stimulus set generation enables the study of how different representations in +the visual stream affect high-level cognitive behaviors. + +
+
+ comment: Accepted at Workshop on Human-inspired Computer Vision @ ECCV2024 +
+
+
+
+
+ + ☆ EgoPressure: A Dataset for Hand Pressure and Pose Estimation in + Egocentric Vision + + +
+ Estimating touch contact and pressure in egocentric vision is a central task
+for downstream applications in Augmented Reality and Virtual Reality, as well
+as many robotic applications, because it provides precise physical insights
+into hand-object interaction and object manipulation. However, existing contact
+pressure datasets lack egocentric views and hand poses, which are essential for
+accurate estimation during in-situ operation, both for AR/VR interaction and
+robotic manipulation. In this paper, we introduce EgoPressure, a novel dataset
+of touch contact and pressure interaction from an egocentric perspective,
+complemented with hand pose meshes and fine-grained pressure intensities for
+each contact. The hand poses in our dataset are optimized using our proposed
+multi-view sequence-based method that processes footage from our capture rig of
+8 accurately calibrated RGBD cameras. EgoPressure comprises 5.0 hours of touch
+contact and pressure interaction from 21 participants captured by a moving
+egocentric camera and 7 stationary Kinect cameras, which provided RGB images
+and depth maps at 30 Hz. In addition, we provide baselines for estimating
+pressure with different modalities, which will enable future developments and
+benchmarking on the dataset. Overall, we demonstrate that pressure and hand
+poses are complementary, which supports our intention to better facilitate the
+physical understanding of hand-object interactions in AR/VR and robotics
+research.
+
+
+
+
+
+ + ☆ Visually Grounded Speech Models for Low-resource Languages and Cognitive + Modelling + + +
+ This dissertation examines visually grounded speech (VGS) models that learn +from unlabelled speech paired with images. It focuses on applications for +low-resource languages and understanding human language acquisition. We +introduce a task called visually prompted keyword localisation to detect and +localise keywords in speech using images. We demonstrate the effectiveness of +VGS models in few-shot learning scenarios for low-resource languages like +Yoruba. Additionally, we examine the mutual exclusivity bias in VGS models. Our +monolingual VGS model exhibits this bias, but we found that multilingualism +does not affect the bias in this VGS model similarly to what is observed in +children. + +
+
+ comment: PhD Dissertation +
+
+
+
+
+ + ☆ Unveiling Deep Shadows: A Survey on Image and Video Shadow Detection, + Removal, and Generation in the Era of Deep Learning + + +
+ Shadows are formed when light encounters obstacles, leading to areas of +diminished illumination. In computer vision, shadow detection, removal, and +generation are crucial for enhancing scene understanding, refining image +quality, ensuring visual consistency in video editing, and improving virtual +environments. This paper presents a comprehensive survey of shadow detection, +removal, and generation in images and videos within the deep learning landscape +over the past decade, covering tasks, deep models, datasets, and evaluation +metrics. Our key contributions include a comprehensive survey of shadow +analysis, standardization of experimental comparisons, exploration of the +relationships among model size, speed, and performance, a cross-dataset +generalization study, identification of open issues and future directions, and +provision of publicly available resources to support further research. + +
+
+ comment: Publicly available results, trained models, and evaluation metrics at + https://github.com/xw-hu/Unveiling-Deep-Shadows +
+
+
+
+
+ + ☆ DynOMo: Online Point Tracking by Dynamic Online Monocular Gaussian + Reconstruction + + +
+ Reconstructing scenes and tracking motion are two sides of the same coin.
+Tracking points allows for geometric reconstruction [14], while geometric
+reconstruction of (dynamic) scenes allows for 3D tracking of points over time
+[24, 39]. The latter was recently also exploited for 2D point tracking to
+overcome occlusion ambiguities by lifting tracking directly into 3D [38].
+However, the above approaches either require offline processing or multi-view
+camera setups, both of which are unrealistic for real-world applications like
+robot navigation or mixed reality. We target the challenge of online 2D and 3D
+point tracking from unposed monocular camera input, introducing Dynamic Online
+Monocular Reconstruction (DynOMo). We leverage 3D Gaussian splatting to
+reconstruct dynamic scenes in an online fashion. Our approach extends 3D
+Gaussians to capture new content and object motions while estimating camera
+movements from a single RGB frame. DynOMo stands out by enabling emergence of
+point trajectories through robust image feature reconstruction and a novel
+similarity-enhanced regularization term, without requiring any
+correspondence-level supervision. It sets the first baseline for online point
+tracking with monocular unposed cameras, achieving performance on par with
+existing methods. We aim to inspire the community to advance online point
+tracking and reconstruction, expanding the applicability to diverse real-world
+scenarios.
+
+
+
+
+
+ + ☆ Towards Real-World Adverse Weather Image Restoration: Enhancing + Clearness and Semantics with Vision-Language Models ECCV 2024 + + +
+ This paper addresses the limitations of adverse weather image restoration +approaches trained on synthetic data when applied to real-world scenarios. We +formulate a semi-supervised learning framework employing vision-language models +to enhance restoration performance across diverse adverse weather conditions in +real-world settings. Our approach involves assessing image clearness and +providing semantics using vision-language models on real data, serving as +supervision signals for training restoration models. For clearness enhancement, +we use real-world data, utilizing a dual-step strategy with pseudo-labels +assessed by vision-language models and weather prompt learning. For semantic +enhancement, we integrate real-world data by adjusting weather conditions in +vision-language model descriptions while preserving semantic meaning. +Additionally, we introduce an effective training strategy to bootstrap +restoration performance. Our approach achieves superior results in real-world +adverse weather image restoration, demonstrated through qualitative and +quantitative comparisons with state-of-the-art works. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ LinFusion: 1 GPU, 1 Minute, 16K Image + + +
+ Modern diffusion models, particularly those utilizing a Transformer-based
+UNet for denoising, rely heavily on self-attention operations to manage complex
+spatial relationships, thus achieving impressive generation performance.
+However, this existing paradigm faces significant challenges in generating
+high-resolution visual content due to its quadratic time and memory complexity
+with respect to the number of spatial tokens. To address this limitation, we
+propose a novel linear attention mechanism as an alternative in this paper.
+Specifically, we begin our exploration from recently introduced models with
+linear complexity, e.g., Mamba, Mamba2, and Gated Linear Attention, and
+identify two key features, attention normalization and non-causal inference,
+that enhance high-resolution visual generation performance. Building on these
+insights, we introduce a generalized linear attention paradigm, which serves as
+a low-rank approximation of a wide spectrum of popular linear token mixers. To
+save the training cost and better leverage pre-trained models, we initialize
+our models and distill the knowledge from pre-trained StableDiffusion (SD). We
+find that the distilled model, termed LinFusion, achieves performance on par
+with or superior to the original SD after only modest training, while
+significantly reducing time and memory complexity. Extensive experiments on
+SD-v1.5, SD-v2.1, and SD-XL demonstrate that LinFusion delivers satisfactory
+zero-shot cross-resolution generation performance, generating high-resolution
+images at up to 16K resolution. Moreover, it is highly compatible with
+pre-trained SD components, such as ControlNet and IP-Adapter, requiring no
+adaptation efforts. Codes are available at https://github.com/Huage001/LinFusion.
+
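+ The underlying idea can be sketched with a generic non-causal kernelized
+linear attention (this is the textbook form, not LinFusion's exact generalized
+paradigm or normalization):
+
+   import torch
+
+   def linear_attention(q, k, v, eps=1e-6):
+       """Non-causal linear attention: O(N d^2) instead of O(N^2 d).
+       q, k, v: (B, N, d); feature map phi = elu(x) + 1 keeps values positive."""
+       phi_q = torch.nn.functional.elu(q) + 1.0
+       phi_k = torch.nn.functional.elu(k) + 1.0
+       kv = torch.einsum("bnd,bne->bde", phi_k, v)          # sum_n phi(k_n) v_n^T
+       z = torch.einsum("bnd,bd->bn", phi_q, phi_k.sum(1))  # normalization term
+       out = torch.einsum("bnd,bde->bne", phi_q, kv) / (z.unsqueeze(-1) + eps)
+       return out
+
+   q = k = v = torch.randn(1, 4096, 64)    # 4096 spatial tokens
+   print(linear_attention(q, k, v).shape)  # torch.Size([1, 4096, 64])
+
+ Because the cost grows linearly in the number of spatial tokens, this style of
+attention is what makes very high-resolution generation tractable.
+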
+
+ comment: Work in Progress. Codes are available at + https://github.com/Huage001/LinFusion +
+
+
+
+
+ + ☆ DepthCrafter: Generating Consistent Long Depth Sequences for Open-world + Videos + + +
+ Despite significant advancements in monocular depth estimation for static +images, estimating video depth in the open world remains challenging, since +open-world videos are extremely diverse in content, motion, camera movement, +and length. We present DepthCrafter, an innovative method for generating +temporally consistent long depth sequences with intricate details for +open-world videos, without requiring any supplementary information such as +camera poses or optical flow. DepthCrafter achieves generalization ability to +open-world videos by training a video-to-depth model from a pre-trained +image-to-video diffusion model, through our meticulously designed three-stage +training strategy with the compiled paired video-depth datasets. Our training +approach enables the model to generate depth sequences with variable lengths at +one time, up to 110 frames, and harvest both precise depth details and rich +content diversity from realistic and synthetic datasets. We also propose an +inference strategy that processes extremely long videos through segment-wise +estimation and seamless stitching. Comprehensive evaluations on multiple +datasets reveal that DepthCrafter achieves state-of-the-art performance in +open-world video depth estimation under zero-shot settings. Furthermore, +DepthCrafter facilitates various downstream applications, including depth-based +visual effects and conditional video generation. + +
+
+ comment: Project webpage: https://depthcrafter.github.io +
+
+
+
+
+ + ☆ GraspSplats: Efficient Manipulation with 3D Feature Splatting + + +
+ The ability for robots to perform efficient and zero-shot grasping of object +parts is crucial for practical applications and is becoming prevalent with +recent advances in Vision-Language Models (VLMs). To bridge the 2D-to-3D gap +for representations to support such a capability, existing methods rely on +neural fields (NeRFs) via differentiable rendering or point-based projection +methods. However, we demonstrate that NeRFs are inappropriate for scene changes +due to their implicitness and point-based methods are inaccurate for part +localization without rendering-based optimization. To amend these issues, we +propose GraspSplats. Using depth supervision and a novel reference feature +computation method, GraspSplats generates high-quality scene representations in +under 60 seconds. We further validate the advantages of Gaussian-based +representation by showing that the explicit and optimized geometry in +GraspSplats is sufficient to natively support (1) real-time grasp sampling and +(2) dynamic and articulated object manipulation with point trackers. With +extensive experiments on a Franka robot, we demonstrate that GraspSplats +significantly outperforms existing methods under diverse task settings. In +particular, GraspSplats outperforms NeRF-based methods like F3RM and LERF-TOGO, +and 2D detection methods. + +
+
+ comment: Project webpage: https://graspsplats.github.io/ +
+
+
+
+
+ + ☆ Physical Rule-Guided Convolutional Neural Network + + +
+ The black-box nature of Convolutional Neural Networks (CNNs) and their +reliance on large datasets limit their use in complex domains with limited +labeled data. Physics-Guided Neural Networks (PGNNs) have emerged to address +these limitations by integrating scientific principles and real-world +knowledge, enhancing model interpretability and efficiency. This paper proposes +a novel Physics-Guided CNN (PGCNN) architecture that incorporates dynamic, +trainable, automatically LLM-generated, and widely recognized rules into +the model as custom layers to address challenges like limited data and low +confidence scores. The PGCNN is evaluated on multiple datasets, demonstrating +superior performance compared to a baseline CNN model. Key improvements include +a significant reduction in false positives and enhanced confidence scores for +true detections. The results highlight the potential of PGCNNs to improve CNN +performance for broader application areas. + +
+
+
+
+
+ + ♻ ☆ Open-vocabulary Temporal Action Localization using VLMs + + +
+ Video action localization aims to find the timing of a specific action in a +long video. Although existing learning-based approaches have been successful, +they require annotating videos, which comes at a considerable labor cost. This +paper proposes a learning-free, open-vocabulary approach based on emerging +off-the-shelf vision-language models (VLMs). The challenge stems from the fact +that VLMs are neither designed to process long videos nor tailored for finding +actions. We overcome these problems by extending an iterative visual prompting +technique. Specifically, we sample video frames into a concatenated image with +frame index labels, asking a VLM to guess the frame that is closest to the +start/end of the action. Iterating this process while narrowing the sampling +time window gradually localizes the specific start and end frames of the action. +We demonstrate that this sampling technique yields reasonable results, +illustrating a practical extension of VLMs for understanding videos. Sample +code is available at +https://microsoft.github.io/VLM-Video-Action-Localization/. + +
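A rough sketch of the iterative narrowing loop described above. `compose_grid` and `ask_vlm` are hypothetical stand-ins for the frame-tiling utility and the VLM call, and the prompt wording, sample count, and stopping rule are assumptions rather than the paper's exact procedure.

```python
def localize_action(frames, query, ask_vlm, compose_grid, n_samples=9, min_window=8):
    """Iteratively narrow a time window by asking a VLM to pick the frame
    closest to the start of the queried action.

    ask_vlm(image, prompt) -> int and compose_grid(frames, labels) -> image
    are hypothetical helpers standing in for an off-the-shelf VLM and tiling code.
    """
    lo, hi = 0, len(frames) - 1
    while hi - lo > min_window:
        step = max((hi - lo) // (n_samples - 1), 1)
        idxs = list(range(lo, hi + 1, step))[:n_samples]
        grid = compose_grid([frames[i] for i in idxs], labels=list(range(len(idxs))))
        prompt = f"Which numbered frame is closest to the start of: {query}?"
        pick = ask_vlm(grid, prompt)                 # integer label guessed by the VLM
        pick = max(0, min(pick, len(idxs) - 1))
        # Narrow the sampling window around the picked frame and iterate.
        lo = idxs[max(pick - 1, 0)]
        hi = idxs[min(pick + 1, len(idxs) - 1)]
    return lo, hi  # approximate start boundary; the end is found analogously
```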
+
+ comment: 7 pages, 5 figures, 4 tables. Last updated on September 3rd, 2024 +
+
+
+
+
+ + ♻ ☆ A Multiscale Gradient Fusion Method for Edge Detection in Color Images + Utilizing the CBM3D Filter + + +
+ In this paper, a color edge detection strategy based on collaborative +filtering combined with multiscale gradient fusion is proposed. The +block-matching and 3D (BM3D) filter is used to enhance the sparse +representation in the transform domain and achieve a denoising effect, +while multiscale gradient fusion compensates for the loss of detail in +single-scale edge detection and improves edge detection resolution and +quality. First, the RGB images in the dataset are converted to +XYZ color space images through mathematical operations. Second, the color +block-matching and 3D (CBM3D) filter is applied to the sparse images to +remove noise interference. Then, the vector gradients of the color image and +the anisotropic Gaussian directional derivatives at the two scale parameters are +calculated and averaged pixel-by-pixel to obtain a new edge strength map. +Finally, the edge features are enhanced by image normalization and non-maximum +suppression, and on that basis, the edge contour is obtained by +double threshold selection and a new morphological refinement method. Through +an experimental analysis of the edge detection dataset, the proposed method shows +good noise robustness and high edge quality, outperforming Color +Sobel, Color Canny, SE, and Color AGDD as measured by the PR curve, AUC, PSNR, MSE, +and FOM indicators. + +
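A simplified sketch of the two-scale gradient fusion step only: gradient magnitudes from Gaussian derivatives at two scales are averaged pixel-wise and normalized. It omits the CBM3D denoising, the anisotropic directional derivatives, non-maximum suppression, and hysteresis thresholding, and the scale values are assumptions.

```python
import numpy as np
from scipy.ndimage import gaussian_filter

def multiscale_gradient(gray, sigmas=(1.0, 2.0)):
    """Average gradient-magnitude maps computed at two Gaussian scales,
    then normalize the fused edge-strength map to [0, 1]."""
    maps = []
    for s in sigmas:
        gx = gaussian_filter(gray, sigma=s, order=(0, 1))  # d/dx of a Gaussian
        gy = gaussian_filter(gray, sigma=s, order=(1, 0))  # d/dy of a Gaussian
        maps.append(np.hypot(gx, gy))
    fused = np.mean(maps, axis=0)
    return (fused - fused.min()) / (fused.max() - fused.min() + 1e-8)

edge_strength = multiscale_gradient(np.random.rand(128, 128))
print(edge_strength.shape)  # (128, 128)
```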
+
+ comment: 1 figure, 2 tables +
+
+
+
+
+ + ♻ ☆ IBO: Inpainting-Based Occlusion to Enhance Explainable Artificial + Intelligence Evaluation in Histopathology + + +
+ Histopathological image analysis is crucial for accurate cancer diagnosis and +treatment planning. While deep learning models, especially convolutional neural +networks, have advanced this field, their "black-box" nature raises concerns +about interpretability and trustworthiness. Explainable Artificial Intelligence +(XAI) techniques aim to address these concerns, but evaluating their +effectiveness remains challenging. A significant issue with current +occlusion-based XAI methods is that they often generate Out-of-Distribution +(OoD) samples, leading to inaccurate evaluations. In this paper, we introduce +Inpainting-Based Occlusion (IBO), a novel occlusion strategy that utilizes a +Denoising Diffusion Probabilistic Model to inpaint occluded regions in +histopathological images. By replacing cancerous areas with realistic, +non-cancerous tissue, IBO minimizes OoD artifacts and preserves data integrity. +We evaluate our method on the CAMELYON16 dataset through two phases: first, by +assessing perceptual similarity using the Learned Perceptual Image Patch +Similarity (LPIPS) metric, and second, by quantifying the impact on model +predictions through Area Under the Curve (AUC) analysis. Our results +demonstrate that IBO significantly improves perceptual fidelity, achieving +nearly twice the improvement in LPIPS scores compared to the best existing +occlusion strategy. Additionally, IBO increased the precision of XAI +performance prediction from 42% to 71% compared to traditional methods. These +results demonstrate IBO's potential to provide more reliable evaluations of XAI +techniques, benefiting histopathology and other applications. The source code +for this study is available at https://github.com/a-fsh-r/IBO. + +
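For readers unfamiliar with the LPIPS metric used in the first evaluation phase, here is a small usage sketch with the open-source `lpips` package; the image tensors are placeholders and this is not the paper's evaluation pipeline.

```python
import torch
import lpips  # pip install lpips

# LPIPS expects RGB tensors in [-1, 1] with shape (N, 3, H, W).
loss_fn = lpips.LPIPS(net="alex")

original = torch.rand(1, 3, 224, 224) * 2 - 1   # placeholder original patch
inpainted = torch.rand(1, 3, 224, 224) * 2 - 1  # placeholder inpainted (occluded) patch

with torch.no_grad():
    distance = loss_fn(original, inpainted)      # lower = perceptually closer
print(float(distance))
```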
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Learning a Generalized Physical Face Model From Data + + +
+ Physically-based simulation is a powerful approach for 3D facial animation as +the resulting deformations are governed by physical constraints, allowing to +easily resolve self-collisions, respond to external forces and perform +realistic anatomy edits. Today's methods are data-driven, where the actuations +for finite elements are inferred from captured skin geometry. Unfortunately, +these approaches have not been widely adopted due to the complexity of +initializing the material space and learning the deformation model for each +character separately, which often requires a skilled artist followed by lengthy +network training. In this work, we aim to make physics-based facial animation +more accessible by proposing a generalized physical face model that we learn +from a large 3D face dataset. Once trained, our model can be quickly fit to any +unseen identity and produce a ready-to-animate physical face model +automatically. Fitting is as easy as providing a single 3D face scan, or even a +single face image. After fitting, we offer intuitive animation controls, as +well as the ability to retarget animations across characters. All the while, +the resulting animations allow for physical effects like collision avoidance, +gravity, paralysis, bone reshaping and more. + +
+
+
+
+
+ + ♻ ☆ $OC^4-ReID$: Occluded Cloth-Changing Person Re-Identification + + +
+ The study of Cloth-Changing Person Re-identification (CC-ReID) focuses on +retrieving specific pedestrians when their clothing has changed, typically +under the assumption that the entire pedestrian images are visible. Pedestrian +images in real-world scenarios, however, are often partially obscured by +obstacles, presenting a significant challenge to existing CC-ReID systems. In +this paper, we introduce a more challenging task termed Occluded Cloth-Changing +Person Re-Identification ($OC^4-ReID$), which simultaneously addresses the two +challenges of clothing changes and occlusion. Concretely, we construct two new +datasets, Occ-LTCC and Occ-PRCC, based on original CC-ReID datasets to include +random occlusions of key pedestrian components (e.g., head, torso). Moreover, +a novel benchmark is proposed for $OC^4-ReID$, incorporating a Train-Test Micro +Granularity Screening ($T^2MGS$) module to mitigate the influence of occlusion +and a Part-Robust Triplet (PRT) loss for partial feature learning. +Comprehensive experiments on the proposed datasets, as well as on two CC-ReID +benchmark datasets, demonstrate the superior performance of the proposed method +against other state-of-the-art methods. The code and datasets are available +at: https://github.com/1024AILab/OC4-ReID. + +
+
+
+
+
+ + ♻ ☆ Restorer: Removing Multi-Degradation with All-Axis Attention and Prompt + Guidance + + +
+ There are many excellent solutions in image restoration. However, most methods +require training separate models to restore images with different types of +degradation. Although existing all-in-one models effectively address multiple +types of degradation simultaneously, their performance in real-world scenarios +is still constrained by the task confusion problem. In this work, we attempt to +address this issue by introducing Restorer, a novel Transformer-based +all-in-one image restoration model. To effectively address the complex +degradation present in real-world images, we propose All-Axis Attention (AAA), +a mechanism that simultaneously models long-range dependencies across both +spatial and channel dimensions, capturing potential correlations along all +axes. Additionally, we introduce textual prompts in Restorer to incorporate +explicit task priors, enabling the removal of specific degradation types based +on user instructions. By iterating over these prompts, Restorer can handle +composite degradation in real-world scenarios without requiring additional +training. Based on these designs, Restorer with one set of parameters +demonstrates state-of-the-art performance in multiple image restoration tasks +compared to existing all-in-one and even single-task models. Additionally, +Restorer is efficient during inference, suggesting its potential for real-world +applications. + +
+
+
+
+
+ + ♻ ☆ On the Federated Learning Framework for Cooperative Perception + + +
+ Cooperative perception (CP) is essential to enhance the efficiency and safety of +future transportation systems, requiring extensive data sharing among vehicles +on the road, which raises significant privacy concerns. Federated learning +offers a promising solution by enabling data privacy-preserving collaborative +enhancements in perception, decision-making, and planning among connected and +autonomous vehicles (CAVs). However, federated learning is impeded by +significant challenges arising from data heterogeneity across diverse clients, +potentially diminishing model accuracy and prolonging convergence periods. This +study introduces a specialized federated learning framework for CP, termed the +federated dynamic weighted aggregation (FedDWA) algorithm, facilitated by a +dynamic adjusting loss (DALoss) function. This framework employs dynamic client +weighting to direct model convergence and integrates a novel loss function that +utilizes Kullback-Leibler divergence (KLD) to counteract the detrimental +effects of non-independently and identically distributed (Non-IID) and +unbalanced data. Utilizing the BEV transformer as the primary model, our +rigorous testing on the OpenV2V dataset, augmented with FedBEVT data, +demonstrates significant improvements in the average intersection over union +(IoU). These results highlight the substantial potential of our federated +learning framework to address data heterogeneity challenges in CP, thereby +enhancing the accuracy of environmental perception models and facilitating more +robust and efficient collaborative learning solutions in the transportation +sector. + +
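A generic sketch of loss-informed dynamic client weighting for FedAvg-style aggregation, together with a KL-regularized local loss of the kind used to counteract non-IID drift. Both the weighting rule and the KL term are illustrative assumptions standing in for the FedDWA/DALoss definitions in the paper, not the published algorithm.

```python
import torch
import torch.nn.functional as F

def aggregate(client_states, client_losses, temperature=1.0):
    """Weight client models by a softmax over their negated validation losses
    (an assumed proxy for dynamic client weighting)."""
    losses = torch.tensor(client_losses, dtype=torch.float32)
    weights = torch.softmax(-losses / temperature, dim=0)
    global_state = {}
    for key in client_states[0]:
        stacked = torch.stack([s[key].float() for s in client_states], dim=0)
        shape = (-1,) + (1,) * (stacked.dim() - 1)
        global_state[key] = (weights.view(shape) * stacked).sum(dim=0)
    return global_state

def kl_regularized_loss(logits, targets, global_logits, beta=0.1):
    """Task loss plus a KL term pulling local predictions toward the global
    model's predictions (illustrative, not the DALoss formulation)."""
    ce = F.cross_entropy(logits, targets)
    kl = F.kl_div(F.log_softmax(logits, dim=-1),
                  F.softmax(global_logits, dim=-1), reduction="batchmean")
    return ce + beta * kl
```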
+
+ comment: accepted by IEEE RA-L +
+
+
+
+
+ + ♻ ☆ 3DGS.zip: A survey on 3D Gaussian Splatting Compression Methods + + +
+ We present a work-in-progress survey on 3D Gaussian Splatting compression +methods, focusing on their statistical performance across various benchmarks. +This survey aims to facilitate comparability by summarizing key statistics of +different compression approaches in a tabulated format. The datasets evaluated +include TanksAndTemples, MipNeRF360, DeepBlending, and SyntheticNeRF. For each +method, we report the Peak Signal-to-Noise Ratio (PSNR), Structural Similarity +Index (SSIM), Learned Perceptual Image Patch Similarity (LPIPS), and the +resultant size in megabytes (MB), as provided by the respective authors. This +is an ongoing, open project, and we invite contributions from the research +community as GitHub issues or pull requests. Please visit +http://w-m.github.io/3dgs-compression-survey/ for more information and a +sortable version of the table. + +
+
+ comment: 3D Gaussian Splatting compression survey; 3DGS compression; new + approaches added +
+
+
+
+
+ + ♻ ☆ SUMix: Mixup with Semantic and Uncertain Information ECCV2024 + + +
+ Mixup data augmentation approaches have been applied to various deep learning +tasks to improve the generalization ability of deep neural networks. +Some existing approaches, such as CutMix and SaliencyMix, randomly replace a patch in +one image with patches from another to generate the mixed image. Similarly, the +corresponding labels are linearly combined by a fixed ratio $\lambda$. The +objects in the two images may overlap during the mixing process, so some +semantic information is corrupted in the mixed samples. In this case, the mixed +image does not match the mixed label information. Besides, such a label may +mislead the deep learning model training, which results in poor performance. To +solve this problem, we propose a novel approach named SUMix to learn the +mixing ratio as well as the uncertainty for the mixed samples during the +training process. First, we design a learnable similarity function to compute +an accurate mix ratio. Second, we investigate a regularization term to model +the uncertainty of the mixed samples. We conduct experiments on +five image benchmarks, and extensive experimental results imply that our method +is capable of improving the performance of classifiers with different +cutting-based mixup approaches. The source code is available at +https://github.com/JinXins/SUMix. + +
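For context, here is the fixed-ratio CutMix-style mixing the abstract criticizes, shown as a plain baseline; SUMix's learnable similarity function and uncertainty term are not reproduced here.

```python
import numpy as np
import torch

def cutmix(x, y, alpha=1.0):
    """Baseline CutMix: paste a random patch from a shuffled batch and mix
    labels by the fixed area ratio lambda (the behaviour SUMix improves on)."""
    lam = np.random.beta(alpha, alpha)
    perm = torch.randperm(x.size(0))
    _, _, h, w = x.shape
    cut_h, cut_w = int(h * np.sqrt(1 - lam)), int(w * np.sqrt(1 - lam))
    cy, cx = np.random.randint(h), np.random.randint(w)
    y1, y2 = max(cy - cut_h // 2, 0), min(cy + cut_h // 2, h)
    x1, x2 = max(cx - cut_w // 2, 0), min(cx + cut_w // 2, w)
    x[:, :, y1:y2, x1:x2] = x[perm, :, y1:y2, x1:x2]
    lam = 1 - (y2 - y1) * (x2 - x1) / (h * w)   # recompute the exact area ratio
    # Training loss: lam * CE(pred, y) + (1 - lam) * CE(pred, y[perm])
    return x, y, y[perm], lam
```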
+
+ comment: Accepted by ECCV2024 [Camera Ready] (19 pages, 7 figures) with the + source code at https://github.com/JinXins/SUMix +
+
+
+
+
+ + ♻ ☆ SPIdepth: Strengthened Pose Information for Self-supervised Monocular + Depth Estimation + + +
+ Self-supervised monocular depth estimation has garnered considerable +attention for its applications in autonomous driving and robotics. While recent +methods have made strides in leveraging techniques like the Self Query Layer +(SQL) to infer depth from motion, they often overlook the potential of +strengthening pose information. In this paper, we introduce SPIdepth, a novel +approach that prioritizes enhancing the pose network for improved depth +estimation. Building upon the foundation laid by SQL, SPIdepth emphasizes the +importance of pose information in capturing fine-grained scene structures. By +enhancing the pose network's capabilities, SPIdepth achieves remarkable +advancements in scene understanding and depth estimation. Experimental results +on benchmark datasets such as KITTI, Cityscapes, and Make3D showcase SPIdepth's +state-of-the-art performance, surpassing previous methods by significant +margins. Specifically, SPIdepth tops the self-supervised KITTI benchmark. +Additionally, SPIdepth achieves the lowest AbsRel (0.029), SqRel (0.069), and +RMSE (1.394) on KITTI, establishing new state-of-the-art results. On +Cityscapes, SPIdepth shows improvements over SQLdepth of 21.7% in AbsRel, 36.8% +in SqRel, and 16.5% in RMSE, even without using motion masks. On Make3D, +SPIdepth in zero-shot outperforms all other models. Remarkably, SPIdepth +achieves these results using only a single image for inference, surpassing even +methods that utilize video sequences for inference, thus demonstrating its +efficacy and efficiency in real-world applications. Our approach represents a +significant leap forward in self-supervised monocular depth estimation, +underscoring the importance of strengthening pose information for advancing +scene understanding in real-world applications. The code and pre-trained models +are publicly available at https://github.com/Lavreniuk/SPIdepth. + +
+
+
+
+
+ + ♻ ☆ Realigned Softmax Warping for Deep Metric Learning + + +
+ Deep Metric Learning (DML) loss functions traditionally aim to control the +forces of separability and compactness within an embedding space so that +same-class data points are pulled together and different-class ones are pushed +apart. Within the context of DML, a softmax operation will typically normalize +distances into a probability for optimization, thus coupling all the push/pull +forces together. This paper proposes a potential new class of loss functions +that operate within a Euclidean domain and aim to take full advantage of the +coupled forces governing embedding space formation under a softmax. These +forces of compactness and separability can be boosted or mitigated within +controlled locations at will by using a warping function. In this work, we +provide a simple example of a warping function and use it to achieve +competitive, state-of-the-art results on various metric learning benchmarks. + +
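An illustrative proxy-style softmax loss over Euclidean distances with a simple piecewise-linear warp applied to the distances before normalization. The abstract does not specify the warping function, so this particular warp, the proxies, and the scale are assumptions, not the paper's loss.

```python
import torch
import torch.nn.functional as F

def warped_softmax_loss(embeddings, labels, proxies, scale=10.0, boost=2.0, radius=1.0):
    """Softmax cross-entropy over negative *warped* Euclidean distances to class proxies.

    The warp amplifies forces inside `radius` and leaves the slope unchanged
    outside it; it is a stand-in for the paper's warping function.
    """
    d = torch.cdist(embeddings, proxies)                         # (batch, num_classes)
    warped = torch.where(d < radius, boost * d, radius * (boost - 1) + d)
    logits = -scale * warped
    return F.cross_entropy(logits, labels)

emb = F.normalize(torch.randn(8, 64), dim=-1)
proxies = F.normalize(torch.randn(10, 64), dim=-1)
labels = torch.randint(0, 10, (8,))
print(warped_softmax_loss(emb, labels, proxies).item())
```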
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Progressive Domain Adaptation for Thermal Infrared Object Tracking + + +
+ Due to the lack of large-scale labeled Thermal InfraRed (TIR) training +datasets, most existing TIR trackers are trained directly on RGB datasets. +However, tracking methods trained on RGB datasets suffer a significant performance +drop on TIR data due to the domain shift issue. To this end, in this work, we +propose a Progressive Domain Adaptation framework for TIR Tracking (PDAT), +which transfers useful knowledge learned from RGB tracking to TIR tracking. The +framework makes full use of large-scale labeled RGB datasets without requiring +time-consuming and labor-intensive labeling of large-scale TIR data. +Specifically, we first propose an adversarial-based global domain adaptation +module to coarsely reduce the domain gap at the feature level. Second, we design a +clustering-based subdomain adaptation method to further align the feature +distributions of the RGB and TIR datasets at a finer granularity. These two domain +adaptation modules gradually eliminate the discrepancy between the two domains, and thus +learn domain-invariant fine-grained features through progressive training. +Additionally, we collect a large-scale TIR dataset with over 1.48 million +unlabeled TIR images for training the proposed domain adaptation framework. +Experimental results on five TIR tracking benchmarks show that the proposed +method gains nearly 6% in success rate, demonstrating its effectiveness. + +
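Adversarial feature-level domain adaptation is commonly implemented with a gradient reversal layer feeding a domain classifier. The sketch below illustrates that generic building block only; it is not PDAT's specific global or clustering-based subdomain modules.

```python
import torch
import torch.nn as nn

class GradReverse(torch.autograd.Function):
    """Identity in the forward pass; flips (and scales) gradients in backward."""
    @staticmethod
    def forward(ctx, x, lam):
        ctx.lam = lam
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return -ctx.lam * grad_output, None

class DomainDiscriminator(nn.Module):
    def __init__(self, dim=256, lam=1.0):
        super().__init__()
        self.lam = lam
        self.net = nn.Sequential(nn.Linear(dim, 128), nn.ReLU(), nn.Linear(128, 2))

    def forward(self, features):
        # The feature extractor tries to fool the discriminator; the
        # discriminator tries to tell RGB (label 0) from TIR (label 1).
        return self.net(GradReverse.apply(features, self.lam))

disc = DomainDiscriminator()
print(disc(torch.randn(4, 256)).shape)  # torch.Size([4, 2])
```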
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ An Efficient Instance Segmentation Framework Using Segmentation + Foundation Models with Oriented Bounding Box Prompts + + +
+ Instance segmentation in unmanned aerial vehicle measurement is a +long-standing challenge. Since horizontal bounding boxes introduce many +interference objects, oriented bounding boxes (OBBs) are usually used for +instance identification. However, based on the "segmentation within bounding box" +paradigm, current instance segmentation methods using OBBs are overly dependent +on bounding box detection performance. To tackle this issue, this paper proposes +OBSeg, an efficient instance segmentation framework using OBBs. OBSeg is based +on box prompt-based segmentation foundation models (BSMs), e.g., Segment +Anything Model. Specifically, OBSeg first detects OBBs to distinguish instances +and provide coarse localization information. Then, it predicts OBB +prompt-related masks for fine segmentation. Since OBBs only serve as prompts, +OBSeg alleviates the over-dependence on bounding box detection performance of +current instance segmentation methods using OBBs. In addition, to enable BSMs +to handle OBB prompts, we propose a novel OBB prompt encoder. To make OBSeg +more lightweight and further improve the performance of lightweight distilled +BSMs, a Gaussian smoothing-based knowledge distillation method is introduced. +Experiments demonstrate that OBSeg outperforms current instance segmentation +methods on multiple public datasets. The code is available at +https://github.com/zhen6618/OBBInstanceSegmentation. + +
+
+
+
+
+ + ♻ ☆ Collaborative Group: Composed Image Retrieval via Consensus Learning + from Noisy Annotations + + +
+ Composed image retrieval extends content-based image retrieval systems by +enabling users to search using reference images and captions that describe +their intention. Despite great progress in developing image-text compositors to +extract discriminative visual-linguistic features, we identify a hitherto +overlooked issue, triplet ambiguity, which impedes robust feature extraction. +Triplet ambiguity refers to a type of semantic ambiguity that arises between +the reference image, the relative caption, and the target image. It is mainly +due to the limited representation of the annotated text, resulting in many +noisy triplets where multiple visually dissimilar candidate images can be +matched to an identical reference pair (i.e., a reference image + a relative +caption). To address this challenge, we propose the Consensus Network +(Css-Net), inspired by the psychological concept that groups outperform +individuals. Css-Net comprises two core components: (1) a consensus module with +four diverse compositors, each generating distinct image-text embeddings, +fostering complementary feature extraction and mitigating dependence on any +single, potentially biased compositor; (2) a Kullback-Leibler divergence loss +that encourages learning of inter-compositor interactions to promote consensual +outputs. During evaluation, the decisions of the four compositors are combined +through a weighting scheme, enhancing overall agreement. On benchmark datasets, +particularly FashionIQ, Css-Net demonstrates marked improvements. Notably, it +achieves significant recall gains, with a 2.77% increase in R@10 and 6.67% +boost in R@50, underscoring its competitiveness in addressing the fundamental +limitations of existing methods. + +
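A sketch of a symmetric consensus term that averages pairwise KL divergences between the compositors' softmax retrieval distributions, encouraging them to agree. The exact form of the paper's KL objective and the weighting scheme at evaluation time are not reproduced; this is an illustrative assumption.

```python
import torch
import torch.nn.functional as F

def consensus_kl(logits_list):
    """Average pairwise KL divergence between compositors' softmax outputs.

    logits_list: list of (batch, num_candidates) similarity logits, one per
    compositor. Minimizing this term pushes the compositors toward consensual
    rankings.
    """
    probs = [F.softmax(l, dim=-1) for l in logits_list]
    logs = [F.log_softmax(l, dim=-1) for l in logits_list]
    loss, pairs = 0.0, 0
    for i in range(len(probs)):
        for j in range(len(probs)):
            if i != j:
                loss = loss + F.kl_div(logs[i], probs[j], reduction="batchmean")
                pairs += 1
    return loss / max(pairs, 1)

four_compositors = [torch.randn(8, 100) for _ in range(4)]
print(consensus_kl(four_compositors).item())
```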
+
+ comment: Accepted by Knowledge-Based Systems (KBS) +
+
+
+
+
+ + ♻ ☆ CAST: Cross-Attention in Space and Time for Video Action Recognition NeurIPS 2023 + + +
+ Recognizing human actions in videos requires spatial and temporal +understanding. Most existing action recognition models lack a balanced +spatio-temporal understanding of videos. In this work, we propose a novel +two-stream architecture, called Cross-Attention in Space and Time (CAST), that +achieves a balanced spatio-temporal understanding of videos using only RGB +input. Our proposed bottleneck cross-attention mechanism enables the spatial +and temporal expert models to exchange information and make synergistic +predictions, leading to improved performance. We validate the proposed method +with extensive experiments on public benchmarks with different characteristics: +EPIC-KITCHENS-100, Something-Something-V2, and Kinetics-400. Our method +consistently shows favorable performance across these datasets, while the +performance of existing methods fluctuates depending on the dataset +characteristics. + +
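A sketch of cross-attention letting a spatial and a temporal expert query each other's tokens, the general mechanism the abstract describes. It omits CAST's bottleneck reduction and its exact placement inside the expert networks; the token counts and dimensions are assumptions.

```python
import torch
import torch.nn as nn

class CrossAttentionExchange(nn.Module):
    """Exchange information between two expert token streams via cross-attention."""
    def __init__(self, dim=256, heads=4):
        super().__init__()
        self.s2t = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.t2s = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, spatial_tokens, temporal_tokens):
        # Each stream queries the other stream's tokens; residual connections
        # keep the original expert features.
        s_out, _ = self.t2s(spatial_tokens, temporal_tokens, temporal_tokens)
        t_out, _ = self.s2t(temporal_tokens, spatial_tokens, spatial_tokens)
        return spatial_tokens + s_out, temporal_tokens + t_out

xattn = CrossAttentionExchange()
s, t = torch.randn(2, 196, 256), torch.randn(2, 16, 256)
print([o.shape for o in xattn(s, t)])
```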
+
+ comment: This is an accepted NeurIPS 2023. Project webpage is available at + https://jong980812.github.io/CAST.github.io/ Code is available at + https://github.com/KHU-VLL/CAST +
+
+
+
+
+ + ♻ ☆ TALDS-Net: Task-Aware Adaptive Local Descriptors Selection for Few-shot + Image Classification ICASSP 2024 + + +
+ Few-shot image classification aims to classify images from unseen novel +classes with few samples. Recent works demonstrate that deep local descriptors +exhibit enhanced representational capabilities compared to image-level +features. However, most existing methods solely rely on either employing all +local descriptors or directly utilizing partial descriptors, potentially +resulting in the loss of crucial information. Moreover, these methods primarily +emphasize the selection of query descriptors while overlooking support +descriptors. In this paper, we propose a novel Task-Aware Adaptive Local +Descriptors Selection Network (TALDS-Net), which exhibits the capacity for +adaptive selection of task-aware support descriptors and query descriptors. +Specifically, we compare the similarity of each local support descriptor with +other local support descriptors to obtain the optimal support descriptor subset +and then compare the query descriptors with the optimal support subset to +obtain discriminative query descriptors. Extensive experiments demonstrate that +our TALDS-Net outperforms state-of-the-art methods on both general and +fine-grained datasets. + +
+
+ comment: 4 pages, 1 figure, accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Asynchronous Blob Tracker for Event Cameras + + +
+ Event-based cameras are popular for tracking fast-moving objects due to their +high temporal resolution, low latency, and high dynamic range. In this paper, +we propose a novel algorithm for tracking event blobs using raw events +asynchronously in real time. We introduce the concept of an event blob as a +spatio-temporal likelihood of event occurrence where the conditional spatial +likelihood is blob-like. Many real-world objects such as car headlights or any +quickly moving foreground objects generate event blob data. The proposed +algorithm uses a nearest neighbour classifier with a dynamic threshold criterion +for data association coupled with an extended Kalman filter to track the event +blob state. Our algorithm achieves highly accurate blob tracking, velocity +estimation, and shape estimation even under challenging lighting conditions and +high-speed motions (> 11000 pixels/s). The microsecond time resolution achieved +means that the filter output can be used to derive secondary information such +as time-to-contact or range estimation, which will enable applications to +real-world problems such as collision avoidance in autonomous driving. + +
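A generic version of the data-association-plus-Kalman-filter idea: a constant-velocity filter over position and velocity with a simple nearest-neighbour gate, updated per asynchronous event. The noise matrices, gate threshold, and the omission of the paper's blob-shape state are all assumptions.

```python
import numpy as np

class BlobTracker:
    """Constant-velocity Kalman filter over [x, y, vx, vy] with NN gating."""
    def __init__(self, x0, y0, gate=5.0):
        self.s = np.array([x0, y0, 0.0, 0.0])
        self.P = np.eye(4) * 10.0
        self.Q = np.eye(4) * 0.01       # process noise (assumed)
        self.R = np.eye(2) * 0.5        # measurement noise (assumed)
        self.H = np.array([[1, 0, 0, 0], [0, 1, 0, 0]], dtype=float)
        self.gate = gate

    def update(self, ex, ey, dt):
        F = np.eye(4); F[0, 2] = F[1, 3] = dt                 # predict
        self.s = F @ self.s
        self.P = F @ self.P @ F.T + self.Q * dt
        z = np.array([ex, ey])
        if np.linalg.norm(z - self.s[:2]) > self.gate:        # NN gate
            return False                                       # event not associated
        y = z - self.H @ self.s                                # correct
        S = self.H @ self.P @ self.H.T + self.R
        K = self.P @ self.H.T @ np.linalg.inv(S)
        self.s = self.s + K @ y
        self.P = (np.eye(4) - K @ self.H) @ self.P
        return True

trk = BlobTracker(64.0, 64.0)
trk.update(64.5, 63.8, dt=1e-4)   # one asynchronous event, microsecond-scale dt
print(trk.s[:2], trk.s[2:])       # position and velocity estimates
```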
+
+ comment: 18 pages, 16 figures, Manuscript was accepted on August 7, 2024, by + IEEE Transactions on Robotics +
+
+
+
+
+ + ♻ ☆ Rethinking Barely-Supervised Volumetric Medical Image Segmentation from + an Unsupervised Domain Adaptation Perspective + + +
+ This paper investigates an extremely challenging problem: barely-supervised +volumetric medical image segmentation (BSS). A BSS training dataset consists of +two parts: 1) a barely-annotated labeled set, where each labeled image contains +only a single-slice annotation, and 2) an unlabeled set comprising numerous +unlabeled volumetric images. State-of-the-art BSS methods employ a +registration-based paradigm, which uses inter-slice image registration to +propagate single-slice annotations into volumetric pseudo labels, constructing +a completely annotated labeled set, to which a semi-supervised segmentation +scheme can be applied. However, the paradigm has a critical limitation: the +pseudo-labels generated by image registration are unreliable and noisy. +Motivated by this, we propose a new perspective: instead of solving BSS within +a semi-supervised learning scheme, this work formulates BSS as an unsupervised +domain adaptation problem. To this end, we propose a novel BSS framework, +Barely-supervised learning via unsupervised domain Adaptation (BvA), as an +alternative to the dominant registration paradigm. Specifically, we first design +a novel noise-free labeled data construction algorithm (NFC) for slice-to-volume +labeled data synthesis. Then, we introduce a frequency and spatial Mix-Up +strategy (FSX) to mitigate the domain shifts. Extensive experiments demonstrate +that our method provides a promising alternative for BSS. Remarkably, the +proposed method, trained on the left atrial segmentation dataset with only one +barely-labeled image, achieves a Dice score of 81.20%, outperforming the +state-of-the-art by 61.71%. The code is available at +https://github.com/Senyh/BvA. + +
+
+
+
+
+ + ♻ ☆ Planning and Rendering: Towards Product Poster Generation with Diffusion + Models + + +
+ Product poster generation significantly optimizes design efficiency and +reduces production costs. Prevailing methods predominantly rely on +image-inpainting methods to generate clean background images for given +products. Subsequently, poster layout generation methods are employed to +produce corresponding layout results. However, the background images may not be +suitable for accommodating textual content due to their complexity, and the +fixed location of products limits the diversity of layout results. To alleviate +these issues, we propose a novel product poster generation framework based on +diffusion models named P&R. The P&R draws inspiration from the workflow of +designers in creating posters, which consists of two stages: Planning and +Rendering. At the planning stage, we propose a PlanNet to generate the layout +of the product and other visual components considering both the appearance +features of the product and semantic features of the text, which improves the +diversity and rationality of the layouts. At the rendering stage, we propose a +RenderNet to generate the background for the product while considering the +generated layout, where a spatial fusion module is introduced to fuse the +layout of different visual components. To foster the advancement of this field, +we propose the first product poster generation dataset PPG30k, comprising 30k +exquisite product poster images along with comprehensive image and text +annotations. Our method outperforms the state-of-the-art product poster +generation methods on PPG30k. The PPG30k will be released soon. + +
+
+
+
+
+ + ♻ ☆ Unveiling the Human-like Similarities of Automatic Facial Expression + Recognition: An Empirical Exploration through Explainable AI + + +
+ Facial expression recognition is vital for human behavior analysis, and deep +learning has enabled models that can outperform humans. However, it is unclear +how closely they mimic human processing. This study aims to explore the +similarity between deep neural networks and human perception by comparing +twelve different networks, including both general object classifiers and +FER-specific models. We employ an innovative global explainable AI method to +generate heatmaps, revealing crucial facial regions for the twelve networks +trained on six facial expressions. We assess these results both quantitatively +and qualitatively, comparing them both to ground-truth masks based on Friesen and +Ekman's description and among themselves. We use Intersection over Union (IoU) and +normalized correlation coefficients for comparisons. We generate 72 heatmaps to +highlight critical regions for each expression and architecture. Qualitatively, +models with pre-trained weights show more similarity in heatmaps compared to +those without pre-training. Specifically, eye and nose areas influence certain +facial expressions, while the mouth is consistently important across all models +and expressions. Quantitatively, we find low average IoU values (avg. 0.2702) +across all expressions and architectures. The best-performing architecture +averages 0.3269, while the worst-performing one averages 0.2066. Dendrograms, +built with the normalized correlation coefficient, reveal two main clusters for +most expressions: models with pre-training and models without pre-training. +Findings suggest limited alignment between human and AI facial expression +recognition, with network architectures influencing the similarity, as similar +architectures prioritize similar facial regions. + +
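The IoU comparison mentioned above reduces to thresholding a heatmap and intersecting it with a ground-truth facial-region mask; the sketch below shows that computation, with the 0.5 threshold being an assumption rather than the paper's setting.

```python
import numpy as np

def heatmap_iou(heatmap, gt_mask, thresh=0.5):
    """IoU between a binarized explanation heatmap and a ground-truth region mask."""
    pred = heatmap >= thresh
    gt = gt_mask.astype(bool)
    inter = np.logical_and(pred, gt).sum()
    union = np.logical_or(pred, gt).sum()
    return inter / union if union else 0.0

print(heatmap_iou(np.random.rand(96, 96), np.random.rand(96, 96) > 0.7))
```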
+
+ comment: Multimed Tools Appl (2024) +
+
+
+
+
+ + ♻ ☆ Learning Exposure Correction in Dynamic Scenes + + +
+ Exposure correction aims to enhance visual data suffering from improper +exposures, which can greatly improve satisfactory visual effects. However, +previous methods mainly focus on the image modality, and the video counterpart +is less explored in the literature. Directly applying prior image-based methods +to videos results in temporal incoherence with low visual quality. Through +thorough investigation, we find that the development of relevant communities is +limited by the absence of a benchmark dataset. Therefore, in this paper, we +construct the first real-world paired video dataset, including both +underexposure and overexposure dynamic scenes. To achieve spatial alignment, we +utilize two DSLR cameras and a beam splitter to simultaneously capture improper +and normal exposure videos. Additionally, we propose an end-to-end video +exposure correction network, in which a dual-stream module is designed to deal +with both underexposure and overexposure factors, enhancing the illumination +based on Retinex theory. The extensive experiments based on various metrics and +user studies demonstrate the significance of our dataset and the effectiveness +of our method. The code and dataset are available at +https://github.com/kravrolens/VECNet. + +
+
+ comment: To be published at ACM Multimedia 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Computer Vision based Activity Recognition and Fall + Detection of the Elderly: a Systematic Review + + +
+ As the percentage of elderly people in developed countries increases +worldwide, the healthcare of this group is a growing concern, especially +with regard to preserving their autonomy. In this direction, many +studies are being published on Ambient Assisted Living (AAL) systems, which +help to reduce the concerns raised by elderly people living independently. +In this study, a systematic review of the literature is presented on +fall detection and Human Activity Recognition (HAR) for the elderly, as the two +main tasks to solve to guarantee the safety of elderly people living alone. To +address the current tendency to perform these two tasks, the review focuses on +the use of Deep Learning (DL) based approaches on computer vision data. In +addition, different resources, such as DL models, datasets, and hardware +(e.g. depth or thermal cameras), are gathered from the reviewed studies and +provided for reference in future studies. Strengths and weaknesses of existing +approaches are also discussed and, based on them, our recommendations for +future works are provided. + +
+
+
+
+
+ + ♻ ☆ RefSAM: Efficiently Adapting Segmenting Anything Model for Referring + Video Object Segmentation + + +
+ The Segment Anything Model (SAM) has gained significant attention for its +impressive performance in image segmentation. However, it lacks proficiency in +referring video object segmentation (RVOS) due to the need for precise +user-interactive prompts and a limited understanding of different modalities, +such as language and vision. This paper presents the RefSAM model, which +explores the potential of SAM for RVOS by incorporating multi-view information +from diverse modalities and successive frames at different timestamps in an +online manner. Our proposed approach adapts the original SAM model to enhance +cross-modality learning by employing a lightweight Cross-Modal MLP that +projects the text embedding of the referring expression into sparse and dense +embeddings, serving as user-interactive prompts. Additionally, we have +introduced the hierarchical dense attention module to fuse hierarchical visual +semantic information with sparse embeddings to obtain fine-grained dense +embeddings, and an implicit tracking module to generate a tracking token and +provide historical information for the mask decoder. Furthermore, we employ a +parameter-efficient tuning strategy to align and fuse the language and vision +features effectively. Through comprehensive ablation studies, we demonstrate +our model's practical and effective design choices. Extensive experiments +conducted on Refer-Youtube-VOS, Ref-DAVIS17, and three referring image +segmentation datasets validate the superiority and effectiveness of our RefSAM +model over existing methods. + +
+
+
+
+
+ + ♻ ☆ PointRWKV: Efficient RWKV-Like Model for Hierarchical Point Cloud + Learning + + +
+ Transformers have revolutionized the point cloud learning task, but their +quadratic complexity hinders extension to long sequences and places a burden +on limited computational resources. The recent advent of RWKV, a fresh breed of +deep sequence models, has shown immense potential for sequence modeling in NLP +tasks. In this paper, we present PointRWKV, a model of linear complexity +derived from the RWKV model in the NLP field with necessary modifications for +point cloud learning tasks. Specifically, taking the embedded point patches as +input, we first propose to explore the global processing capabilities within +PointRWKV blocks using modified multi-headed matrix-valued states and a dynamic +attention recurrence mechanism. To extract local geometric features +simultaneously, we design a parallel branch to encode the point cloud +efficiently in a fixed radius near-neighbors graph with a graph stabilizer. +Furthermore, we design PointRWKV as a multi-scale framework for hierarchical +feature learning of 3D point clouds, facilitating various downstream tasks. +Extensive experiments on different point cloud learning tasks show our proposed +PointRWKV outperforms the transformer- and mamba-based counterparts, while +significantly saving about 42% FLOPs, demonstrating its potential as an option +for constructing foundational 3D models. + +
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: Accepted at Journal of Machine Learning Research. This paper + integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete + story. In this paper, we improve the writing and organization, and also add + conceptual, empirical, and theoretical improvements over the previous work. + V2: small typo fixes and formatting improvements. V3: improvements from + journal revisions +
+
+
+
+
+ + ♻ ☆ Correlation-Embedded Transformer Tracking: A Single-Branch Framework + + +
+ Developing robust and discriminative appearance models has been a +long-standing research challenge in visual object tracking. In the prevalent +Siamese-based paradigm, the features extracted by the Siamese-like networks are +often insufficient to model the tracked targets and distractor objects, thereby +hindering them from being robust and discriminative simultaneously. While most +Siamese trackers focus on designing robust correlation operations, we propose a +novel single-branch tracking framework inspired by the transformer. Unlike the +Siamese-like feature extraction, our tracker deeply embeds cross-image feature +correlation in multiple layers of the feature network. By extensively matching +the features of the two images through multiple layers, it can suppress +non-target features, resulting in target-aware feature extraction. The output +features can be directly used for predicting target locations without +additional correlation steps. Thus, we reformulate the two-branch Siamese +tracking as a conceptually simple, fully transformer-based Single-Branch +Tracking pipeline, dubbed SBT. After conducting an in-depth analysis of the SBT +baseline, we summarize many effective design principles and propose an improved +tracker dubbed SuperSBT. SuperSBT adopts a hierarchical architecture with a +local modeling layer to enhance shallow-level features. A unified relation +modeling is proposed to remove complex handcrafted layer pattern designs. +SuperSBT is further improved by masked image modeling pre-training, integrating +temporal modeling, and equipping with dedicated prediction heads. Thus, +SuperSBT outperforms the SBT baseline by 4.7%,3.0%, and 4.5% AUC scores in +LaSOT, TrackingNet, and GOT-10K. Notably, SuperSBT greatly raises the speed of +SBT from 37 FPS to 81 FPS. Extensive experiments show that our method achieves +superior results on eight VOT benchmarks. + +
+
+ comment: Extension of SBT paper, accepted by TPAMI +
+
+
+
+
+ + ♻ ☆ Learning from the Web: Language Drives Weakly-Supervised Incremental + Learning for Semantic Segmentation ECCV 2024 + + +
+ Current weakly-supervised incremental learning for semantic segmentation +(WILSS) approaches only consider replacing pixel-level annotations with +image-level labels, while the training images are still from well-designed +datasets. In this work, we argue that widely available web images can also be +considered for the learning of new classes. To achieve this, firstly we +introduce a strategy to select web images which are similar to previously seen +examples in the latent space using a Fourier-based domain discriminator. Then, +an effective caption-driven rehearsal strategy is proposed to preserve +previously learnt classes. To our knowledge, this is the first work to rely +solely on web images for both the learning of new concepts and the preservation +of the already learned ones in WILSS. Experimental results show that the +proposed approach can reach state-of-the-art performance without using +manually selected and annotated data in the incremental steps. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Image-Based Virtual Try-On: A Survey + + +
+ Image-based virtual try-on aims to synthesize a naturally dressed person +image with a clothing image, which revolutionizes online shopping and inspires +related topics within image generation, showing both research significance and +commercial potential. However, there is a gap between current research progress +and commercial applications and an absence of a comprehensive overview of this +field to accelerate the development. In this survey, we provide a comprehensive +analysis of the state-of-the-art techniques and methodologies in aspects of +pipeline architecture, person representation and key modules such as try-on +indication, clothing warping and try-on stage. We additionally apply CLIP to +assess the semantic alignment of try-on results, and evaluate representative +methods with uniformly implemented evaluation metrics on the same dataset. In +addition to quantitative and qualitative evaluation of current open-source +methods, unresolved issues are highlighted and future research directions are +prospected to identify key trends and inspire further exploration. The +uniformly implemented evaluation metrics, dataset and collected methods will be +made publicly available at +https://github.com/little-misfit/Survey-Of-Virtual-Try-On. + +
+
+ comment: 30 pages, 20 figures +
+
+
+
+
+ + ♻ ☆ DocKylin: A Large Multimodal Model for Visual Document Understanding + with Efficient Visual Slimming + + +
+ Current multimodal large language models (MLLMs) face significant challenges +in visual document understanding (VDU) tasks due to the high resolution, dense +text, and complex layouts typical of document images. These characteristics +demand a high level of detail perception ability from MLLMs. While increasing +input resolution improves detail perception capability, it also leads to longer +sequences of visual tokens, increasing computational costs and straining the +models' ability to handle long contexts. To address these challenges, we +introduce DocKylin, a document-centric MLLM that performs visual content +slimming at both the pixel and token levels, thereby reducing token sequence +length in VDU scenarios. We introduce an Adaptive Pixel Slimming (APS) +preprocessing module to perform pixel-level slimming, increasing the proportion +of informative pixels. Moreover, we propose a novel Dynamic Token Slimming +(DTS) module to conduct token-level slimming, filtering essential tokens and +removing others to adaptively create a more compact visual sequence. +Experiments demonstrate DocKylin's promising performance across various VDU +benchmarks and the effectiveness of each component. + +
+
+
+
+
+ + ♻ ☆ Towards reliable respiratory disease diagnosis based on cough sounds and + vision transformers + + +
+ Recent advancements in deep learning techniques have sparked performance +boosts in various real-world applications including disease diagnosis based on +multi-modal medical data. Cough sound data-based respiratory disease (e.g., +COVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also +attracted much attention. However, existing works usually utilise traditional +machine learning or deep models of moderate scales. On the other hand, the +developed approaches are trained and evaluated on small-scale data due to the +difficulty of curating and annotating clinical data on scale. To address these +issues in prior works, we create a unified framework to evaluate various deep +models from lightweight Convolutional Neural Networks (e.g., ResNet18) to +modern vision transformers and compare their performance in respiratory disease +classification. Based on the observations from such an extensive empirical +study, we propose a novel approach to cough-based disease classification based +on both self-supervised and supervised learning on a large-scale cough data +set. Experimental results demonstrate our proposed approach outperforms prior +arts consistently on two benchmark datasets for COVID-19 diagnosis and a +proprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%. + +
+
+
+
+
+ + ♻ ☆ Learn Suspected Anomalies from Event Prompts for Video Anomaly Detection + + +
+ Most models for weakly supervised video anomaly detection (WS-VAD) rely on +multiple instance learning, aiming to distinguish normal and abnormal snippets +without specifying the type of anomaly. However, the ambiguous nature of +anomaly definitions across contexts may introduce inaccuracy in discriminating +abnormal and normal events. To show the model what is anomalous, a novel +framework is proposed to guide the learning of suspected anomalies from event +prompts. Given a textual prompt dictionary of potential anomaly events and the +captions generated from anomaly videos, the semantic anomaly similarity between +them could be calculated to identify the suspected events for each video +snippet. It enables a new multi-prompt learning process to constrain the +visual-semantic features across all videos, as well as provides a new way to +label pseudo anomalies for self-training. To demonstrate its effectiveness, +comprehensive experiments and detailed ablation studies are conducted on four +datasets, namely XD-Violence, UCF-Crime, TAD, and ShanghaiTech. Our proposed +model outperforms most state-of-the-art methods in terms of AP or AUC (86.5%, +90.4%, 94.4%, and 97.4%). Furthermore, it shows promising performance +in open-set and cross-dataset cases. The data, code, and models can be found +at: https://github.com/shiwoaz/lap. + +
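A sketch of the similarity scoring step described above: per-snippet captions are compared against a dictionary of anomaly-event prompts in a shared text-embedding space. The text encoder, the cosine-similarity matching rule, and the top-k selection are assumptions for illustration, not the paper's exact procedure.

```python
import torch
import torch.nn.functional as F

def suspected_events(caption_emb, prompt_embs, top_k=1):
    """Rank anomaly-event prompts by cosine similarity to a snippet caption.

    caption_emb: (dim,) embedding of the caption generated for one snippet.
    prompt_embs: (num_prompts, dim) embeddings of the anomaly prompt dictionary.
    Both are assumed to come from the same text encoder (e.g. a CLIP-style one).
    """
    sims = F.cosine_similarity(caption_emb.unsqueeze(0), prompt_embs, dim=-1)
    scores, idx = sims.topk(top_k)
    return idx.tolist(), scores.tolist()

prompts = F.normalize(torch.randn(12, 512), dim=-1)   # e.g. "fighting", "explosion", ...
caption = F.normalize(torch.randn(512), dim=-1)       # caption embedding for one snippet
print(suspected_events(caption, prompts, top_k=3))
```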
+
+
+
+
+ + ♻ ☆ TagCLIP: Improving Discrimination Ability of Open-Vocabulary Semantic + Segmentation + + +
+ Contrastive Language-Image Pre-training (CLIP) has recently shown great +promise in pixel-level zero-shot learning tasks. However, existing approaches +utilizing CLIP's text and patch embeddings to generate semantic masks often +misidentify input pixels from unseen classes, leading to confusion between +novel classes and semantically similar ones. In this work, we propose a novel +approach, TagCLIP (Trusty-aware guided CLIP), to address this issue. We +disentangle the ill-posed optimization problem into two parallel processes: +semantic matching performed individually and reliability judgment for improving +discrimination ability. Building on the idea of special tokens in language +modeling representing sentence-level embeddings, we introduce a trusty token +that enables distinguishing novel classes from known ones in prediction. To +evaluate our approach, we conduct experiments on three benchmark datasets: PASCAL +VOC 2012, COCO-Stuff 164K, and PASCAL Context. Our results show that TagCLIP +improves the Intersection over Union (IoU) of unseen classes by 7.4%, 1.7% and +2.1%, respectively, with negligible overheads. The code is available at +https://github.com/dvlab-research/TagCLIP. + +
+
+ comment: TPAMI2024 +
+
+
+
+
+ + ♻ ☆ Cross-Platform Video Person ReID: A New Benchmark Dataset and Adaptation + Approach ECCV 2024 + + +
+ In this paper, we construct a large-scale benchmark dataset for +Ground-to-Aerial Video-based person Re-Identification, named G2A-VReID, which +comprises 185,907 images and 5,576 tracklets, featuring 2,788 distinct +identities. To our knowledge, this is the first dataset for video ReID under +Ground-to-Aerial scenarios. G2A-VReID dataset has the following +characteristics: 1) Drastic view changes; 2) Large number of annotated +identities; 3) Rich outdoor scenarios; 4) Huge difference in resolution. +Additionally, we propose a new benchmark approach for cross-platform ReID by +transforming the cross-platform visual alignment problem into visual-semantic +alignment through vision-language model (i.e., CLIP) and applying a +parameter-efficient Video Set-Level-Adapter module to adapt image-based +foundation model to video ReID tasks, termed VSLA-CLIP. Besides, to further +reduce the great discrepancy across the platforms, we also devise the +platform-bridge prompts for efficient visual feature alignment. Extensive +experiments demonstrate the superiority of the proposed method on all existing +video ReID datasets and our proposed G2A-VReID dataset. + +
+
+ comment: Published at ECCV 2024 +
+
+
+
+
+ + ♻ ☆ The Impact of Print-Scanning in Heterogeneous Morph Evaluation Scenarios + + +
+ Face morphing attacks pose an increasing threat to face recognition (FR) +systems. A morphed photo contains biometric information from two different +subjects to take advantage of vulnerabilities in FRs. These systems are +particularly susceptible to attacks when the morphs are subjected to +print-scanning to mask the artifacts generated during the morphing process. We +investigate the impact of print-scanning on morphing attack detection through a +series of evaluations on heterogeneous morphing attack scenarios. Our +experiments show that we can increase the Mated Morph Presentation Match Rate +(MMPMR) by up to 8.48%. Furthermore, when a Single-image Morphing Attack +Detection (S-MAD) algorithm is not trained to detect print-scanned morphs the +Morphing Attack Classification Error Rate (MACER) can increase by up to 96.12%, +indicating significant vulnerability. + +
+
+ comment: Accepted as a special sessions paper at IJCB 2024 +
+
+
+
+
+ + ♻ ☆ AIGCs Confuse AI Too: Investigating and Explaining Synthetic + Image-induced Hallucinations in Large Vision-Language Models + + +
+ The evolution of Artificial Intelligence Generated Contents (AIGCs) is +advancing towards higher quality. The growing interactions with AIGCs present a +new challenge to the data-driven AI community: While AI-generated contents have +played a crucial role in a wide range of AI models, the potential hidden risks +they introduce have not been thoroughly examined. Beyond human-oriented forgery +detection, AI-generated content poses potential issues for AI models originally +designed to process natural data. In this study, we underscore the exacerbated +hallucination phenomena in Large Vision-Language Models (LVLMs) caused by +AI-synthetic images. Remarkably, our findings shed light on a consistent AIGC +hallucination bias: the object hallucinations induced by synthetic +images are characterized by a greater quantity and a more uniform position +distribution, even though these synthetic images do not manifest unrealistic or +additional relevant visual features compared to natural images. Moreover, our +investigations on the Q-former and linear projector reveal that synthetic images +may present token deviations after visual projection, thereby amplifying the +hallucination bias. + +
+
+
+
+
+ + ♻ ☆ Enhancing Representation in Radiography-Reports Foundation Model: A + Granular Alignment Algorithm Using Masked Contrastive Learning + + +
+ Recently, multi-modal vision-language foundation models have gained +significant attention in the medical field. While these models offer great +opportunities, they still face crucial challenges, such as the requirement for +fine-grained knowledge understanding in computer-aided diagnosis and the +capability of utilizing very limited or even no task-specific labeled data in +real-world clinical applications. In this study, we present MaCo, a masked +contrastive chest X-ray foundation model that tackles these challenges. MaCo +explores masked contrastive learning to simultaneously achieve fine-grained +image understanding and zero-shot learning for a variety of medical imaging +tasks. It designs a correlation weighting mechanism to adjust the correlation +between masked chest X-ray image patches and their corresponding reports, +thereby enhancing the model's representation learning capabilities. To evaluate +the performance of MaCo, we conducted extensive experiments using 6 well-known +open-source X-ray datasets. The experimental results demonstrate the +superiority of MaCo over 10 state-of-the-art approaches across tasks such as +classification, segmentation, detection, and phrase grounding. These findings +highlight the significant potential of MaCo in advancing a wide range of +medical image analysis tasks. + +
+
+
+
+
+ + ♻ ☆ BrainVis: Exploring the Bridge between Brain and Visual Signals via + Image Reconstruction + + +
+ Analyzing and reconstructing visual stimuli from brain signals effectively +advances the understanding of the human visual system. However, EEG signals are +complex and contain significant noise. This leads to substantial limitations in +existing work on visual stimulus reconstruction from EEG, such as difficulties +in aligning EEG embeddings with fine-grained semantic information and a +heavy reliance on additional large self-collected datasets for training. To +address these challenges, we propose a novel approach called BrainVis. Firstly, +we divide the EEG signals into various units and apply a self-supervised +approach on them to obtain EEG time-domain features, in an attempt to ease the +training difficulty. Additionally, we propose to utilize the +frequency-domain features to enhance the EEG representations. Then, we +simultaneously align EEG time-frequency embeddings with the interpolation of +the coarse and fine-grained semantics in the CLIP space, to highlight the +primary visual components and reduce the cross-modal alignment difficulty. +Finally, we adopt cascaded diffusion models to reconstruct images. Using +only 10\% of the training data of previous work, our proposed BrainVis outperforms +the state of the art in both semantic fidelity of the reconstruction and generation +quality. The code is available at https://github.com/RomGai/BrainVis. + +
+
+
+
+
+ + ♻ ☆ GISR: Geometric Initialization and Silhouette-based Refinement for + Single-View Robot Pose and Configuration Estimation + + +
+ In autonomous robotics, measurement of the robot's internal state and +perception of its environment, including interaction with other agents such as +collaborative robots, are essential. Estimating the pose of the robot arm from +a single view has the potential to replace classical eye-to-hand calibration +approaches and is particularly attractive for online estimation and dynamic +environments. In addition to its pose, recovering the robot configuration +provides a complete spatial understanding of the observed robot that can be +used to anticipate the actions of other agents in advanced robotics use cases. +Furthermore, this additional redundancy enables the planning and execution of +recovery protocols in case of sensor failures or external disturbances. We +introduce GISR - a deep configuration and robot-to-camera pose estimation +method that prioritizes execution in real-time. GISR consists of two modules: +(i) a geometric initialization module that efficiently computes an approximate +robot pose and configuration, and (ii) a deep iterative silhouette-based +refinement module that arrives at a final solution in just a few iterations. We +evaluate GISR on publicly available data and show that it outperforms existing +methods of the same class in terms of both speed and accuracy, and can compete +with approaches that rely on ground-truth proprioception and recover only the +pose. + +
+
+ comment: IEEE Robotics and Automation Letters (under revision), code available + at http://github.com/iwhitey/GISR-robot +
+
+
+
+
+ + ♻ ☆ IDNet: A Novel Dataset for Identity Document Analysis and Fraud + Detection + + +
+ Effective fraud detection and analysis of government-issued identity +documents, such as passports, driver's licenses, and identity cards, are +essential in thwarting identity theft and bolstering security on online +platforms. The training of accurate fraud detection and analysis tools depends +on the availability of extensive identity document datasets. However, current +publicly available benchmark datasets for identity document analysis, including +MIDV-500, MIDV-2020, and FMIDV, fall short in several respects: they offer a +limited number of samples, cover insufficient varieties of fraud patterns, and +seldom include alterations in critical personal identifying fields like +portrait images, limiting their utility in training models capable of detecting +realistic frauds while preserving privacy. + In response to these shortcomings, our research introduces a new benchmark +dataset, IDNet, designed to advance privacy-preserving fraud detection efforts. +The IDNet dataset comprises 837,060 images of synthetically generated identity +documents, totaling approximately 490 gigabytes, categorized into 20 types from +10 U.S. states and 10 European countries. We evaluate the utility and present +use cases of the dataset, illustrating how it can aid in training +privacy-preserving fraud detection methods, facilitating the generation of +camera and video captures of identity documents, and testing schema +unification and other identity document management functionalities. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ♻ ☆ Projected Stochastic Gradient Descent with Quantum Annealed Binary + Gradients + + +
+ We present QP-SBGD, a novel layer-wise stochastic optimiser tailored towards +training neural networks with binary weights, known as binary neural networks +(BNNs), on quantum hardware. BNNs reduce the computational requirements and +energy consumption of deep learning models with minimal loss in accuracy. +However, training them in practice remains an open challenge. Most known +BNN-optimisers either rely on projected updates or binarise weights +post-training. Instead, QP-SBGD approximately maps the gradient onto binary +variables by solving a quadratic constrained binary optimisation problem. Under +practically reasonable assumptions, we show that this update rule converges +with a rate of $\mathcal{O}(1 / \sqrt{T})$. Moreover, we show how the +$\mathcal{NP}$-hard projection can be effectively executed on an adiabatic +quantum annealer, harnessing recent advancements in quantum computation. We +also introduce a projected version of this update rule and prove that if a +fixed point exists in the binary variable space, the modified updates will +converge to it. Last but not least, our algorithm is implemented layer-wise, +making it suitable to train larger networks on resource-limited quantum +hardware. Through extensive evaluations, we show that QP-SBGD outperforms or is +on par with competitive and well-established baselines such as BinaryConnect, +signSGD and ProxQuant when optimising the Rosenbrock function, training BNNs as +well as binary graph neural networks. + +
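To make the projection step concrete, here is a toy sketch (not the paper's implementation) of mapping a gradient step for ±1 weights onto binary variables by exhaustively solving a small quadratic binary objective; on real hardware this search would be delegated to an adiabatic annealer, and the step size and inputs below are illustrative.

```python
import itertools
import numpy as np

def project_step_to_binary(w, grad, lr=0.5):
    """Pick new weights b in {-1, +1}^n closest to the ideal SGD step.

    Minimises ||b - (w - lr * grad)||^2, a (separable) quadratic binary
    objective, by brute force; an annealer would handle coupled objectives.
    """
    target = w - lr * grad
    best, best_cost = None, np.inf
    for candidate in itertools.product([-1.0, 1.0], repeat=len(w)):
        b = np.array(candidate)
        cost = np.sum((b - target) ** 2)
        if cost < best_cost:
            best, best_cost = b, cost
    return best

w = np.array([1.0, -1.0, 1.0])
grad = np.array([4.0, -0.2, 0.1])
print(project_step_to_binary(w, grad))  # -> [-1. -1.  1.]
```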
+
+
+
+
+ + ♻ ☆ Co-synthesis of Histopathology Nuclei Image-Label Pairs using a + Context-Conditioned Joint Diffusion Model ECCV 2024 + + +
+ In multi-class histopathology nuclei analysis tasks, the lack of training +data becomes a main bottleneck for the performance of learning-based methods. +To tackle this challenge, previous methods have utilized generative models to +increase data by generating synthetic samples. However, existing methods often +overlook the importance of considering the context of biological tissues (e.g., +shape, spatial layout, and tissue type) in the synthetic data. Moreover, while +generative models have shown superior performance in synthesizing realistic +histopathology images, none of the existing methods are capable of producing +image-label pairs at the same time. In this paper, we introduce a novel +framework for co-synthesizing histopathology nuclei images and paired semantic +labels using a context-conditioned joint diffusion model. We propose +conditioning of a diffusion model using nucleus centroid layouts with +structure-related text prompts to incorporate spatial and structural context +information into the generation targets. Moreover, we enhance the granularity +of our synthesized semantic labels by generating instance-wise nuclei labels +using distance maps synthesized concurrently in conjunction with the images and +semantic labels. We demonstrate the effectiveness of our framework in +generating high-quality samples on multi-institutional, multi-organ, and +multi-modality datasets. Our synthetic data consistently outperforms existing +augmentation methods in the downstream tasks of nuclei segmentation and +classification. + +
+
+ comment: ECCV 2024 accepted +
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ SpannerLib: Embedding Declarative Information Extraction in an + Imperative Workflow + + +
+ Document spanners have been proposed as a formal framework for declarative +Information Extraction (IE) from text, following IE products from industry +and academia. Over the past decade, the framework has been studied thoroughly +in terms of expressive power, complexity, and the ability to naturally combine +text analysis with relational querying. This demonstration presents SpannerLib, +a library for embedding document spanners in Python code. SpannerLib +facilitates the development of IE programs by providing an implementation of +Spannerlog (Datalog-based document spanners) that interacts with the Python code +in two directions: rules can be embedded inside Python, and they can invoke +custom Python code (e.g., calls to ML-based NLP models) via user-defined +functions. The demonstration scenarios showcase IE programs, with increasing +levels of complexity, within Jupyter Notebook. + +
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ Laser: Parameter-Efficient LLM Bi-Tuning for Sequential Recommendation + with Collaborative Information + + +
+ Sequential recommender systems are essential for discerning user preferences +from historical interactions and facilitating targeted recommendations. Recent +innovations employing Large Language Models (LLMs) have advanced the field by +encoding item semantics, yet they often necessitate substantial parameter +tuning and are resource-demanding. Moreover, these works fail to consider the +diverse characteristics of different types of users, which diminishes +recommendation accuracy. In this paper, we propose a parameter-efficient Large +Language Model Bi-Tuning framework for sequential recommendation with +collaborative information (Laser). Specifically, Bi-Tuning works by inserting +trainable virtual tokens at both the prefix and suffix of the input sequence +and freezing the LLM parameters, thus optimizing the LLM for sequential +recommendation. In Laser, the prefix is utilized to incorporate user-item +collaborative information and adapt the LLM to the recommendation task, while +the suffix converts the output embeddings of the LLM from the language space to +the recommendation space for the follow-up item recommendation. Furthermore, to +capture the characteristics of different types of users when integrating the +collaborative information via the prefix, we introduce M-Former, a lightweight +MoE-based querying transformer that uses a set of query experts to integrate +diverse user-specific collaborative information encoded by frozen ID-based +sequential recommender systems, significantly improving the accuracy of +recommendations. Extensive experiments on real-world datasets demonstrate that +Laser can parameter-efficiently adapt LLMs into effective recommender systems, +significantly outperforming state-of-the-art methods. + +
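A minimal sketch of the bi-tuning idea described above, assuming a frozen decoder-style backbone: only prefix and suffix virtual-token embeddings are trainable, and they are concatenated around the frozen token embeddings of the input sequence. The class name, sizes, and backbone interface are illustrative assumptions, not the paper's code.

```python
import torch
import torch.nn as nn

class BiTuningWrapper(nn.Module):
    """Trainable prefix/suffix virtual tokens around a frozen backbone."""

    def __init__(self, backbone, hidden_size, n_prefix=16, n_suffix=16):
        super().__init__()
        self.backbone = backbone                      # frozen LLM body
        for p in self.backbone.parameters():
            p.requires_grad = False
        self.prefix = nn.Parameter(torch.randn(n_prefix, hidden_size) * 0.02)
        self.suffix = nn.Parameter(torch.randn(n_suffix, hidden_size) * 0.02)

    def forward(self, token_embeds):                  # (batch, seq, hidden)
        b = token_embeds.size(0)
        pre = self.prefix.unsqueeze(0).expand(b, -1, -1)
        suf = self.suffix.unsqueeze(0).expand(b, -1, -1)
        x = torch.cat([pre, token_embeds, suf], dim=1)
        return self.backbone(x)                       # backbone consumes embeddings
```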
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ Blockchain-based Federated Recommendation with Incentive Mechanism + + +
+ Nowadays, federated recommendation technology is rapidly evolving to help +multiple organisations share data and train models while meeting user privacy, +data security and government regulatory requirements. However, federated +recommendation increases customer system costs such as power, computational and +communication resources. Besides, federated recommendation systems are also +susceptible to model attacks and data poisoning by participating malicious +clients. Therefore, most customers are unwilling to participate in federated +recommendation without any incentive. To address these problems, we propose a +blockchain-based federated recommendation system with an incentive mechanism to +promote a more trustworthy, secure, and efficient federated recommendation +service. First, we construct a federated recommendation system based on NeuMF +and FedAvg. Then we introduce a reverse auction mechanism to select optimal +clients that can maximize the social surplus. Finally, we employ blockchain for +on-chain evidence storage of models to ensure the safety of the federated +recommendation system. The experimental results show that our proposed +incentive mechanism can attract clients with superior training data to engage +in federated recommendation at a lower cost, which can increase the economic +benefit of federated recommendation by 54.9\% while improving recommendation +performance. Thus our work provides theoretical and technological support for +the construction of a harmonious and healthy ecological environment for the +application of federated recommendation. + +
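For context, the FedAvg aggregation that the described system builds on can be written in a few lines; this is a generic sketch of sample-size-weighted parameter averaging, not the authors' code.

```python
import numpy as np

def fedavg(client_weights, client_sizes):
    """Weighted average of client model parameters (FedAvg aggregation step).

    client_weights: list of dicts {param_name: np.ndarray}, one per client.
    client_sizes:   number of local training samples per client.
    """
    total = float(sum(client_sizes))
    global_weights = {}
    for name in client_weights[0]:
        global_weights[name] = sum(
            (n / total) * cw[name] for cw, n in zip(client_weights, client_sizes)
        )
    return global_weights

clients = [{"w": np.array([1.0, 2.0])}, {"w": np.array([3.0, 4.0])}]
print(fedavg(clients, client_sizes=[100, 300]))  # {'w': array([2.5, 3.5])}
```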
+
+ comment: This paper has been accepted on 2024 Blockchain and Web3 Technology + Innovation and Application Exchange Conference (BWTAC 2024) +
+
+
+
+
+ + ♻ ☆ rerankers: A Lightweight Python Library to Unify Ranking Methods + + +
+ This paper presents rerankers, a Python library which provides an easy-to-use +interface to the most commonly used re-ranking approaches. Re-ranking is an +integral component of many retrieval pipelines; however, there exist numerous +approaches to it, relying on different implementation methods. rerankers +unifies these methods into a single user-friendly interface, allowing +practitioners and researchers alike to explore different methods while only +changing a single line of Python code. Moreover, rerankers ensures that its +implementations are done with the fewest dependencies possible, and re-uses the +original implementation whenever possible, guaranteeing that our simplified +interface results in no performance degradation compared to more complex ones. +The full source code and list of supported models are updated regularly and +available at https://github.com/answerdotai/rerankers. + +
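As an illustration of the "single line of Python" claim, the usage pattern advertised in the project README looks roughly like the following; treat the exact constructor argument and method signatures as assumptions and check the linked repository for the current API.

```python
# Hedged sketch of the rerankers interface; verify against the repository README.
from rerankers import Reranker

ranker = Reranker("cross-encoder")  # selects a default cross-encoder model
results = ranker.rank(
    query="What is document re-ranking?",
    docs=["Re-ranking reorders retrieved documents.", "Bananas are yellow."],
)
# Swapping the approach is intended to be a one-line change, e.g.:
# ranker = Reranker("colbert")
```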
+
+
+
+
+ + ♻ ☆ Impedance vs. Power Side-channel Vulnerabilities: A Comparative Study + + +
+ In recent times, impedance side-channel analysis has emerged as a potent +strategy for adversaries seeking to extract sensitive information from +computing systems. It leverages variations in the intrinsic impedance of a +chip's internal structure across different logic states. In this study, we +conduct a comparative analysis between the newly explored impedance side +channel and the well-established power side channel. Through experimental +evaluation, we investigate the efficacy of these two side channels in +extracting the cryptographic key from the Advanced Encryption Standard (AES) +and analyze their performance. Our results indicate that impedance analysis +demonstrates a higher potential for cryptographic key extraction compared to +power side-channel analysis. Moreover, we identify scenarios where power +side-channel analysis does not yield satisfactory results, whereas impedance +analysis proves to be more robust and effective. This work not only underscores +the significance of impedance side-channel analysis in enhancing cryptographic +security but also emphasizes the necessity for a deeper understanding of its +mechanisms and implications. + +
+
+
+
+
+
+
+
+ + Machine Learning 51 + +
+
+
+ + ☆ Double Machine Learning at Scale to Predict Causal Impact of Customer + Actions ECML + + +
+ Causal Impact (CI) estimates of customer actions are broadly used across the industry +to inform both short- and long-term investment decisions of various types. In +this paper, we apply the double machine learning (DML) methodology to estimate +the CI values across 100s of customer actions of business interest and 100s of +millions of customers. We operationalize DML through a causal ML library based +on Spark with a flexible, JSON-driven model configuration approach to estimate +CI at scale (i.e., across hundreds of actions and millions of customers). We +outline the DML methodology and implementation, and associated benefits over +the traditional potential outcomes based CI model. We show population-level as +well as customer-level CI values along with confidence intervals. The +validation metrics show a 2.2% gain over the baseline methods and a 2.5X gain +in computational time. Our contribution is to advance the scalable +application of CI, while also providing an interface that allows faster +experimentation, provides cross-platform support, makes it easy to onboard new use cases, and +improves the accessibility of the underlying code for partner teams. + +
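A compact sketch of the partially linear double machine learning estimator on which such pipelines are built, using generic scikit-learn cross-fitting rather than the authors' Spark library; the nuisance models and data below are illustrative.

```python
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_predict

def dml_effect(X, treatment, outcome, folds=5):
    """Partially linear DML: residual-on-residual regression with cross-fitting."""
    g_hat = cross_val_predict(GradientBoostingRegressor(), X, outcome, cv=folds)
    m_hat = cross_val_predict(GradientBoostingRegressor(), X, treatment, cv=folds)
    y_res, t_res = outcome - g_hat, treatment - m_hat
    return np.sum(t_res * y_res) / np.sum(t_res ** 2)  # effect of the action

rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 5))
treatment = X[:, 0] + rng.normal(size=2000)            # confounded "customer action"
outcome = 2.0 * treatment + X[:, 0] + rng.normal(size=2000)
print(dml_effect(X, treatment, outcome))               # close to the true effect of 2.0
```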
+
+ comment: 16 pages, 11 figures. Accepted at the European Conference on Machine + Learning and Principles and Practice of Knowledge Discovery in Databases + (ECML PKDD) 2023, Turin, Italy +
+
+
+
+
+ + ☆ Generative Principal Component Regression via Variational Inference + + +
+ The ability to manipulate complex systems, such as the brain, to modify +specific outcomes has far-reaching implications, particularly in the treatment +of psychiatric disorders. One approach to designing appropriate manipulations +is to target key features of predictive models. While generative latent +variable models, such as probabilistic principal component analysis (PPCA), are +powerful tools for identifying targets, they struggle to incorporate +information relevant to low-variance outcomes into the latent space. When +stimulation targets are designed on the latent space in such a scenario, the +intervention can be suboptimal with minimal efficacy. To address this problem, +we develop a novel objective based on supervised variational autoencoders +(SVAEs) that ensures such information is represented in the latent space. The +novel objective can be used with linear models, such as PPCA, which we refer to +as generative principal component regression (gPCR). We show in simulations +that gPCR dramatically improves target selection in manipulation as compared to +standard PCR and SVAEs. As part of these simulations, we develop a metric for +detecting when relevant information is not properly incorporated into the +loadings. We then show on two neural datasets related to stress and social +behavior that gPCR dramatically outperforms PCR in predictive performance +and that SVAEs exhibit low incorporation of relevant information into the +loadings. Overall, this work suggests that our method significantly improves +target selection for manipulation using latent variable models over competitor +inference schemes. + +
+
+
+
+
+ + ☆ TimeDiT: General-purpose Diffusion Transformers for Time Series + Foundation Model ICML 2024 + + +
+ With recent advances in building foundation models for texts and video data, +there is a surge of interest in foundation models for time series. A family of +models has been developed, utilizing a temporal auto-regressive generative +Transformer architecture, whose effectiveness has been proven in Large Language +Models. While the empirical results are promising, almost all existing time +series foundation models have only been tested on well-curated ``benchmark'' +datasets very similar to texts. However, real-world time series exhibit unique +challenges, such as variable channel sizes across domains, missing values, and +varying signal sampling intervals due to the multi-resolution nature of +real-world data. Additionally, the uni-directional nature of temporally +auto-regressive decoding limits the incorporation of domain knowledge, such as +physical laws expressed as partial differential equations (PDEs). To address +these challenges, we introduce the Time Diffusion Transformer (TimeDiT), a +general foundation model for time series that employs a denoising diffusion +paradigm instead of temporal auto-regressive generation. TimeDiT leverages the +Transformer architecture to capture temporal dependencies and employs diffusion +processes to generate high-quality candidate samples without imposing stringent +assumptions on the target distribution via novel masking schemes and a channel +alignment strategy. Additionally, we propose a finetuning-free model editing +strategy that allows the seamless integration of external knowledge during the +sampling process without updating any model parameters. Extensive experiments +conducted on a variety of tasks, such as forecasting, imputation, and anomaly +detection, demonstrate the effectiveness of TimeDiT. + +
+
+ comment: 23 Pages, 6 Figures, 11 Tables. First present at ICML 2024 Workshop + on Foundation Models in the Wild +
+
+
+
+
+ + ☆ On the Benefits of Memory for Modeling Time-Dependent PDEs + + +
+ Data-driven techniques have emerged as a promising alternative to traditional +numerical methods for solving partial differential equations (PDEs). These +techniques frequently offer a better trade-off between computational cost and +accuracy for many PDE families of interest. For time-dependent PDEs, existing +methodologies typically treat PDEs as Markovian systems, i.e., the evolution of +the system only depends on the ``current state'', and not the past states. +However, distortion of the input signals -- e.g., due to discretization or +low-pass filtering -- can render the evolution of the distorted signals +non-Markovian. In this work, motivated by the Mori-Zwanzig theory of model +reduction, we investigate the impact of architectures with memory for modeling +PDEs: that is, when past states are explicitly used to predict the future. We +introduce Memory Neural Operator (MemNO), a network based on the recent SSM +architectures and Fourier Neural Operator (FNO). We empirically demonstrate on +a variety of PDE families of interest that when the input is given on a +low-resolution grid, MemNO significantly outperforms the baselines without +memory, achieving more than 6 times less error on unseen PDEs. Via a +combination of theory and experiments, we show that the effect of memory is +particularly significant when the solution of the PDE has high frequency +Fourier components (e.g., low-viscosity fluid dynamics), and it also increases +robustness to observation noise. + +
+
+
+
+
+ + ☆ QID$^2$: An Image-Conditioned Diffusion Model for Q-space Up-sampling of + DWI Data MICCAI 2024 + + +
+ We propose an image-conditioned diffusion model to estimate high angular +resolution diffusion weighted imaging (DWI) from a low angular resolution +acquisition. Our model, which we call QID$^2$, takes as input a set of low +angular resolution DWI data and uses this information to estimate the DWI data +associated with a target gradient direction. We leverage a U-Net architecture +with cross-attention to preserve the positional information of the reference +images, further guiding the target image generation. We train and evaluate +QID$^2$ on single-shell DWI samples curated from the Human Connectome Project +(HCP) dataset. Specifically, we sub-sample the HCP gradient directions to +produce low angular resolution DWI data and train QID$^2$ to reconstruct the +missing high angular resolution samples. We compare QID$^2$ with two +state-of-the-art GAN models. Our results demonstrate that QID$^2$ not only +achieves higher-quality generated images, but it consistently outperforms the +GAN models in downstream tensor estimation across multiple metrics. Taken +together, this study highlights the potential of diffusion models, and QID$^2$ +in particular, for q-space up-sampling, thus offering a promising toolkit for +clinical and research applications. + +
+
+ comment: Accepted at MICCAI 2024 International Workshop on Computational + Diffusion MRI. Zijian Chen and Jueqi Wang contributed equally to this work +
+
+
+
+
+ + ☆ A Lesion-aware Edge-based Graph Neural Network for Predicting Language + Ability in Patients with Post-stroke Aphasia MICCAI 2024 + + +
+ We propose a lesion-aware graph neural network (LEGNet) to predict language +ability from resting-state fMRI (rs-fMRI) connectivity in patients with +post-stroke aphasia. Our model integrates three components: an edge-based +learning module that encodes functional connectivity between brain regions, a +lesion encoding module, and a subgraph learning module that leverages +functional similarities for prediction. We use synthetic data derived from the +Human Connectome Project (HCP) for hyperparameter tuning and model pretraining. +We then evaluate the performance using repeated 10-fold cross-validation on an +in-house neuroimaging dataset of post-stroke aphasia. Our results demonstrate +that LEGNet outperforms baseline deep learning methods in predicting language +ability. LEGNet also exhibits superior generalization ability when tested on a +second in-house dataset that was acquired under a slightly different +neuroimaging protocol. Taken together, the results of this study highlight the +potential of LEGNet in effectively learning the relationships between rs-fMRI +connectivity and language ability in a patient cohort with brain lesions for +improved post-stroke aphasia evaluation. + +
+
+ comment: Accepted at MICCAI 2024 International Workshop on Machine Learning in + Clinical Neuroimaging (MLCN) +
+
+
+
+
+ + ☆ K-Origins: Better Colour Quantification for Neural Networks + + +
+ K-Origins is a neural network layer designed to improve image-based network +performances when learning colour, or intensities, is beneficial. Over 250 +encoder-decoder convolutional networks are trained and tested on 16-bit +synthetic data, demonstrating that K-Origins improves semantic segmentation +accuracy in two scenarios: object detection with low signal-to-noise ratios, +and segmenting multiple objects that are identical in shape but vary in colour. +K-Origins generates output features from the input features, $\textbf{X}$, by +the equation $\textbf{Y}_k = \textbf{X}-\textbf{J}\cdot w_k$ for each trainable +parameter $w_k$, where $\textbf{J}$ is a matrix of ones. Additionally, networks +with varying receptive fields were trained to determine optimal network depths +based on the dimensions of target classes, suggesting that receptive field +lengths should exceed object sizes. By ensuring a sufficient receptive field +length and incorporating K-Origins, we can achieve better semantic network +performance. + +
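Since the layer is fully specified by the equation $\textbf{Y}_k = \textbf{X}-\textbf{J}\cdot w_k$ quoted above, a minimal PyTorch rendering is straightforward; how the shifted outputs are arranged along the channel dimension and how the $w_k$ are initialised are assumptions here, not details from the paper.

```python
import torch
import torch.nn as nn

class KOrigins(nn.Module):
    """Y_k = X - J * w_k for each trainable scalar w_k (J is an all-ones matrix)."""

    def __init__(self, num_origins: int):
        super().__init__()
        self.w = nn.Parameter(torch.linspace(0.0, 1.0, num_origins))

    def forward(self, x):                  # x: (batch, channels, H, W)
        # Subtract each learned origin and stack the results along the channel dim.
        shifted = [x - wk for wk in self.w]
        return torch.cat(shifted, dim=1)   # (batch, channels * num_origins, H, W)

layer = KOrigins(num_origins=4)
out = layer(torch.rand(2, 1, 8, 8))
print(out.shape)  # torch.Size([2, 4, 8, 8])
```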
+
+ comment: 16 pages, 13 figures, 1 table +
+
+
+
+
+ + ☆ Reinforcement Learning-enabled Satellite Constellation Reconfiguration + and Retasking for Mission-Critical Applications + + +
+ The development of satellite constellation applications is rapidly advancing +due to increasing user demands, reduced operational costs, and technological +advancements. However, a significant gap in the existing literature concerns +reconfiguration and retasking issues within satellite constellations, which is +the primary focus of our research. In this work, we critically assess the +impact of satellite failures on constellation performance and the associated +task requirements. To facilitate this analysis, we introduce a system modeling +approach for GPS satellite constellations, enabling an investigation into +performance dynamics and task distribution strategies, particularly in +scenarios where satellite failures occur during mission-critical operations. +Additionally, we introduce reinforcement learning (RL) techniques, specifically +Q-learning, Policy Gradient, Deep Q-Network (DQN), and Proximal Policy +Optimization (PPO), for managing satellite constellations, addressing the +challenges posed by reconfiguration and retasking following satellite failures. +Our results demonstrate that DQN and PPO achieve effective outcomes in terms of +average rewards, task completion rates, and response times. + +
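Of the methods listed, tabular Q-learning is the simplest to sketch; below is a generic update loop one could apply to a small retasking MDP. The environment interface (`reset()`/`step()` returning state, reward, done) is hypothetical and stands in for a constellation simulator.

```python
import numpy as np

def q_learning(env, n_states, n_actions, episodes=500,
               alpha=0.1, gamma=0.95, epsilon=0.1, seed=0):
    """Generic tabular Q-learning; `env` must expose reset() and step(action)."""
    rng = np.random.default_rng(seed)
    Q = np.zeros((n_states, n_actions))
    for _ in range(episodes):
        s, done = env.reset(), False
        while not done:
            # Epsilon-greedy action choice, e.g. which satellite takes a pending task.
            a = int(rng.integers(n_actions)) if rng.random() < epsilon else int(np.argmax(Q[s]))
            s_next, reward, done = env.step(a)
            target = reward + gamma * np.max(Q[s_next]) * (not done)
            Q[s, a] += alpha * (target - Q[s, a])
            s = s_next
    return Q
```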
+
+ comment: Accepted for publication in the IEEE Military Communications + Conference (IEEE MILCOM 2024) +
+
+
+
+
+ + ♻ ☆ PID Accelerated Temporal Difference Algorithms + + +
+ Long-horizon tasks, which have a large discount factor, pose a challenge for +most conventional reinforcement learning (RL) algorithms. Algorithms such as +Value Iteration and Temporal Difference (TD) learning have a slow convergence +rate and become inefficient in these tasks. When the transition distributions +are given, PID VI was recently introduced to accelerate the convergence of +Value Iteration using ideas from control theory. Inspired by this, we introduce +PID TD Learning and PID Q-Learning algorithms for the RL setting, in which only +samples from the environment are available. We give a theoretical analysis of +the convergence of PID TD Learning and its acceleration compared to the +conventional TD Learning. We also introduce a method for adapting PID gains in +the presence of noise and empirically verify its effectiveness. + +
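The flavour of the acceleration can be seen in a schematic tabular sketch that augments the usual TD(0) update with proportional, integral, and derivative terms on the TD error; the gains, accumulator form, and gain adaptation here are illustrative, not the paper's exact rule.

```python
import numpy as np

def pid_td_step(V, V_prev, z, s, r, s_next, gamma,
                alpha=0.1, kp=1.0, ki=0.05, kd=0.2, beta=0.9):
    """One schematic PID-flavoured TD(0) update (illustrative gains)."""
    delta = r + gamma * V[s_next] - V[s]   # TD error: the proportional signal
    z = beta * z + delta                   # leaky integral of past TD errors
    deriv = V[s] - V_prev[s]               # derivative: recent change of V(s)
    V_prev[s] = V[s]
    V[s] = V[s] + alpha * (kp * delta + ki * z) + kd * deriv
    return V, V_prev, z

# Plain TD(0) is recovered with kp=1, ki=0, kd=0.
V, V_prev, z = np.zeros(5), np.zeros(5), 0.0
V, V_prev, z = pid_td_step(V, V_prev, z, s=0, r=1.0, s_next=1, gamma=0.99)
```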
+
+
+
+
+ + ♻ ☆ Improving Rare Word Translation With Dictionaries and Attention Masking + + +
+ In machine translation, rare words continue to be a problem for the dominant +encoder-decoder architecture, especially in low-resource and out-of-domain +translation settings. Human translators solve this problem with monolingual or +bilingual dictionaries. In this paper, we propose appending definitions from a +bilingual dictionary to source sentences and using attention masking to link +together rare words with their definitions. We find that including definitions +for rare words improves performance by up to 1.0 BLEU and 1.6 MacroF1. + +
+
+ comment: 11 pages, 3 figures, 3 tables. Accepted at AMTA 2024 +
+
+
+
+
+ + ♻ ☆ Low-Rank Quantization-Aware Training for LLMs + + +
+ Large language models (LLMs) are omnipresent, however their practical +deployment is challenging due to their ever increasing computational and memory +demands. Quantization is one of the most effective ways to make them more +compute and memory efficient. Quantization-aware training (QAT) methods, +generally produce the best quantized performance, however it comes at the cost +of potentially long training time and excessive memory usage, making it +impractical when applying for LLMs. Inspired by parameter-efficient fine-tuning +(PEFT) and low-rank adaptation (LoRA) literature, we propose LR-QAT -- a +lightweight and memory-efficient QAT algorithm for LLMs. LR-QAT employs several +components to save memory without sacrificing predictive performance: (a) +low-rank auxiliary weights that are aware of the quantization grid; (b) a +downcasting operator using fixed-point or double-packed integers and (c) +checkpointing. Unlike most related work, our method (i) is inference-efficient, +leading to no additional overhead compared to traditional PTQ; (ii) can be seen +as a general extended pretraining framework, meaning that the resulting model +can still be utilized for any downstream task afterwards; (iii) can be applied +across a wide range of quantization settings, such as different choices +quantization granularity, activation quantization, and seamlessly combined with +many PTQ techniques. We apply LR-QAT to LLaMA-1/2/3 and Mistral model families +and validate its effectiveness on several downstream tasks. Our method +outperforms common post-training quantization (PTQ) approaches and reaches the +same model performance as full-model QAT at the fraction of its memory usage. +Specifically, we can train a 7B LLM on a single consumer grade GPU with 24GB of +memory. Our source code is available at +https://github.com/qualcomm-ai-research/LR-QAT + +
+
+
+
+
+ + ♻ ☆ Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of + Peptides + + +
+ Molecular Dynamics (MD) simulations are irreplaceable and ubiquitous in +fields such as materials science, chemistry, and pharmacology, to name just a few. +Conventional MD simulations are plagued by numerical stability and long +equilibration time issues, which limit broader applications of MD simulations. +Recently, a surge of deep learning approaches has been devised for +time-coarsened dynamics, which learn the state transition mechanism over much +larger time scales to overcome these limitations. However, only a few methods +target the underlying Boltzmann distribution by resampling techniques, where +proposals are rarely accepted as new states, resulting in low efficiency. In this work, +we propose a force-guided bridge matching model, FBM, a novel framework that +first incorporates physical priors into bridge matching for full-atom +time-coarsened dynamics. With the guidance of our well-designed intermediate +force field, FBM can target the Boltzmann-like distribution by +direct inference without extra steps. Experiments on small peptides verify our +superiority in terms of comprehensive metrics and demonstrate transferability +to unseen peptide systems. + +
+
+
+
+
+ + ♻ ☆ Verifiable cloud-based variational quantum algorithms + + +
+ Variational quantum algorithms (VQAs) have shown potential for quantum +advantage with noisy intermediate-scale quantum (NISQ) devices for quantum +machine learning (QML). However, given the high cost and limited availability +of quantum resources, delegating VQAs via cloud networks is a more practical +solution for clients with limited quantum capabilities. Recently, Shingu et +al.[Physical Review A, 105, 022603 (2022)] proposed a variational secure cloud +quantum computing protocol, utilizing ancilla-driven quantum computation (ADQC) +for cloud-based VQAs with minimal quantum resource consumption. However, their +protocol lacks verifiability, which exposes it to potential malicious behaviors +by the server. Additionally, channel loss requires frequent re-delegation as +the size of the delegated variational circuit grows, complicating verification +due to increased circuit complexity. This paper introduces a new protocol to +address these challenges and enhance both verifiability and tolerance to +channel loss in cloud-based VQAs. + +
+
+
+
+
+ + ♻ ☆ Bayesian Learning in a Nonlinear Multiscale State-Space Model + + +
+ The ubiquity of multiscale interactions in complex systems is +well-recognized, with development and heredity serving as a prime example of +how processes at different temporal scales influence one another. This work +introduces a novel multiscale state-space model to explore the dynamic +interplay between systems interacting across different time scales, with +feedback between each scale. We propose a Bayesian learning framework to +estimate unknown states by learning the unknown process noise covariances +within this multiscale model. We develop a Particle Gibbs with Ancestor +Sampling (PGAS) algorithm for inference and demonstrate through simulations the +efficacy of our approach. + +
+
+ comment: Corrected a typo +
+
+
+
+
+ + ♻ ☆ Foundation Models for Music: A Survey + + +
+ In recent years, foundation models (FMs) such as large language models (LLMs) +and latent diffusion models (LDMs) have profoundly impacted diverse sectors, +including music. This comprehensive review examines state-of-the-art (SOTA) +pre-trained models and foundation models in music, spanning from representation +learning, generative learning and multimodal learning. We first contextualise +the significance of music in various industries and trace the evolution of AI +in music. By delineating the modalities targeted by foundation models, we +discover many of the music representations are underexplored in FM development. +Then, emphasis is placed on the lack of versatility of previous methods on +diverse music applications, along with the potential of FMs in music +understanding, generation and medical application. By comprehensively exploring +the details of the model pre-training paradigm, architectural choices, +tokenisation, finetuning methodologies and controllability, we emphasise the +important topics that should have been well explored, like instruction tuning +and in-context learning, scaling law and emergent ability, as well as +long-sequence modelling etc. A dedicated section presents insights into music +agents, accompanied by a thorough analysis of datasets and evaluations +essential for pre-training and downstream tasks. Finally, by underscoring the +vital importance of ethical considerations, we advocate that following research +on FM for music should focus more on such issues as interpretability, +transparency, human responsibility, and copyright issues. The paper offers +insights into future challenges and trends on FMs for music, aiming to shape +the trajectory of human-AI collaboration in the music realm. + +
+
+
+
+
+ + ♻ ☆ Different Victims, Same Layout: Email Visual Similarity Detection for + Enhanced Email Protection CCS 2024 + + +
+ In the pursuit of an effective spam detection system, the focus has often +been on identifying known spam patterns either through rule-based detection +systems or machine learning (ML) solutions that rely on keywords. However, both +systems are susceptible to evasion techniques and zero-day attacks that can be +achieved at low cost. Therefore, an email that bypassed the defense system once +can do it again in the following days, even though rules are updated or the ML +models are retrained. The recurrence of failures to detect emails that exhibit +layout similarities to previously undetected spam is concerning for customers +and can erode their trust in a company. Our observations show that threat +actors reuse email kits extensively and can bypass detection with little +effort, for example, by making changes to the content of emails. In this work, +we propose an email visual similarity detection approach, named Pisco, to +improve the detection capabilities of an email threat defense system. We apply +our proof of concept to some real-world samples received from different +sources. Our results show that email kits are being reused extensively and +visually similar emails are sent to our customers at various time intervals. +Therefore, this method could be very helpful in situations where detection +features that rely on textual features and keywords are bypassed, an occurrence +our observations show happens frequently. + +
+
+ comment: To be published in the proceedings of the ACM Conference on Computer + and Communications Security (ACM CCS 2024) +
+
+
+
+
+ + ♻ ☆ On the Convergence of Gradient Descent for Large Learning Rates + + +
+ A vast literature on convergence guarantees for gradient descent and derived +methods exists at the moment. However, a simple practical situation remains +unexplored: when a fixed step size is used, can we expect gradient descent to +converge starting from any initialization? We provide fundamental impossibility +results showing that convergence becomes impossible no matter the +initialization if the step size gets too big. Looking at the asymptotic value +of the gradient norm along the optimization trajectory, we see that there is a +phase transition as the step size crosses a critical value. This has been +observed by practitioners, yet the true mechanisms through which this happens +remain unclear beyond heuristics. Using results from dynamical systems theory, +we provide a proof of this in the case of linear neural networks with a squared +loss. We also prove the impossibility of convergence for more general losses +without requiring strong assumptions such as Lipschitz continuity for the +gradient. We validate our findings through experiments with non-linear +networks. + +
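The phase transition in the step size can be illustrated with the simplest possible example: for a quadratic loss $f(x) = \tfrac{L}{2}x^2$, gradient descent contracts whenever the step size is below $2/L$ and diverges above it, regardless of the initialization. The snippet below is a toy demonstration, not the paper's experimental setup.

```python
def gradient_descent_on_quadratic(lr, L=1.0, x0=1.0, steps=50):
    """Iterate x <- x - lr * L * x for f(x) = (L/2) x^2."""
    x = x0
    for _ in range(steps):
        x = x - lr * L * x
    return x

print(gradient_descent_on_quadratic(lr=1.9))  # converges toward 0: |1 - lr*L| < 1
print(gradient_descent_on_quadratic(lr=2.1))  # blows up:            |1 - lr*L| > 1
```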
+
+
+
+
+ + ♻ ☆ On the Federated Learning Framework for Cooperative Perception + + +
+ Cooperative perception (CP) is essential to enhance the efficiency and safety of +future transportation systems, requiring extensive data sharing among vehicles +on the road, which raises significant privacy concerns. Federated learning +offers a promising solution by enabling data privacy-preserving collaborative +enhancements in perception, decision-making, and planning among connected and +autonomous vehicles (CAVs). However, federated learning is impeded by +significant challenges arising from data heterogeneity across diverse clients, +potentially diminishing model accuracy and prolonging convergence periods. This +study introduces a specialized federated learning framework for CP, termed the +federated dynamic weighted aggregation (FedDWA) algorithm, facilitated by a +dynamic adjusting loss (DALoss) function. This framework employs dynamic client +weighting to direct model convergence and integrates a novel loss function that +utilizes Kullback-Leibler divergence (KLD) to counteract the detrimental +effects of non-independently and identically distributed (Non-IID) and +unbalanced data. Utilizing the BEV transformer as the primary model, our +rigorous testing on the OpenV2V dataset, augmented with FedBEVT data, +demonstrates significant improvements in the average intersection over union +(IoU). These results highlight the substantial potential of our federated +learning framework to address data heterogeneity challenges in CP, thereby +enhancing the accuracy of environmental perception models and facilitating more +robust and efficient collaborative learning solutions in the transportation +sector. + +
+
+ comment: accepted by IEEE RA-L +
+
+
+
+
+ + ♻ ☆ An embedding-based distance for temporal graphs + + +
+ Temporal graphs are commonly used to represent time-resolved relations +between entities in many natural and artificial systems. Many techniques were +devised to investigate the evolution of temporal graphs by comparing their +state at different time points. However, quantifying the similarity between +temporal graphs as a whole is an open problem. Here, we use embeddings based on +time-respecting random walks to introduce a new notion of distance between +temporal graphs. This distance is well-defined for pairs of temporal graphs +with different numbers of nodes and different time spans. We study the case of +a matched pair of graphs, when a known relation exists between their nodes, and +the case of unmatched graphs, when such a relation is unavailable and the +graphs may be of different sizes. We use empirical and synthetic temporal +network data to show that the distance we introduce discriminates graphs with +different topological and temporal properties. We provide an efficient +implementation of the distance computation suitable for large-scale temporal +graphs. + +
+
+
+
+
+ + ♻ ☆ Heterogeneity-Informed Meta-Parameter Learning for Spatiotemporal Time + Series Forecasting KDD'24 + + +
+ Spatiotemporal time series forecasting plays a key role in a wide range of +real-world applications. While significant progress has been made in this area, +fully capturing and leveraging spatiotemporal heterogeneity remains a +fundamental challenge. Therefore, we propose a novel Heterogeneity-Informed +Meta-Parameter Learning scheme. Specifically, our approach implicitly captures +spatiotemporal heterogeneity through learning spatial and temporal embeddings, +which can be viewed as a clustering process. Then, a novel spatiotemporal +meta-parameter learning paradigm is proposed to learn spatiotemporal-specific +parameters from meta-parameter pools, which is informed by the captured +heterogeneity. Based on these ideas, we develop a Heterogeneity-Informed +Spatiotemporal Meta-Network (HimNet) for spatiotemporal time series +forecasting. Extensive experiments on five widely-used benchmarks demonstrate +our method achieves state-of-the-art performance while exhibiting superior +interpretability. Our code is available at +https://github.com/XDZhelheim/HimNet. + +
+
+ comment: Published in KDD'24 Research Track +
+
+
+
+
+ + ♻ ☆ FairX: A comprehensive benchmarking tool for model analysis using + fairness, utility, and explainability + + +
+ We present FairX, an open-source Python-based benchmarking tool designed for +the comprehensive analysis of models under the umbrella of fairness, utility, +and eXplainability (XAI). FairX enables users to train benchmark +bias-mitigation models, evaluate their fairness using a wide array of +fairness and data utility metrics, and generate explanations for model +predictions, all within a unified framework. Existing benchmarking tools provide +no way to evaluate synthetic data generated by fair generative models, nor do +they support training fair generative models. +In FairX, we add fair generative models to the collection of our fair-model +library (pre-processing, in-processing, post-processing) along with metrics +for evaluating the quality of synthetic fair data. This version of FairX +supports both tabular and image datasets. It also allows users to provide their +own custom datasets. The open-source FairX benchmarking package is publicly +available at \url{https://github.com/fahim-sikder/FairX}. + +
+
+
+
+
+ + ♻ ☆ Behavioral Learning of Dish Rinsing and Scrubbing based on Interruptive + Direct Teaching Considering Assistance Rate + + +
+ Robots are expected to manipulate objects in a safe and dexterous way. For +example, washing dishes is a dexterous operation that involves scrubbing the +dishes with a sponge and rinsing them with water. It is necessary to learn it +safely without splashing water and without dropping the dishes. In this study, +we propose a safe and dexterous manipulation system. The robot learns a +dynamics model of the object by estimating the state of the object and the +robot itself, the control input, and the amount of human assistance required +(assistance rate) after the human corrects the initial trajectory of the +robot's hands by interruptive direct teaching. By backpropagating the error +between the estimated and the reference value using the acquired dynamics +model, the robot can generate a control input that approaches the reference +value, for example, so that human assistance is not required and the dish does +not move excessively. This allows for adaptive rinsing and scrubbing of dishes +with unknown shapes and properties. As a result, it is possible to generate +safe actions that require less human assistance. + +
+
+ comment: Accepted at Advanced Robotics +
+
+
+
+
+ + ♻ ☆ Towards Explainable Traffic Flow Prediction with Large Language Models + + +
+ Traffic forecasting is crucial for intelligent transportation systems. It has +experienced significant advancements thanks to the power of deep learning in +capturing latent patterns of traffic data. However, recent deep-learning +architectures require intricate model designs and lack an intuitive +understanding of the mapping from input data to predicted results. Achieving +both accuracy and explainability in traffic prediction models remains a +challenge due to the complexity of traffic data and the inherent opacity of +deep learning models. To tackle these challenges, we propose a Traffic flow +Prediction model based on Large Language Models (LLMs) to generate explainable +traffic predictions, named xTP-LLM. By transferring multi-modal traffic data +into natural language descriptions, xTP-LLM captures complex time-series +patterns and external factors from comprehensive traffic data. The LLM +framework is fine-tuned using language-based instructions to align with +spatial-temporal traffic flow data. Empirically, xTP-LLM shows competitive +accuracy compared with deep learning baselines, while providing an intuitive +and reliable explanation for predictions. This paper contributes to advancing +explainable traffic prediction models and lays a foundation for future +exploration of LLM applications in transportation. To the best of our +knowledge, this is the first study to use LLM for explainable prediction of +traffic flows. + +
+
+ comment: 31pages, 16 figures +
+
+
+
+
+ + ♻ ☆ OceanGPT: A Large Language Model for Ocean Science Tasks ACL2024 + + +
+ Ocean science, which delves into the oceans that are reservoirs of life and +biodiversity, is of great significance given that oceans cover over 70% of our +planet's surface. Recently, advances in Large Language Models (LLMs) have +transformed the paradigm in science. Despite the success in other domains, +current LLMs often fall short in catering to the needs of domain experts like +oceanographers, and the potential of LLMs for ocean science is under-explored. +The intrinsic reasons are the immense and intricate nature of ocean data as +well as the necessity for higher granularity and richness in knowledge. To +alleviate these issues, we introduce OceanGPT, the first-ever large language +model in the ocean domain, which is an expert in various ocean science tasks. We +also propose OceanGPT, a novel framework to automatically obtain a large volume +of ocean domain instruction data, which generates instructions based on +multi-agent collaboration. Additionally, we construct the first oceanography +benchmark, OceanBench, to evaluate the capabilities of LLMs in the ocean +domain. Through comprehensive experiments, OceanGPT not only shows a higher +level of knowledge expertise for ocean science tasks but also gains +preliminary embodied intelligence capabilities in ocean technology. + +
+
+ comment: ACL2024. Project Website: http://oceangpt.zjukg.cn/ +
+
+
+
+
+ + ♻ ☆ Statistical Context Detection for Deep Lifelong Reinforcement Learning + + +
+ Context detection involves labeling segments of an online stream of data as +belonging to different tasks. Task labels are used in lifelong learning +algorithms to perform consolidation or other procedures that prevent +catastrophic forgetting. Inferring task labels from online experiences remains +a challenging problem. Most approaches assume finite and low-dimension +observation spaces or a preliminary training phase during which task labels are +learned. Moreover, changes in the transition or reward functions can be +detected only in combination with a policy, and therefore are more difficult to +detect than changes in the input distribution. This paper presents an approach +to learning both policies and labels in an online deep reinforcement learning +setting. The key idea is to use distance metrics, obtained via optimal +transport methods, i.e., Wasserstein distance, on suitable latent action-reward +spaces to measure distances between sets of data points from past and current +streams. Such distances can then be used for statistical tests based on an +adapted Kolmogorov-Smirnov calculation to assign labels to sequences of +experiences. A rollback procedure is introduced to learn multiple policies by +ensuring that only the appropriate data is used to train the corresponding +policy. The combination of task detection and policy deployment allows for the +optimization of lifelong reinforcement learning agents without an oracle that +provides task labels. The approach is tested using two benchmarks and the +results show promising performance when compared with related context detection +algorithms. The results suggest that optimal transport statistical methods +provide an explainable and justifiable procedure for online context detection +and reward optimization in lifelong reinforcement learning. + +
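Both statistical ingredients of the detection step described above, a Wasserstein distance between latent samples and a two-sample Kolmogorov-Smirnov test, are available off the shelf in SciPy; this generic snippet illustrates the machinery on synthetic 1-D embeddings, not the paper's exact pipeline.

```python
import numpy as np
from scipy.stats import wasserstein_distance, ks_2samp

rng = np.random.default_rng(0)
past_latents = rng.normal(loc=0.0, scale=1.0, size=500)     # e.g. action-reward embeddings, task A
current_latents = rng.normal(loc=0.7, scale=1.0, size=500)  # stream after a task change

print(wasserstein_distance(past_latents, current_latents))  # distance between the two samples
stat, p_value = ks_2samp(past_latents, current_latents)
if p_value < 0.01:
    print("distribution shift detected -> assign a new task label")
```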
+
+ comment: 10 pages excluding references and bibliography. Accepted at CoLLAs + 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Cell Tracking with a Time-Symmetric Deep Learning Approach + + +
+ The accurate tracking of live cells using video microscopy recordings remains +a challenging task for popular state-of-the-art image processing based object +tracking methods. In recent years, several existing and new applications have +attempted to integrate deep-learning based frameworks for this task, but most +of them still heavily rely on consecutive frame based tracking embedded in +their architecture or other premises that hinder generalized learning. To +address this issue, we aimed to develop a new deep-learning based tracking +method that relies solely on the assumption that cells can be tracked based on +their spatio-temporal neighborhood, without restricting it to consecutive +frames. The proposed method has the additional benefit that the motion patterns +of the cells can be learned completely by the predictor without any prior +assumptions, and it has the potential to handle a large number of video frames +with heavy artifacts. The efficacy of the proposed method is demonstrated +through biologically motivated validation strategies and compared against +multiple state-of-the-art cell tracking methods. + +
+
+
+
+
+ + ♻ ☆ Controllable Edge-Type-Specific Interpretation in Multi-Relational Graph + Neural Networks for Drug Response Prediction + + +
+ Graph Neural Networks have been widely applied in critical decision-making +areas that demand interpretable predictions, leading to the flourishing +development of interpretability algorithms. However, current graph +interpretability algorithms tend to emphasize generality and often overlook +biological significance, thereby limiting their applicability in predicting +cancer drug responses. In this paper, we propose a novel post-hoc +interpretability algorithm for cancer drug response prediction, CETExplainer, +which incorporates a controllable edge-type-specific weighting mechanism. It +considers the mutual information between subgraphs and predictions, proposing a +structural scoring approach to provide fine-grained, biologically meaningful +explanations for predictive models. We also introduce a method for constructing +ground truth based on real-world datasets to quantitatively evaluate the +proposed interpretability algorithm. Empirical analysis on the real-world +dataset demonstrates that CETExplainer achieves superior stability and improves +explanation quality compared to leading algorithms, thereby offering a robust +and insightful tool for cancer drug prediction. + +
+
+
+
+
+ + ♻ ☆ GANs Conditioning Methods: A Survey + + +
+ In recent years, Generative Adversarial Networks (GANs) have seen significant +advancements, leading to their widespread adoption across various fields. The +original GAN architecture enables the generation of images without any specific +control over the content, making it an unconditional generation process. +However, many practical applications require precise control over the generated +output, which has led to the development of conditional GANs (cGANs) that +incorporate explicit conditioning to guide the generation process. cGANs extend +the original framework by incorporating additional information (conditions), +enabling the generation of samples that adhere to that specific criteria. +Various conditioning methods have been proposed, each differing in how they +integrate the conditioning information into both the generator and the +discriminator networks. In this work, we review the conditioning methods +proposed for GANs, exploring the characteristics of each method and +highlighting their unique mechanisms and theoretical foundations. Furthermore, +we conduct a comparative analysis of these methods, evaluating their +performance on various image datasets. Through these analyses, we aim to +provide insights into the strengths and limitations of various conditioning +techniques, guiding future research and application in generative modeling. + +
+
+
+
+
+ + ♻ ☆ A Survey on Stability of Learning with Limited Labelled Data and its + Sensitivity to the Effects of Randomness + + +
+ Learning with limited labelled data, such as prompting, in-context learning, +fine-tuning, meta-learning or few-shot learning, aims to effectively train a +model using only a small amount of labelled samples. However, these approaches +have been observed to be excessively sensitive to the effects of uncontrolled +randomness caused by non-determinism in the training process. The randomness +negatively affects the stability of the models, leading to large variances in +results across training runs. When such sensitivity is disregarded, it can +unintentionally, but unfortunately also intentionally, create an imaginary +perception of research progress. Recently, this area started to attract +research attention and the number of relevant studies is continuously growing. +In this survey, we provide a comprehensive overview of 415 papers addressing +the effects of randomness on the stability of learning with limited labelled +data. We distinguish between four main tasks addressed in the papers +(investigate/evaluate; determine; mitigate; benchmark/compare/report randomness +effects), providing findings for each one. Furthermore, we identify and discuss +seven challenges and open problems together with possible directions to +facilitate further research. The ultimate goal of this survey is to emphasise +the importance of this growing research area, which so far has not received an +appropriate level of attention, and reveal impactful directions for future +research. + +
+
+ comment: Accepted to ACM Comput. Surv. 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Heterogeneous Graph Learning via Random Projection + + +
+ Heterogeneous Graph Neural Networks (HGNNs) are powerful tools for deep +learning on heterogeneous graphs. Typical HGNNs require repetitive message +passing during training, limiting efficiency for large-scale real-world graphs. +Recent pre-computation-based HGNNs use one-time message passing to transform a +heterogeneous graph into regular-shaped tensors, enabling efficient mini-batch +training. Existing pre-computation-based HGNNs can be mainly categorized into +two styles, which differ in how they trade off information loss against +efficiency. We propose a hybrid pre-computation-based HGNN, named Random +Projection Heterogeneous Graph Neural Network (RpHGNN), which combines the +benefits of one style's efficiency with the low information loss of the other +style. To achieve efficiency, the main framework of RpHGNN consists of +propagate-then-update iterations, where we introduce a Random Projection +Squashing step to ensure that complexity increases only linearly. To achieve +low information loss, we introduce a Relation-wise Neighbor Collection +component with an Even-odd Propagation Scheme, which aims to collect +information from neighbors in a finer-grained way. Experimental results +indicate that our approach achieves state-of-the-art results on seven small and +large benchmark datasets while also being 230% faster than the most +effective baseline. Surprisingly, our approach not only surpasses +pre-processing-based baselines but also outperforms end-to-end methods. + +
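The "random projection" part of the squashing step can be pictured with a plain Gaussian random projection that compresses concatenated neighbor features to a fixed width, so memory does not grow with the number of relations; this is a generic illustration, not the exact RpHGNN operator.

```python
import numpy as np

def random_project(features, out_dim, seed=0):
    """Compress (n_nodes, in_dim) features to (n_nodes, out_dim) via a Gaussian projection."""
    rng = np.random.default_rng(seed)
    in_dim = features.shape[1]
    projection = rng.normal(scale=1.0 / np.sqrt(out_dim), size=(in_dim, out_dim))
    return features @ projection

# Concatenated neighbor features from several relations stay a fixed size after squashing.
neighbor_feats = np.random.rand(1000, 4 * 256)              # 4 relations x 256-dim features
print(random_project(neighbor_feats, out_dim=256).shape)    # (1000, 256)
```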
+
+ comment: Accepted by IEEE Transactions on Knowledge and Data Engineering + (TKDE) +
+
+
+
+
+ + ♻ ☆ KTO: Model Alignment as Prospect Theoretic Optimization ICML 2024 + + +
+ Kahneman & Tversky's $\textit{prospect theory}$ tells us that humans perceive +random variables in a biased but well-defined manner (1992); for example, +humans are famously loss-averse. We show that objectives for aligning LLMs with +human feedback implicitly incorporate many of these biases -- the success of +these objectives (e.g., DPO) over cross-entropy minimization can partly be +ascribed to them belonging to a family of loss functions that we call +$\textit{human-aware losses}$ (HALOs). However, the utility functions these +methods attribute to humans still differ from those in the prospect theory +literature. Using a Kahneman-Tversky model of human utility, we propose a HALO +that directly maximizes the utility of generations instead of maximizing the +log-likelihood of preferences, as current methods do. We call this approach +KTO, and it matches or exceeds the performance of preference-based methods at +scales from 1B to 30B, despite only learning from a binary signal of whether an +output is desirable. More broadly, our work suggests that there is no one HALO +that is universally superior; the best loss depends on the inductive biases +most appropriate for a given setting, an oft-overlooked consideration. + +
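+
+ A simplified, hedged sketch of a Kahneman-Tversky-style objective of the kind
+the abstract describes: an implicit reward from policy vs. reference
+log-probabilities, a reference point, and asymmetric weights for desirable and
+undesirable outputs. Variable names and defaults are assumptions; this is not
+the authors' released KTO implementation.
+
+import torch
+
+def kto_style_loss(policy_logps, ref_logps, desirable, z_ref,
+                   beta=0.1, lambda_d=1.0, lambda_u=1.0):
+    # policy_logps, ref_logps: (B,) log-probs of each output under the policy
+    # and a frozen reference model; desirable: (B,) bool labels ("good"/"bad");
+    # z_ref: scalar reference point (the paper derives it from a KL estimate).
+    reward = beta * (policy_logps - ref_logps)             # implicit reward
+    value_good = lambda_d * torch.sigmoid(reward - z_ref)
+    value_bad = lambda_u * torch.sigmoid(z_ref - reward)
+    value = torch.where(desirable, value_good, value_bad)
+    return (1.0 - value).mean()   # minimizing this maximizes average utility
+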
+
+ comment: ICML 2024 +
+
+
+
+
+ + ♻ ☆ End-to-end Feature Selection Approach for Learning Skinny Trees AISTATS 2024 + + +
+ We propose a new optimization-based approach for feature selection in tree
+ensembles, an important problem in statistics and machine learning. Popular
+tree ensemble toolkits, e.g., Gradient Boosted Trees and Random Forests,
+support feature selection post-training based on feature importance scores;
+while very popular, such scores are known to have drawbacks. We propose Skinny
+Trees: an end-to-end toolkit for feature selection in tree ensembles where we
+train a tree ensemble while controlling the number of selected features. Our
+optimization-based approach learns an ensemble of differentiable trees, and
+simultaneously performs feature selection using a grouped $\ell_0$-regularizer.
+We use first-order methods for optimization and present convergence guarantees
+for our approach. We use a dense-to-sparse regularization scheduling scheme
+that can lead to more expressive and sparser tree ensembles. On 15 synthetic
+and real-world datasets, Skinny Trees can achieve $1.5\!\times\!
+-~620~\!\times\!$ feature compression rates, leading up to $10\times$ faster
+inference over dense trees, without any loss in performance. Skinny Trees leads
+to better feature selection than many existing toolkits; e.g., in terms of AUC
+performance at a 25\% feature budget, Skinny Trees outperforms LightGBM by
+$10.2\%$ (up to $37.7\%$), and Random Forests by $3\%$ (up to $12.5\%$).
+
+
+ comment: Accepted in AISTATS 2024 +
+
+
+
+
+ + ♻ ☆ Fair Mixed Effects Support Vector Machine + + +
+ To ensure unbiased and ethical automated predictions, fairness must be a core
+principle in machine learning applications. Fairness in machine learning aims
+to mitigate biases present in the training data and model imperfections that
+could lead to discriminatory outcomes. This is achieved by preventing the model
+from making decisions based on sensitive characteristics like ethnicity or
+sexual orientation. A fundamental assumption in machine learning is the
+independence of observations. However, this assumption often does not hold true
+for data describing social phenomena, where data points are often clustered.
+Hence, if machine learning models do not account for the cluster correlations,
+the results may be biased. The bias is especially high when the cluster
+assignment is correlated with the variable of interest. We present a fair mixed
+effects support vector machine algorithm that can handle both problems
+simultaneously. With a reproducible simulation study we demonstrate the impact
+of clustered data on the quality of fair machine learning predictions.
+
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
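+
+ For reference, a small sketch of the lossy coding-rate term that the
+rate-reduction line of work builds on; the paper's sparse rate reduction
+objective additionally includes sparsity and per-subspace terms that are not
+shown here. Shapes and the epsilon default are assumptions.
+
+import torch
+
+def coding_rate(Z, eps=0.5):
+    # Z: (n, d) batch of token representations.
+    # R(Z) = 1/2 * logdet(I + d / (n * eps^2) * Z^T Z), the compression term.
+    n, d = Z.shape
+    gram = Z.T @ Z
+    scaled = torch.eye(d, dtype=Z.dtype, device=Z.device) + (d / (n * eps ** 2)) * gram
+    return 0.5 * torch.logdet(scaled)
+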
+
+ comment: Accepted at Journal of Machine Learning Research. This paper + integrates the works arXiv:2306.01129 and arXiv:2308.16271 into a complete + story. In this paper, we improve the writing and organization, and also add + conceptual, empirical, and theoretical improvements over the previous work. + V2: small typo fixes and formatting improvements. V3: improvements from + journal revisions +
+
+
+
+
+ + ♻ ☆ Contrastive Learning and Abstract Concepts: The Case of Natural Numbers + + +
+ Contrastive Learning (CL) has been successfully applied to classification and
+other downstream tasks related to concrete concepts, such as objects contained
+in the ImageNet dataset. No attempts seem to have been made so far to apply
+this promising scheme to more abstract entities. A prominent example of these
+could be the concept of (discrete) Quantity. CL can frequently be interpreted
+as a self-supervised scheme guided by some profound and ubiquitous conservation
+principle (e.g. conservation of identity in object classification tasks). In
+this introductory work we apply a suitable conservation principle to the
+semi-abstract concept of natural numbers by which discrete quantities can be
+estimated or predicted. We experimentally show, by means of a toy problem, that
+contrastive learning can be trained to count at a glance with high accuracy
+both at human and at super-human ranges. We compare this with a supervised
+learning (SL) neural network scheme of similar architecture trained to count at
+a glance. We show that both schemes exhibit similar good performance on
+baseline experiments, where the distributions of the training and testing
+stages are equal. Importantly, we demonstrate that in some generalization
+scenarios, where training and testing distributions differ, CL boasts more
+robust and much better error performance.
+
+
+
+
+
+ + ♻ ☆ NeMo-Aligner: Scalable Toolkit for Efficient Model Alignment + + +
+ Aligning Large Language Models (LLMs) with human values and preferences is +essential for making them helpful and safe. However, building efficient tools +to perform alignment can be challenging, especially for the largest and most +competent LLMs which often contain tens or hundreds of billions of parameters. +We create NeMo-Aligner, a toolkit for model alignment that can efficiently +scale to a thousand GPUs for training the largest open-source LLMs such as +Nemotron 4 340B and Llama 3.1 405B. NeMo-Aligner comes with highly optimized +and scalable implementations for major paradigms of model alignment such as: +Reinforcement Learning from Human Feedback (RLHF), Direct Preference +Optimization (DPO), SteerLM, and Self-Play Fine-Tuning (SPIN). Additionally, +our toolkit supports running most of the alignment techniques in a Parameter +Efficient Fine-Tuning (PEFT) setting. NeMo-Aligner is designed for +extensibility, allowing support for other alignment techniques with minimal +effort. It is open-sourced with Apache 2.0 License and we invite community +contributions at https://github.com/NVIDIA/NeMo-Aligner + +
+
+ comment: 16 pages, 4 figures, Accepted to COLM 2024 +
+
+
+
+
+ + ♻ ☆ On the Optimality of Misspecified Spectral Algorithms + + +
+ In the misspecified spectral algorithms problem, researchers usually assume
+that the underlying true function $f_{\rho}^{*} \in [\mathcal{H}]^{s}$, a
+less-smooth interpolation space of a reproducing kernel Hilbert space (RKHS)
+$\mathcal{H}$ for some $s\in (0,1)$. The existing minimax optimal results
+require $\|f_{\rho}^{*}\|_{L^{\infty}}<\infty$, which implicitly requires $s >
+\alpha_{0}$ where $\alpha_{0}\in (0,1)$ is the embedding index, a constant
+depending on $\mathcal{H}$. Whether the spectral algorithms are optimal for all
+$s\in (0,1)$ has been an outstanding problem for years. In this paper, we
+show that spectral algorithms are minimax optimal for any
+$\alpha_{0}-\frac{1}{\beta} < s < 1$, where $\beta$ is the eigenvalue decay
+rate of $\mathcal{H}$. We also give several classes of RKHSs whose embedding
+index satisfies $ \alpha_0 = \frac{1}{\beta} $. Thus, the spectral algorithms
+are minimax optimal for all $s\in (0,1)$ on these RKHSs.
+
+
+ comment: 50 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Safety Constrained Multi-Agent Reinforcement Learning for Active Voltage + Control IJCAI2024 + + +
+ Active voltage control presents a promising avenue for relieving power
+congestion and enhancing voltage quality, taking advantage of the distributed
+controllable generators in the power network, such as roof-top photovoltaics.
+While Multi-Agent Reinforcement Learning (MARL) has emerged as a compelling
+approach to address this challenge, existing MARL approaches tend to overlook
+the constrained optimization nature of this problem, failing to guarantee
+safety constraints. In this paper, we formalize the active voltage control
+problem as a constrained Markov game and propose a safety-constrained MARL
+algorithm. We expand the primal-dual optimization RL method to multi-agent
+settings, and augment it with a novel approach of double safety estimation to
+learn the policy and to update the Lagrange multiplier. In addition, we
+propose different cost functions and investigate their influence on the
+behavior of our constrained MARL method. We evaluate our approach in the power
+distribution network simulation environment with real-world scale scenarios.
+Experimental results demonstrate the effectiveness of the proposed method
+compared with state-of-the-art MARL methods. This paper is published at
+\url{https://www.ijcai.org/Proceedings/2024/}.
+
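+
+ A generic sketch of the dual (Lagrange-multiplier) update used in primal-dual
+constrained RL, which the abstract extends to the multi-agent setting; the
+learning rate and cost limit are placeholders, and the paper's double safety
+estimation is not reproduced here.
+
+def dual_update(lagrange_multiplier, estimated_cost, cost_limit, lr=1e-3):
+    # Projected gradient ascent on the multiplier: it grows while the estimated
+    # safety cost exceeds the limit and shrinks otherwise, clipped at zero.
+    return max(0.0, lagrange_multiplier + lr * (estimated_cost - cost_limit))
+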
+
+ comment: Accepted by IJCAI2024 +
+
+
+
+
+ + ♻ ☆ Towards reliable respiratory disease diagnosis based on cough sounds and + vision transformers + + +
+ Recent advancements in deep learning techniques have sparked performance +boosts in various real-world applications including disease diagnosis based on +multi-modal medical data. Cough sound data-based respiratory disease (e.g., +COVID-19 and Chronic Obstructive Pulmonary Disease) diagnosis has also +attracted much attention. However, existing works usually utilise traditional +machine learning or deep models of moderate scales. On the other hand, the +developed approaches are trained and evaluated on small-scale data due to the +difficulty of curating and annotating clinical data on scale. To address these +issues in prior works, we create a unified framework to evaluate various deep +models from lightweight Convolutional Neural Networks (e.g., ResNet18) to +modern vision transformers and compare their performance in respiratory disease +classification. Based on the observations from such an extensive empirical +study, we propose a novel approach to cough-based disease classification based +on both self-supervised and supervised learning on a large-scale cough data +set. Experimental results demonstrate our proposed approach outperforms prior +arts consistently on two benchmark datasets for COVID-19 diagnosis and a +proprietary dataset for COPD/non-COPD classification with an AUROC of 92.5%. + +
+
+
+
+
+ + ♻ ☆ TimeSeriesBench: An Industrial-Grade Benchmark for Time Series Anomaly + Detection Models + + +
+ Time series anomaly detection (TSAD) has gained significant attention due to
+its real-world applications to improve the stability of modern software
+systems. However, there is no effective way to verify whether current
+algorithms can meet the requirements for real-world deployment. Firstly,
+current algorithms typically train a specific model for each time series.
+Maintaining so many models is impractical in a large-scale system with tens of
+thousands of curves. The performance of using merely one unified model to
+detect anomalies remains unknown. Secondly, most TSAD models are trained on the
+historical part of a time series and are tested on its future segment. In
+distributed systems, however, there are frequent system deployments and
+upgrades, with new, previously unseen time series emerging daily. How current
+TSAD algorithms perform on such newly incoming, unseen time series remains
+unknown. Lastly, the assumptions of the evaluation metrics in existing
+benchmarks are far from practical demands. To solve the above-mentioned
+problems, we propose an industrial-grade benchmark, TimeSeriesBench. We assess
+the performance of existing algorithms across more than 168 evaluation settings
+and provide comprehensive analysis for the future design of anomaly detection
+algorithms. An industrial dataset is also released along with TimeSeriesBench.
+
+
+ comment: Accepted by ISSRE'24 +
+
+
+
+
+ + ♻ ☆ Investigating Recurrent Transformers with Dynamic Halt + + +
+ In this paper, we comprehensively study the inductive biases of two major +approaches to augmenting Transformers with a recurrent mechanism: (1) the +approach of incorporating a depth-wise recurrence similar to Universal +Transformers; and (2) the approach of incorporating a chunk-wise temporal +recurrence like Temporal Latent Bottleneck. Furthermore, we propose and +investigate novel ways to extend and combine the above methods - for example, +we propose a global mean-based dynamic halting mechanism for Universal +Transformers and an augmentation of Temporal Latent Bottleneck with elements +from Universal Transformer. We compare the models and probe their inductive +biases in several diagnostic tasks, such as Long Range Arena (LRA), flip-flop +language modeling, ListOps, and Logical Inference. The code is released in: +https://github.com/JRC1995/InvestigatingRecurrentTransformers/tree/main + +
+
+
+
+
+ + ♻ ☆ OccamLLM: Fast and Exact Language Model Arithmetic in a Single Step + + +
+ Despite significant advancements in text generation and reasoning, Large +Language Models (LLMs) still face challenges in accurately performing complex +arithmetic operations. Language model systems often enable LLMs to generate +code for arithmetic operations to achieve accurate calculations. However, this +approach compromises speed and security, and fine-tuning risks the language +model losing prior capabilities. We propose a framework that enables exact +arithmetic in a single autoregressive step, providing faster, more secure, and +more interpretable LLM systems with arithmetic capabilities. We use the hidden +states of a LLM to control a symbolic architecture that performs arithmetic. +Our implementation using Llama 3 with OccamNet as a symbolic model (OccamLlama) +achieves 100\% accuracy on single arithmetic operations +($+,-,\times,\div,\sin{},\cos{},\log{},\exp{},\sqrt{}$), outperforming GPT 4o +with and without a code interpreter. Furthermore, OccamLlama outperforms GPT 4o +with and without a code interpreter on average across a range of mathematical +problem solving benchmarks, demonstrating that OccamLLMs can excel in +arithmetic tasks, even surpassing much larger models. We will make our code +public shortly. + +
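+
+ A hypothetical sketch of the single-step idea described above: a small trained
+head reads an LLM hidden state, selects an arithmetic operation, and the
+operation is then executed exactly outside the decoder. The routing head, the
+operation table, and all names are assumptions; the paper controls an OccamNet
+symbolic model rather than a Python dispatch table.
+
+import math
+import torch
+
+OPS = {0: lambda a, b: a + b, 1: lambda a, b: a - b,
+       2: lambda a, b: a * b, 3: lambda a, b: a / b,
+       4: lambda a, b: math.sqrt(a)}
+
+def exact_arithmetic_step(hidden_state, router, a, b):
+    # router: a small module mapping the hidden state to logits over operations.
+    # The chosen operation is applied exactly instead of being decoded token by
+    # token, so the numeric result is never approximated by the language model.
+    op_id = int(torch.argmax(router(hidden_state)))
+    return OPS[op_id](a, b)
+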
+
+
+
+
+ + ♻ ☆ MPruner: Optimizing Neural Network Size with CKA-Based Mutual + Information Pruning + + +
+ Determining the optimal size of a neural network is critical, as it directly +impacts runtime performance and memory usage. Pruning is a well-established +model compression technique that reduces the size of neural networks while +mathematically guaranteeing accuracy preservation. However, many recent pruning +methods overlook the global contributions of individual model components, +making it difficult to ensure that a pruned model meets the desired dataset and +performance requirements. To address these challenges, we developed a new +pruning algorithm, MPruner, that leverages mutual information through vector +similarity. MPruner utilizes layer clustering with the Centered Kernel +Alignment (CKA) similarity metric, allowing us to incorporate global +information from the neural network for more precise and efficient layer-wise +pruning. We evaluated MPruner across various architectures and configurations, +demonstrating its versatility and providing practical guidelines. MPruner +achieved up to a 50% reduction in parameters and memory usage for CNN and +transformer-based models, with minimal to no loss in accuracy. + +
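+
+ For concreteness, a standard implementation of the linear Centered Kernel
+Alignment (CKA) similarity that the abstract says MPruner clusters layers with;
+the clustering and pruning steps built on top of it are not shown.
+
+import numpy as np
+
+def linear_cka(X, Y):
+    # X: (n, d1) and Y: (n, d2) activations of two layers on the same n inputs.
+    # Returns a similarity in [0, 1]; highly similar layers are candidates for
+    # grouping and pruning.
+    X = X - X.mean(axis=0, keepdims=True)
+    Y = Y - Y.mean(axis=0, keepdims=True)
+    hsic = np.linalg.norm(Y.T @ X, "fro") ** 2
+    return hsic / (np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro"))
+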
+
+
+
+
+ + ♻ ☆ Recursively Feasible Probabilistic Safe Online Learning with Control + Barrier Functions + + +
+ Learning-based control has recently shown great efficacy in performing +complex tasks for various applications. However, to deploy it in real systems, +it is of vital importance to guarantee the system will stay safe. Control +Barrier Functions (CBFs) offer mathematical tools for designing +safety-preserving controllers for systems with known dynamics. In this article, +we first introduce a model-uncertainty-aware reformulation of CBF-based +safety-critical controllers using Gaussian Process (GP) regression to close the +gap between an approximate mathematical model and the real system, which +results in a second-order cone program (SOCP)-based control design. We then +present the pointwise feasibility conditions of the resulting safety +controller, highlighting the level of richness that the available system +information must meet to ensure safety. We use these conditions to devise an +event-triggered online data collection strategy that ensures the recursive +feasibility of the learned safety controller. Our method works by constantly +reasoning about whether the current information is sufficient to ensure safety +or if new measurements under active safe exploration are required to reduce the +uncertainty. As a result, our proposed framework can guarantee the forward +invariance of the safe set defined by the CBF with high probability, even if it +contains a priori unexplored regions. We validate the proposed framework in two +numerical simulation experiments. + +
+
+ comment: Journal article. Includes the results of the 2021 CDC paper titled + "Pointwise feasibility of gaussian process-based safety-critical control + under model uncertainty" and proposes a recursively feasible safe online + learning algorithm as new contribution +
+
+
+
+
+ + ♻ ☆ The Responsible Foundation Model Development Cheatsheet: A Review of + Tools & Resources + + +
+ Foundation model development attracts a rapidly expanding body of +contributors, scientists, and applications. To help shape responsible +development practices, we introduce the Foundation Model Development +Cheatsheet: a growing collection of 250+ tools and resources spanning text, +vision, and speech modalities. We draw on a large body of prior work to survey +resources (e.g. software, documentation, frameworks, guides, and practical +tools) that support informed data selection, processing, and understanding, +precise and limitation-aware artifact documentation, efficient model training, +advance awareness of the environmental impact from training, careful model +evaluation of capabilities, risks, and claims, as well as responsible model +release, licensing and deployment practices. We hope this curated collection of +resources helps guide more responsible development. The process of curating +this list, enabled us to review the AI development ecosystem, revealing what +tools are critically missing, misused, or over-used in existing practices. We +find that (i) tools for data sourcing, model evaluation, and monitoring are +critically under-serving ethical and real-world needs, (ii) evaluations for +model safety, capabilities, and environmental impact all lack reproducibility +and transparency, (iii) text and particularly English-centric analyses continue +to dominate over multilingual and multi-modal analyses, and (iv) evaluation of +systems, rather than just models, is needed so that capabilities and impact are +assessed in context. + +
+
+
+
+
+ + ♻ ☆ Kolmogorov Arnold Networks in Fraud Detection: Bridging the Gap Between + Theory and Practice + + +
+ This study evaluates the applicability of Kolmogorov-Arnold Networks (KAN) in +fraud detection, finding that their effectiveness is context-dependent. We +propose a quick decision rule using Principal Component Analysis (PCA) to +assess the suitability of KAN: if data can be effectively separated in two +dimensions using splines, KAN may outperform traditional models; otherwise, +other methods could be more appropriate. We also introduce a heuristic approach +to hyperparameter tuning, significantly reducing computational costs. These +findings suggest that while KAN has potential, its use should be guided by +data-specific assessments. + +
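+
+ A rough stand-in for the screening rule sketched in the abstract: project the
+data to two PCA dimensions and check whether a simple classifier already
+separates the classes well. The classifier choice and the 0.8 threshold are
+assumptions; the paper's rule is based on spline separability in the 2D
+projection.
+
+import numpy as np
+from sklearn.decomposition import PCA
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import cross_val_score
+
+def pca_separability_screen(X, y, threshold=0.8):
+    # If the 2D PCA projection is already near-separable, a KAN may be worth
+    # trying on this fraud dataset; otherwise other model families may fit better.
+    Z = PCA(n_components=2).fit_transform(X)
+    score = cross_val_score(LogisticRegression(max_iter=1000), Z, y, cv=5).mean()
+    return score >= threshold, score
+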
+
+
+
+
+ + ♻ ☆ Deep-MacroFin: Informed Equilibrium Neural Network for Continuous Time + Economic Models + + +
+ In this paper, we present Deep-MacroFin, a comprehensive framework designed +to solve partial differential equations, with a particular focus on models in +continuous time economics. This framework leverages deep learning +methodologies, including conventional Multi-Layer Perceptrons and the newly +developed Kolmogorov-Arnold Networks. It is optimized using economic +information encapsulated by Hamilton-Jacobi-Bellman equations and coupled +algebraic equations. The application of neural networks holds the promise of +accurately resolving high-dimensional problems with fewer computational demands +and limitations compared to standard numerical methods. This versatile +framework can be readily adapted for elementary differential equations, and +systems of differential equations, even in cases where the solutions may +exhibit discontinuities. Importantly, it offers a more straightforward and +user-friendly implementation than existing libraries. + +
+
+ comment: 25 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ What do we know about Hugging Face? A systematic literature review and + quantitative validation of qualitative claims + + +
+ Background: Collaborative Software Package Registries (SPRs) are an integral
+part of the software supply chain. Much engineering work synthesizes SPR
+packages into applications. Prior research has examined SPRs for traditional
+software, such as NPM (JavaScript) and PyPI (Python). Pre-Trained Model (PTM)
+Registries are an emerging class of SPR of increasing importance, because they
+support the deep learning supply chain.
+ Aims: Recent empirical research has examined PTM registries in ways such as
+vulnerabilities, reuse processes, and evolution. However, no existing research
+synthesizes them to provide a systematic understanding of the current
+knowledge. Some of the existing research includes qualitative claims lacking
+quantitative analysis. Our research fills these gaps by providing a knowledge
+synthesis and quantitative analyses.
+ Methods: We first conduct a systematic literature review (SLR). We then
+observe that some of the claims are qualitative. We identify quantifiable
+metrics associated with those claims, and measure them in order to substantiate
+these claims.
+ Results: From our SLR, we identify 12 claims about PTM reuse on the
+HuggingFace platform, 4 of which lack quantitative validation. We successfully
+test 3 of these claims through a quantitative analysis, and directly compare
+one with traditional software. Our findings corroborate qualitative claims with
+quantitative measurements. Our findings are: (1) PTMs have a much higher
+turnover rate than traditional software, indicating a dynamic and rapidly
+evolving reuse environment within the PTM ecosystem; and (2) there is a strong
+correlation between documentation quality and PTM popularity.
+ Conclusions: We confirm qualitative research claims with concrete metrics,
+supporting prior qualitative and case study research. Our measures show further
+dynamics of PTM reuse, inspiring research infrastructure and new measures.
+
+
+ comment: [ESEM'24] Proceedings of the 18th ACM/IEEE International Symposium on + Empirical Software Engineering and Measurement (ESEM) 2024 +
+
+
+
+
+ + ♻ ☆ Linear Contextual Bandits with Hybrid Payoff: Revisited ECML + + +
+ We study the Linear Contextual Bandit problem in the hybrid reward setting.
+In this setting, every arm's reward model contains arm-specific parameters in
+addition to parameters shared across the reward models of all the arms. We can
+reduce this setting to two closely related settings: (a) Shared - no
+arm-specific parameters, and (b) Disjoint - only arm-specific parameters,
+enabling the application of two popular state-of-the-art algorithms -
+$\texttt{LinUCB}$ and $\texttt{DisLinUCB}$ (Algorithm 1 in (Li et al. 2010)).
+When the arm features are stochastic and satisfy a popular diversity condition,
+we provide new regret analyses for both algorithms, significantly improving on
+the known regret guarantees of these algorithms. Our novel analysis critically
+exploits the hybrid reward structure and the diversity condition. Moreover, we
+introduce a new algorithm $\texttt{HyLinUCB}$ that crucially modifies
+$\texttt{LinUCB}$ (using a new exploration coefficient) to account for sparsity
+in the hybrid setting. Under the same diversity assumptions, we prove that
+$\texttt{HyLinUCB}$ also incurs only $O(\sqrt{T})$ regret for $T$ rounds. We
+perform extensive experiments on synthetic and real-world datasets
+demonstrating strong empirical performance of $\texttt{HyLinUCB}$. When the
+number of arm-specific parameters is much larger than the number of shared
+parameters, we observe that $\texttt{DisLinUCB}$ incurs the lowest regret. In
+this case, the regret of $\texttt{HyLinUCB}$ is the second best and extremely
+competitive with $\texttt{DisLinUCB}$. In all other situations, including our
+real-world dataset, $\texttt{HyLinUCB}$ has significantly lower regret than
+$\texttt{LinUCB}$, $\texttt{DisLinUCB}$ and other SOTA baselines we considered.
+We also empirically observe that the regret of $\texttt{HyLinUCB}$ grows much
+more slowly with the number of arms compared to baselines, making it suitable
+even for very large action spaces.
+
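+
+ For readers unfamiliar with the baselines named above, a standard sketch of a
+single arm in the disjoint LinUCB algorithm (Li et al. 2010): ridge-regression
+statistics plus an upper-confidence bonus. HyLinUCB's shared parameters and
+modified exploration coefficient are not reproduced here.
+
+import numpy as np
+
+class DisjointLinUCBArm:
+    def __init__(self, dim, alpha=1.0):
+        self.A = np.eye(dim)     # regularized design matrix X^T X + I
+        self.b = np.zeros(dim)   # X^T r, accumulated reward-weighted contexts
+        self.alpha = alpha       # exploration coefficient
+
+    def ucb(self, x):
+        # Upper confidence bound for context x: point estimate + exploration bonus.
+        A_inv = np.linalg.inv(self.A)
+        theta = A_inv @ self.b
+        return theta @ x + self.alpha * np.sqrt(x @ A_inv @ x)
+
+    def update(self, x, reward):
+        self.A += np.outer(x, x)
+        self.b += reward * x
+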
+
+ comment: Accepted at ECML PKDD 2024 as a Research Track Paper +
+
+
+
+
+ + ♻ ☆ Persian Slang Text Conversion to Formal and Deep Learning of Persian + Short Texts on Social Media for Sentiment Classification + + +
+ The lack of a suitable tool for the analysis of conversational texts in the
+Persian language has made various analyses of these texts, including Sentiment
+Analysis, difficult. In this research, we try to make these texts easier for
+machines to understand by providing PSC, the Persian Slang Converter, a tool
+for converting conversational texts into formal ones, and by using the most
+up-to-date and best deep learning methods together with PSC to improve
+sentiment learning for short Persian texts. More than 10 million unlabeled
+texts from various social networks and movie subtitles (as conversational
+texts) and about 10 million news texts (as formal texts) have been used for
+training unsupervised models and for the formal implementation of the tool.
+60,000 texts from the comments of Instagram social network users with positive,
+negative, and neutral labels are considered supervised data for training the
+emotion classification model for short texts. Using the formal tool, 57% of the
+words of the conversational corpus were converted. Finally, by using the
+formalizer, a FastText model, and a deep LSTM network, an accuracy of 81.91 was
+obtained on the test data.
+
+
+ comment: 16 pages, 4 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Projected Stochastic Gradient Descent with Quantum Annealed Binary + Gradients + + +
+ We present, QP-SBGD, a novel layer-wise stochastic optimiser tailored towards +training neural networks with binary weights, known as binary neural networks +(BNNs), on quantum hardware. BNNs reduce the computational requirements and +energy consumption of deep learning models with minimal loss in accuracy. +However, training them in practice remains to be an open challenge. Most known +BNN-optimisers either rely on projected updates or binarise weights +post-training. Instead, QP-SBGD approximately maps the gradient onto binary +variables, by solving a quadratic constrained binary optimisation. Under +practically reasonable assumptions, we show that this update rule converges +with a rate of $\mathcal{O}(1 / \sqrt{T})$. Moreover, we show how the +$\mathcal{NP}$-hard projection can be effectively executed on an adiabatic +quantum annealer, harnessing recent advancements in quantum computation. We +also introduce a projected version of this update rule and prove that if a +fixed point exists in the binary variable space, the modified updates will +converge to it. Last but not least, our algorithm is implemented layer-wise, +making it suitable to train larger networks on resource-limited quantum +hardware. Through extensive evaluations, we show that QP-SBGD outperforms or is +on par with competitive and well-established baselines such as BinaryConnect, +signSGD and ProxQuant when optimising the Rosenbrock function, training BNNs as +well as binary graph neural networks. + +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ LSTMSE-Net: Long Short Term Speech Enhancement Network for Audio-visual + Speech Enhancement + + +
+ In this paper, we propose long short term memory speech enhancement network +(LSTMSE-Net), an audio-visual speech enhancement (AVSE) method. This innovative +method leverages the complementary nature of visual and audio information to +boost the quality of speech signals. Visual features are extracted with +VisualFeatNet (VFN), and audio features are processed through an encoder and +decoder. The system scales and concatenates visual and audio features, then +processes them through a separator network for optimized speech enhancement. +The architecture highlights advancements in leveraging multi-modal data and +interpolation techniques for robust AVSE challenge systems. The performance of +LSTMSE-Net surpasses that of the baseline model from the COG-MHEAR AVSE +Challenge 2024 by a margin of 0.06 in scale-invariant signal-to-distortion +ratio (SISDR), $0.03$ in short-time objective intelligibility (STOI), and +$1.32$ in perceptual evaluation of speech quality (PESQ). The source code of +the proposed LSTMSE-Net is available at +\url{https://github.com/mtanveer1/AVSEC-3-Challenge}. + +
+
+
+
+
+ + ☆ Unveiling Deep Shadows: A Survey on Image and Video Shadow Detection, + Removal, and Generation in the Era of Deep Learning + + +
+ Shadows are formed when light encounters obstacles, leading to areas of +diminished illumination. In computer vision, shadow detection, removal, and +generation are crucial for enhancing scene understanding, refining image +quality, ensuring visual consistency in video editing, and improving virtual +environments. This paper presents a comprehensive survey of shadow detection, +removal, and generation in images and videos within the deep learning landscape +over the past decade, covering tasks, deep models, datasets, and evaluation +metrics. Our key contributions include a comprehensive survey of shadow +analysis, standardization of experimental comparisons, exploration of the +relationships among model size, speed, and performance, a cross-dataset +generalization study, identification of open issues and future directions, and +provision of publicly available resources to support further research. + +
+
+ comment: Publicly available results, trained models, and evaluation metrics at + https://github.com/xw-hu/Unveiling-Deep-Shadows +
+
+
+
+
+ + ☆ Towards Real-World Adverse Weather Image Restoration: Enhancing + Clearness and Semantics with Vision-Language Models ECCV 2024 + + +
+ This paper addresses the limitations of adverse weather image restoration +approaches trained on synthetic data when applied to real-world scenarios. We +formulate a semi-supervised learning framework employing vision-language models +to enhance restoration performance across diverse adverse weather conditions in +real-world settings. Our approach involves assessing image clearness and +providing semantics using vision-language models on real data, serving as +supervision signals for training restoration models. For clearness enhancement, +we use real-world data, utilizing a dual-step strategy with pseudo-labels +assessed by vision-language models and weather prompt learning. For semantic +enhancement, we integrate real-world data by adjusting weather conditions in +vision-language model descriptions while preserving semantic meaning. +Additionally, we introduce an effective training strategy to bootstrap +restoration performance. Our approach achieves superior results in real-world +adverse weather image restoration, demonstrated through qualitative and +quantitative comparisons with state-of-the-art works. + +
+
+ comment: Accepted by ECCV 2024 +
+
+
+
+
+ + ☆ Low-Resolution Face Recognition via Adaptable Instance-Relation + Distillation IJCNN 2024 + + +
+ Low-resolution face recognition is a challenging task due to missing
+informative details. Recent approaches based on knowledge distillation have
+proven that high-resolution clues can well guide low-resolution face
+recognition via proper knowledge transfer. However, due to the distribution
+difference between training and testing faces, the learned models often suffer
+from poor adaptability. To address this, we split the knowledge transfer
+process into distillation and adaptation steps, and propose an adaptable
+instance-relation distillation approach to facilitate low-resolution face
+recognition. In our approach, the student distills knowledge from the
+high-resolution teacher at both the instance level and the relation level,
+providing sufficient cross-resolution knowledge transfer. Then, the learned
+student can adapt to recognize low-resolution faces with adaptive batch
+normalization during inference. In this manner, the capability of recovering
+missing details of familiar low-resolution faces can be effectively enhanced,
+leading to better knowledge transfer. Extensive experiments on low-resolution
+face recognition clearly demonstrate the effectiveness and adaptability of our
+approach.
+
+
+ comment: Accepted by IJCNN 2024 +
+
+
+
+
+ + ☆ PRoGS: Progressive Rendering of Gaussian Splats + + +
+ Over the past year, 3D Gaussian Splatting (3DGS) has received significant +attention for its ability to represent 3D scenes in a perceptually accurate +manner. However, it can require a substantial amount of storage since each +splat's individual data must be stored. While compression techniques offer a +potential solution by reducing the memory footprint, they still necessitate +retrieving the entire scene before any part of it can be rendered. In this +work, we introduce a novel approach for progressively rendering such scenes, +aiming to display visible content that closely approximates the final scene as +early as possible without loading the entire scene into memory. This approach +benefits both on-device rendering applications limited by memory constraints +and streaming applications where minimal bandwidth usage is preferred. To +achieve this, we approximate the contribution of each Gaussian to the final +scene and construct an order of prioritization on their inclusion in the +rendering process. Additionally, we demonstrate that our approach can be +combined with existing compression methods to progressively render (and stream) +3DGS scenes, optimizing bandwidth usage by focusing on the most important +splats within a scene. Overall, our work establishes a foundation for making +remotely hosted 3DGS content more quickly accessible to end-users in +over-the-top consumption scenarios, with our results showing significant +improvements in quality across all metrics compared to existing methods. + +
+
+
+
+
+ + ☆ Privacy-Preserving Multimedia Mobile Cloud Computing Using Protective + Perturbation + + +
+ Mobile cloud computing has been adopted in many multimedia applications, +where the resource-constrained mobile device sends multimedia data (e.g., +images) to remote cloud servers to request computation-intensive multimedia +services (e.g., image recognition). While significantly improving the +performance of the mobile applications, the cloud-based mechanism often causes +privacy concerns as the multimedia data and services are offloaded from the +trusted user device to untrusted cloud servers. Several recent studies have +proposed perturbation-based privacy preserving mechanisms, which obfuscate the +offloaded multimedia data to eliminate privacy exposures without affecting the +functionality of the remote multimedia services. However, the existing privacy +protection approaches require the deployment of computation-intensive +perturbation generation on the resource-constrained mobile devices. Also, the +obfuscated images are typically not compliant with the standard image +compression algorithms and suffer from significant bandwidth consumption. In +this paper, we develop a novel privacy-preserving multimedia mobile cloud +computing framework, namely $PMC^2$, to address the resource and bandwidth +challenges. $PMC^2$ employs secure confidential computing in the cloud to +deploy the perturbation generator, which addresses the resource challenge while +maintaining the privacy. Furthermore, we develop a neural compressor +specifically trained to compress the perturbed images in order to address the +bandwidth challenge. We implement $PMC^2$ in an end-to-end mobile cloud +computing system, based on which our evaluations demonstrate superior latency, +power efficiency, and bandwidth consumption achieved by $PMC^2$ while +maintaining high accuracy in the target multimedia service. + +
+
+
+
+
+ + ☆ Think Twice Before Recognizing: Large Multimodal Models for General + Fine-grained Traffic Sign Recognition + + +
+ We propose a new strategy called think twice before recognizing to improve +fine-grained traffic sign recognition (TSR). Fine-grained TSR in the wild is +difficult due to the complex road conditions, and existing approaches +particularly struggle with cross-country TSR when data is lacking. Our strategy +achieves effective fine-grained TSR by stimulating the multiple-thinking +capability of large multimodal models (LMM). We introduce context, +characteristic, and differential descriptions to design multiple thinking +processes for the LMM. The context descriptions with center coordinate prompt +optimization help the LMM to locate the target traffic sign in the original +road images containing multiple traffic signs and filter irrelevant answers +through the proposed prior traffic sign hypothesis. The characteristic +description is based on few-shot in-context learning of template traffic signs, +which decreases the cross-domain difference and enhances the fine-grained +recognition capability of the LMM. The differential descriptions of similar +traffic signs optimize the multimodal thinking capability of the LMM. The +proposed method is independent of training data and requires only simple and +uniform instructions. We conducted extensive experiments on three benchmark +datasets and two real-world datasets from different countries, and the proposed +method achieves state-of-the-art TSR results on all five datasets. + +
+
+
+
+
+ + ♻ ☆ TALDS-Net: Task-Aware Adaptive Local Descriptors Selection for Few-shot + Image Classification ICASSP 2024 + + +
+ Few-shot image classification aims to classify images from unseen novel +classes with few samples. Recent works demonstrate that deep local descriptors +exhibit enhanced representational capabilities compared to image-level +features. However, most existing methods solely rely on either employing all +local descriptors or directly utilizing partial descriptors, potentially +resulting in the loss of crucial information. Moreover, these methods primarily +emphasize the selection of query descriptors while overlooking support +descriptors. In this paper, we propose a novel Task-Aware Adaptive Local +Descriptors Selection Network (TALDS-Net), which exhibits the capacity for +adaptive selection of task-aware support descriptors and query descriptors. +Specifically, we compare the similarity of each local support descriptor with +other local support descriptors to obtain the optimal support descriptor subset +and then compare the query descriptors with the optimal support subset to +obtain discriminative query descriptors. Extensive experiments demonstrate that +our TALDS-Net outperforms state-of-the-art methods on both general and +fine-grained datasets. + +
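+
+ A minimal illustration of similarity-based support-descriptor selection as
+described above: score each local support descriptor by its agreement with the
+rest of the support set and keep the top k. The fixed top-k rule is an
+assumption; TALDS-Net learns this selection adaptively.
+
+import torch
+import torch.nn.functional as F
+
+def select_support_descriptors(support, k):
+    # support: (n, d) local descriptors from the support images.
+    s = F.normalize(support, dim=-1)
+    sim = s @ s.T                 # (n, n) pairwise cosine similarities
+    sim.fill_diagonal_(0.0)       # ignore self-similarity
+    scores = sim.mean(dim=1)      # agreement with the other support descriptors
+    idx = torch.topk(scores, k).indices
+    return support[idx]
+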
+
+ comment: 4 pages, 1 figures, is accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ IDNet: A Novel Dataset for Identity Document Analysis and Fraud + Detection + + +
+ Effective fraud detection and analysis of government-issued identity
+documents, such as passports, driver's licenses, and identity cards, are
+essential in thwarting identity theft and bolstering security on online
+platforms. The training of accurate fraud detection and analysis tools depends
+on the availability of extensive identity document datasets. However, current
+publicly available benchmark datasets for identity document analysis, including
+MIDV-500, MIDV-2020, and FMIDV, fall short in several respects: they offer a
+limited number of samples, cover insufficient varieties of fraud patterns, and
+seldom include alterations in critical personal identifying fields like
+portrait images, limiting their utility in training models capable of detecting
+realistic frauds while preserving privacy.
+ In response to these shortcomings, our research introduces a new benchmark
+dataset, IDNet, designed to advance privacy-preserving fraud detection efforts.
+The IDNet dataset comprises 837,060 images of synthetically generated identity
+documents, totaling approximately 490 gigabytes, categorized into 20 types from
+10 U.S. states and 10 European countries. We evaluate the utility and present
+use cases of the dataset, illustrating how it can aid in training
+privacy-preserving fraud detection methods, facilitate the generation of camera
+and video captures of identity documents, and support testing of schema
+unification and other identity document management functionalities.
+
+
+ comment: 40 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 37 + +
+
+
+ + ☆ DiversityMedQA: Assessing Demographic Biases in Medical Diagnosis using + Large Language Models + + +
+ As large language models (LLMs) gain traction in healthcare, concerns about +their susceptibility to demographic biases are growing. We introduce +{DiversityMedQA}, a novel benchmark designed to assess LLM responses to medical +queries across diverse patient demographics, such as gender and ethnicity. By +perturbing questions from the MedQA dataset, which comprises medical board exam +questions, we created a benchmark that captures the nuanced differences in +medical diagnosis across varying patient profiles. Our findings reveal notable +discrepancies in model performance when tested against these demographic +variations. Furthermore, to ensure the perturbations were accurate, we also +propose a filtering strategy that validates each perturbation. By releasing +DiversityMedQA, we provide a resource for evaluating and mitigating demographic +bias in LLM medical diagnoses. + +
+
+
+
+
+ + ☆ The Compressor-Retriever Architecture for Language Model OS + + +
+ Recent advancements in large language models (LLMs) have significantly +enhanced their capacity to aggregate and process information across multiple +modalities, enabling them to perform a wide range of tasks such as multimodal +data querying, tool usage, web interactions, and handling long documents. These +capabilities pave the way for transforming LLMs from mere chatbots into +general-purpose agents capable of interacting with the real world. This paper +explores the concept of using a language model as the core component of an +operating system (OS), effectively acting as a CPU that processes data stored +in a context window, which functions as RAM. A key challenge in realizing such +an LM OS is managing the life-long context and ensuring statefulness across +sessions, a feature limited by the current session-based interaction paradigm +due to context window size limit. To address this, we introduce +compressor-retriever, a model-agnostic architecture designed for life-long +context management. Unlike other long-context solutions such as +retrieval-augmented generation, our approach exclusively uses the base model's +forward function to compress and retrieve context, ensuring end-to-end +differentiability. Preliminary experiments demonstrate the effectiveness of +this architecture in in-context learning tasks, marking a step towards the +development of a fully stateful LLM OS. Project repo available at: +https://github.com/gblackout/LM-OS + +
+
+
+
+
+ + ☆ Revisiting SMoE Language Models by Evaluating Inefficiencies with Task + Specific Expert Pruning + + +
+ Sparse Mixture of Expert (SMoE) models have emerged as a scalable alternative +to dense models in language modeling. These models use conditionally activated +feedforward subnetworks in transformer blocks, allowing for a separation +between total model parameters and per-example computation. However, large +token-routed SMoE models face a significant challenge: during inference, the +entire model must be used for a sequence or a batch, resulting in high +latencies in a distributed setting that offsets the advantages of per-token +sparse activation. Our research explores task-specific model pruning to inform +decisions about designing SMoE architectures, mainly modulating the choice of +expert counts in pretraining. We investigate whether such pruned models offer +advantages over smaller SMoE models trained from scratch, when evaluating and +comparing them individually on tasks. To that end, we introduce an adaptive +task-aware pruning technique UNCURL to reduce the number of experts per MoE +layer in an offline manner post-training. Our findings reveal a threshold +pruning factor for the reduction that depends on the number of experts used in +pretraining, above which, the reduction starts to degrade model performance. +These insights contribute to our understanding of model design choices when +pretraining with SMoE architectures, particularly useful when considering +task-specific inference optimization for later stages. + +
+
+
+
+
+ + ☆ Masked Mixers for Language Generation and Retrieval + + +
+ Attention mechanisms that confer selective focus on a strict subset of input
+elements are nearly ubiquitous in language models today. We posit that there is
+a downside to the use of attention: most of the information present in the
+input is necessarily lost. In support of this idea, we observe poor input
+representation accuracy in transformers, but find more accurate representation
+in what we term masked mixers, which replace self-attention with masked
+convolutions. Applied to TinyStories, the masked mixer learns causal language
+tasks more efficiently than early transformer implementations and somewhat less
+efficiently than optimized, current implementations. The most efficient
+learning algorithm observed for this dataset is a transformer-masked mixer
+hybrid, suggesting that these models learn in an orthogonal manner. We
+hypothesized that the information loss exhibited by transformers would be much
+more detrimental to retrieval than generation, and to test this we introduce an
+efficient training approach for retrieval models based on existing generative
+model embeddings. With this method, embeddings from masked mixers are found to
+result in far better summary-to-story retrieval compared to embeddings from
+transformers.
+
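+
+ A minimal sketch, assuming PyTorch, of a masked (causal) convolution block of
+the kind the abstract contrasts with self-attention: left-padding guarantees
+that position t only mixes information from positions <= t. The kernel size and
+layer structure are illustrative, not the paper's exact architecture.
+
+import torch
+import torch.nn as nn
+
+class CausalConvMixer(nn.Module):
+    def __init__(self, dim, kernel_size=16):
+        super().__init__()
+        self.pad = kernel_size - 1
+        self.conv = nn.Conv1d(dim, dim, kernel_size)
+
+    def forward(self, x):              # x: (batch, seq_len, dim)
+        h = x.transpose(1, 2)          # -> (batch, dim, seq_len)
+        h = nn.functional.pad(h, (self.pad, 0))   # pad on the left only
+        h = self.conv(h)               # causal token mixing
+        return h.transpose(1, 2)
+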
+
+ comment: 23 pages, 15 figures (11 primary, 4 supplementary) +
+
+
+
+
+ + ☆ PoliPrompt: A High-Performance Cost-Effective LLM-Based Text + Classification Framework for Political Science + + +
+ Recent advancements in large language models (LLMs) have opened new avenues +for enhancing text classification efficiency in political science, surpassing +traditional machine learning methods that often require extensive feature +engineering, human labeling, and task-specific training. However, their +effectiveness in achieving high classification accuracy remains questionable. +This paper introduces a three-stage in-context learning approach that leverages +LLMs to improve classification accuracy while minimizing experimental costs. +Our method incorporates automatic enhanced prompt generation, adaptive exemplar +selection, and a consensus mechanism that resolves discrepancies between two +weaker LLMs, refined by an advanced LLM. We validate our approach using +datasets from the BBC news reports, Kavanaugh Supreme Court confirmation, and +2018 election campaign ads. The results show significant improvements in +classification F1 score (+0.36 for zero-shot classification) with manageable +economic costs (-78% compared with human labeling), demonstrating that our +method effectively addresses the limitations of traditional machine learning +while offering a scalable and reliable solution for text analysis in political +science. + +
+
+ comment: 23 pages, 5 figures +
+
+
+
+
+ + ☆ Efficient and Scalable Estimation of Tool Representations in Vector + Space + + +
+ Recent advancements in function calling and tool use have significantly +enhanced the capabilities of large language models (LLMs) by enabling them to +interact with external information sources and execute complex tasks. However, +the limited context window of LLMs presents challenges when a large number of +tools are available, necessitating efficient methods to manage prompt length +and maintain accuracy. Existing approaches, such as fine-tuning LLMs or +leveraging their reasoning capabilities, either require frequent retraining or +incur significant latency overhead. A more efficient solution involves training +smaller models to retrieve the most relevant tools for a given query, although +this requires high quality, domain-specific data. To address those challenges, +we present a novel framework for generating synthetic data for tool retrieval +applications and an efficient data-driven tool retrieval strategy using small +encoder models. Empowered by LLMs, we create ToolBank, a new tool retrieval +dataset that reflects real human user usages. For tool retrieval methodologies, +we propose novel approaches: (1) Tool2Vec: usage-driven tool embedding +generation for tool retrieval, (2) ToolRefiner: a staged retrieval method that +iteratively improves the quality of retrieved tools, and (3) MLC: framing tool +retrieval as a multi-label classification problem. With these new methods, we +achieve improvements of up to 27.28 in Recall@K on the ToolBench dataset and +30.5 in Recall@K on ToolBank. Additionally, we present further experimental +results to rigorously validate our methods. Our code is available at +\url{https://github.com/SqueezeAILab/Tool2Vec} + +
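+
+ A small sketch of the final retrieval step implied above: given precomputed
+tool embeddings and an embedded query, return the nearest tools by cosine
+similarity. Building the usage-driven tool embeddings (Tool2Vec), the staged
+ToolRefiner, and the multi-label classifier are not shown; all names here are
+assumptions.
+
+import numpy as np
+
+def retrieve_tools(query_emb, tool_embs, tool_names, k=5):
+    # query_emb: (d,), tool_embs: (num_tools, d); both assumed L2-normalized,
+    # so the dot product equals cosine similarity.
+    scores = tool_embs @ query_emb
+    top = np.argsort(-scores)[:k]
+    return [(tool_names[i], float(scores[i])) for i in top]
+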
+
+
+
+
+ + ♻ ☆ Dissociation of Faithful and Unfaithful Reasoning in LLMs + + +
+ Large language models (LLMs) often improve their performance in downstream +tasks when they generate Chain of Thought reasoning text before producing an +answer. We investigate how LLMs recover from errors in Chain of Thought. +Through analysis of error recovery behaviors, we find evidence for +unfaithfulness in Chain of Thought, which occurs when models arrive at the +correct answer despite invalid reasoning text. We identify factors that shift +LLM recovery behavior: LLMs recover more frequently from obvious errors and in +contexts that provide more evidence for the correct answer. Critically, these +factors have divergent effects on faithful and unfaithful recoveries. Our +results indicate that there are distinct mechanisms driving faithful and +unfaithful error recoveries. Selective targeting of these mechanisms may be +able to drive down the rate of unfaithful reasoning and improve model +interpretability. + +
+
+ comment: code published at + https://github.com/CoTErrorRecovery/CoTErrorRecovery +
+
+
+
+
+ + ♻ ☆ Manipulating Large Language Models to Increase Product Visibility + + +
+ Large language models (LLMs) are increasingly being integrated into search +engines to provide natural language responses tailored to user queries. +Customers and end-users are also becoming more dependent on these models for +quick and easy purchase decisions. In this work, we investigate whether +recommendations from LLMs can be manipulated to enhance a product's visibility. +We demonstrate that adding a strategic text sequence (STS) -- a carefully +crafted message -- to a product's information page can significantly increase +its likelihood of being listed as the LLM's top recommendation. To understand +the impact of STS, we use a catalog of fictitious coffee machines and analyze +its effect on two target products: one that seldom appears in the LLM's +recommendations and another that usually ranks second. We observe that the +strategic text sequence significantly enhances the visibility of both products +by increasing their chances of appearing as the top recommendation. This +ability to manipulate LLM-generated search responses provides vendors with a +considerable competitive advantage and has the potential to disrupt fair market +competition. Just as search engine optimization (SEO) revolutionized how +webpages are customized to rank higher in search engine results, influencing +LLM recommendations could profoundly impact content optimization for AI-driven +search services. Code for our experiments is available at +https://github.com/aounon/llm-rank-optimizer. + +
+
+
+
+
+ + ♻ ☆ Balancing Rigor and Utility: Mitigating Cognitive Biases in Large + Language Models for Multiple-Choice Questions + + +
+ This paper examines the role of cognitive biases in the decision-making +processes of large language models (LLMs), challenging the conventional goal of +eliminating all biases. We show that certain cognitive biases when properly +balanced, can enhance decision-making efficiency through rational deviations +and heuristic shortcuts. By introducing heuristic moderation and an abstention +option, which allows LLMs to withhold responses when uncertain, we reduce error +rates, improve decision accuracy, and optimize decision rates. Using the +Balance Rigor and Utility (BRU) dataset, developed through expert +collaboration, our findings demonstrate that targeted inspection of cognitive +biases aligns LLM decisions more closely with human reasoning, enhancing +reliability and suggesting strategies for future improvements. This approach +offers a novel way to leverage cognitive biases to improve the practical +utility of LLMs across various applications. + +
+
+ comment: This article is currently under review. All data will be open on + GitHub once the review is complete. + https://github.com/limanwang/Balancing-Rigor-and-Utility +
+
+
+
+
+ + ♻ ☆ Eliciting Informative Text Evaluations with Large Language Models + + +
+ Peer prediction mechanisms motivate high-quality feedback with provable +guarantees. However, current methods only apply to rather simple reports, like +multiple-choice or scalar numbers. We aim to broaden these techniques to the +larger domain of text-based reports, drawing on the recent developments in +large language models. This vastly increases the applicability of peer +prediction mechanisms as textual feedback is the norm in a large variety of +feedback channels: peer reviews, e-commerce customer reviews, and comments on +social media. + We introduce two mechanisms, the Generative Peer Prediction Mechanism (GPPM) +and the Generative Synopsis Peer Prediction Mechanism (GSPPM). These mechanisms +utilize LLMs as predictors, mapping from one agent's report to a prediction of +her peer's report. Theoretically, we show that when the LLM prediction is +sufficiently accurate, our mechanisms can incentivize high effort and +truth-telling as an (approximate) Bayesian Nash equilibrium. Empirically, we +confirm the efficacy of our mechanisms through experiments conducted on two +real datasets: the Yelp review dataset and the ICLR OpenReview dataset. We +highlight that, on the ICLR dataset, our mechanisms can differentiate three +quality levels -- human-written reviews, GPT-4-generated reviews, and +GPT-3.5-generated reviews -- in terms of expected scores. +Additionally, GSPPM penalizes LLM-generated reviews more effectively than GPPM. +
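For readers wanting a concrete picture of the mechanism family described above, here is a minimal sketch of the underlying peer-prediction idea: score an agent by how much their report improves an LLM's prediction of a peer's report. The `llm_logprob` callable is a hypothetical placeholder rather than an API from the paper, and the scoring rule is a simplified pointwise-mutual-information-style signal, not the exact GPPM/GSPPM mechanisms.

```python
# Minimal sketch of the peer-prediction idea: reward an agent by how much their
# report improves the prediction of a peer's report. `llm_logprob` is a
# hypothetical placeholder for an LLM log-probability query, stubbed out here.

def llm_logprob(text: str, context: str = "") -> float:
    """Placeholder: return log P(text | context) under some LLM."""
    # A real implementation would sum token log-probabilities from an LLM.
    return -float(len(text)) * (0.9 if context else 1.0)  # toy stand-in

def peer_prediction_score(report_i: str, report_j: str) -> float:
    """Pointwise-mutual-information-style score: how much does agent i's
    report raise the predicted likelihood of agent j's report?"""
    baseline = llm_logprob(report_j)                       # log P(report_j)
    conditioned = llm_logprob(report_j, context=report_i)  # log P(report_j | report_i)
    return conditioned - baseline

if __name__ == "__main__":
    r1 = "The method is sound, but the evaluation lacks ablations."
    r2 = "Results look solid; however, ablation studies are missing."
    print(f"score for agent 1: {peer_prediction_score(r1, r2):.3f}")
```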
+
+ comment: Accepted by the Twenty-Fifth ACM Conference on Economics and + Computation (EC'24) +
+
+
+
+
+ + ♻ ☆ Exploring Bias and Prediction Metrics to Characterise the Fairness of + Machine Learning for Equity-Centered Public Health Decision-Making: A + Narrative Review + + +
+ Background: The rapid advancement of Machine Learning (ML) presents novel +opportunities to enhance public health research, surveillance, and +decision-making. However, there is a lack of comprehensive understanding of +algorithmic bias, that is, systematic errors in predicted population health +outcomes resulting from the public health application of ML. The objective of +this narrative review is to explore the types of bias generated by ML and the +quantitative metrics to assess these biases. + Methods: We performed a search on PubMed, MEDLINE, IEEE (Institute of +Electrical and Electronics Engineers), ACM (Association for Computing +Machinery) Digital Library, Science Direct, and Springer Nature. We used +keywords to identify studies describing types of bias and metrics to measure +these in the domain of ML and public and population health, published in English +between 2008 and 2023, inclusive. + Results: A total of 72 articles met the inclusion criteria. Our review +identified the commonly described types of bias and quantitative metrics to +assess these biases from an equity perspective. + Conclusion: The review will help formalize the evaluation framework for ML +on public health from an equity perspective. +
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Domain-Specific Improvement on Psychotherapy Chatbot Using Assistant ICASSP 2024 + + +
+ Large language models (LLMs) have demonstrated impressive generalization +capabilities on specific tasks with human-written instruction data. However, +the limited quantity, diversity, and professional expertise of such instruction +data raise concerns about the performance of LLMs in psychotherapy tasks when +provided with domain-specific instructions. To address this, we first propose +Domain-Specific Assistant Instructions based on AlexanderStreet therapy, and +second, we use an adaptation fine-tuning method and a retrieval-augmented +generation method to improve pre-trained LLMs. Through quantitative evaluation +of linguistic quality using automatic and human evaluation, we observe that +pre-trained LLMs adapted with Psychotherapy Assistant Instructions outperform +state-of-the-art LLM response baselines. Our Assistant-Instruction approach +offers a half-annotation method to align pre-trained LLMs with instructions and +provide pre-trained LLMs with more psychotherapy knowledge. +
+
+ comment: Accepted at ICASSP 2024 EIHRC Workshop +
+
+
+
+
+ + ♻ ☆ Exploring neural oscillations during speech perception via surrogate + gradient spiking neural networks + + +
+ Understanding cognitive processes in the brain demands sophisticated models +capable of replicating neural dynamics at large scales. We present a +physiologically inspired speech recognition architecture, compatible and +scalable with deep learning frameworks, and demonstrate that end-to-end +gradient descent training leads to the emergence of neural oscillations in the +central spiking neural network. Significant cross-frequency couplings, +indicative of these oscillations, are measured within and across network layers +during speech processing, whereas no such interactions are observed when +handling background noise inputs. Furthermore, our findings highlight the +crucial inhibitory role of feedback mechanisms, such as spike frequency +adaptation and recurrent connections, in regulating and synchronising neural +activity to improve recognition performance. Overall, on top of developing our +understanding of synchronisation phenomena notably observed in the human +auditory pathway, our architecture exhibits dynamic and efficient information +processing, with relevance to neuromorphic technology. + +
+
+
+
+
+ + ♻ ☆ Analyzing Diversity in Healthcare LLM Research: A Scientometric + Perspective + + +
+ The deployment of large language models (LLMs) in healthcare has demonstrated +substantial potential for enhancing clinical decision-making, administrative +efficiency, and patient outcomes. However, the underrepresentation of diverse +groups in the development and application of these models can perpetuate +biases, leading to inequitable healthcare delivery. This paper presents a +comprehensive scientometric analysis of LLM research for healthcare, including +data from January 1, 2021, to July 1, 2024. By analyzing metadata from PubMed +and Dimensions, including author affiliations, countries, and funding sources, +we assess the diversity of contributors to LLM research. Our findings highlight +significant gender and geographic disparities, with a predominance of male +authors and contributions primarily from high-income countries (HICs). We +introduce a novel journal diversity index based on Gini diversity to measure +the inclusiveness of scientific publications. Our results underscore the +necessity for greater representation in order to ensure the equitable +application of LLMs in healthcare. We propose actionable strategies to enhance +diversity and inclusivity in artificial intelligence research, with the +ultimate goal of fostering a more inclusive and equitable future in healthcare +innovation. + +
+
+
+
+
+ + ♻ ☆ Sentiment Analysis Across Languages: Evaluation Before and After Machine + Translation to English + + +
+ People communicate in more than 7,000 languages around the world, with around +780 languages spoken in India alone. Despite this linguistic diversity, +research on Sentiment Analysis has predominantly focused on English text data, +resulting in a disproportionate availability of sentiment resources for +English. This paper examines the performance of transformer models in Sentiment +Analysis tasks across multilingual datasets and text that has undergone machine +translation. By comparing the effectiveness of these models in different +linguistic contexts, we gain insights into their performance variations and +potential implications for sentiment analysis across diverse languages. We also +discuss the shortcomings and potential for future work towards the end. + +
+
+ comment: 6 pages, 3 Figures +
+
+
+
+
+ + ♻ ☆ ExtractGPT: Exploring the Potential of Large Language Models for Product + Attribute Value Extraction + + +
+ In order to facilitate features such as faceted product search and product +comparison, e-commerce platforms require accurately structured product data, +including precise attribute/value pairs. Vendors oftentimes provide +unstructured product descriptions consisting only of an offer title and a +textual description. Consequently, extracting attribute values from titles and +descriptions is vital for e-commerce platforms. State-of-the-art attribute +value extraction (AVE) methods based on pre-trained language models (PLMs), such +as BERT, face two drawbacks: (i) the methods require significant amounts of +task-specific training data, and (ii) the fine-tuned models have problems with +generalising to unseen attribute values that were not part of the training data. +This paper explores the potential of using large language models as a more +training data-efficient and more robust alternative to existing AVE methods. We +propose prompt templates for describing the target attributes of the extraction +to the LLM, covering both zero-shot and few-shot scenarios. In the zero-shot +scenario, textual and JSON-based target schema representations of the attributes +are compared. In the few-shot scenario, we investigate (i) the provision of +example attribute values, (ii) the selection of in-context demonstrations, (iii) +shuffled ensembling to prevent position bias, and (iv) fine-tuning the LLM. We +evaluate the prompt templates in combination with hosted LLMs, such as GPT-3.5 +and GPT-4, and open-source LLMs which can be run locally. We compare the +performance of the LLMs to the PLM-based methods SU-OpenTag, AVEQA, and MAVEQA. +The highest average F1-score of 86% was achieved by GPT-4. Llama-3-70B performs +only 3% worse than GPT-4, making it a competitive open-source alternative. +Given the same training data, this prompt/GPT-4 combination outperforms the +best PLM baseline by an average of 6% F1-score. +
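As an illustration of what a zero-shot prompt with a JSON-based target schema might look like, here is a minimal sketch. The schema, attribute names, offer text, and template wording are assumptions for demonstration purposes, not the paper's released prompt templates.

```python
import json

# Hypothetical target schema and product offer; wording is illustrative only.
target_schema = {
    "Brand": "string or null",
    "Screen Size": "string or null",
    "Storage Capacity": "string or null",
}

offer = {
    "title": "Acme 13.3\" UltraBook 512GB SSD Laptop",
    "description": "Lightweight notebook with a 13.3 inch display and 512 GB of storage.",
}

def build_zero_shot_prompt(offer: dict, schema: dict) -> str:
    """Assemble a zero-shot extraction prompt with a JSON target schema."""
    return (
        "Extract the following attributes from the product offer. "
        "Answer with a JSON object matching the schema; use null if an "
        "attribute is not mentioned.\n"
        f"Schema: {json.dumps(schema)}\n"
        f"Title: {offer['title']}\n"
        f"Description: {offer['description']}\n"
        "JSON:"
    )

print(build_zero_shot_prompt(offer, target_schema))
```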
+
+
+
+
+ + ♻ ☆ AMERICANO: Argument Generation with Discourse-driven Decomposition and + Agent Interaction + + +
+ Argument generation is a challenging task in natural language processing, +which requires rigorous reasoning and proper content organization. Inspired by +recent chain-of-thought prompting that breaks down a complex task into +intermediate steps, we propose Americano, a novel framework with agent +interaction for argument generation. Our approach decomposes the generation +process into sequential actions grounded on argumentation theory, which first +executes actions sequentially to generate argumentative discourse components, +and then produces a final argument conditioned on the components. To further +mimic the human writing process and improve the left-to-right generation +paradigm of current autoregressive language models, we introduce an argument +refinement module which automatically evaluates and refines argument drafts +based on feedback received. We evaluate our framework on the task of +counterargument generation using a subset of Reddit/CMV dataset. The results +show that our method outperforms both end-to-end and chain-of-thought prompting +methods and can generate more coherent and persuasive arguments with diverse +and rich contents. + +
+
+ comment: INLG 2024 +
+
+
+
+
+ + ♻ ☆ Evaluating Large Language Models on Spatial Tasks: A Multi-Task + Benchmarking Study + + +
+ The advent of large language models such as ChatGPT, Gemini, and others has +underscored the importance of evaluating their diverse capabilities, ranging +from natural language understanding to code generation. However, their +performance on spatial tasks has not been comprehensively assessed. This study +addresses this gap by introducing a novel multi-task spatial evaluation +dataset, designed to systematically explore and compare the performance of +several advanced models on spatial tasks. The dataset encompasses twelve +distinct task types, including spatial understanding and path planning, each +with verified, accurate answers. We evaluated multiple models, including +OpenAI's gpt-3.5-turbo, gpt-4o, and ZhipuAI's glm-4, through a two-phase +testing approach. Initially, we conducted zero-shot testing, followed by +categorizing the dataset by difficulty and performing prompt tuning tests. +Results indicate that gpt-4o achieved the highest overall accuracy in the first +phase, with an average of 71.3%. Although moonshot-v1-8k slightly +underperformed overall, it surpassed gpt-4o in place name recognition tasks. +The study also highlights the impact of prompt strategies on model performance +in specific tasks. For example, the Chain-of-Thought (COT) strategy increased +gpt-4o's accuracy in path planning from 12.4% to 87.5%, while a one-shot +strategy enhanced moonshot-v1-8k's accuracy in mapping tasks from 10.1% to +76.3%. + +
+
+
+
+
+ + ♻ ☆ LiveFC: A System for Live Fact-Checking of Audio Streams + + +
+ The advances in the digital era have led to rapid dissemination of +information. This has also aggravated the spread of misinformation and +disinformation. This has potentially serious consequences, such as civil +unrest. While fact-checking aims to combat this, manual fact-checking is +cumbersome and not scalable. While automated fact-checking approaches exist, +they do not operate in real-time and do not always account for spread of +misinformation through different modalities. This is particularly important as +proactive fact-checking on live streams in real-time can help people be +informed of false narratives and prevent catastrophic consequences that may +cause civil unrest. This is particularly relevant with the rapid dissemination +of information through video on social media platforms or other streams like +political rallies and debates. Hence, in this work we develop a platform named +LiveFC, that can aid in fact-checking live audio streams in real-time. LiveFC +has a user-friendly interface that displays the claims detected along with +their veracity and evidence for live streams with associated speakers for +claims from respective segments. The app can be accessed at +http://livefc.factiverse.ai and a screen recording of the demo can be found at +https://bit.ly/3WVAoIw. + +
+
+ comment: Under Review, 11 pages +
+
+
+
+
+ + ♻ ☆ Generalizing Fairness to Generative Language Models via Reformulation of + Non-discrimination Criteria + + +
+ Generative AI, such as large language models, has undergone rapid development +within recent years. As these models become increasingly available to the +public, concerns arise about perpetuating and amplifying harmful biases in +applications. Gender stereotypes can be harmful and limiting for the +individuals they target, whether they consist of misrepresentation or +discrimination. Recognizing gender bias as a pervasive societal construct, this +paper studies how to uncover and quantify the presence of gender biases in +generative language models. In particular, we derive generative AI analogues of +three well-known non-discrimination criteria from classification, namely +independence, separation and sufficiency. To demonstrate these criteria in +action, we design prompts for each of the criteria with a focus on occupational +gender stereotype, specifically utilizing the medical test to introduce the +ground truth in the generative AI context. Our results address the presence of +occupational gender bias within such conversational language models. + +
+
+
+
+
+ + ♻ ☆ A Hybrid RAG System with Comprehensive Enhancement on Complex Reasoning KDD + + +
+ Retrieval-augmented generation (RAG) is a framework enabling large language +models (LLMs) to enhance their accuracy and reduce hallucinations by +integrating external knowledge bases. In this paper, we introduce a hybrid RAG +system enhanced through a comprehensive suite of optimizations that +significantly improve retrieval quality, augment reasoning capabilities, and +refine numerical computation ability. We refined the text chunks and tables in +web pages, added attribute predictors to reduce hallucinations, built an LLM +Knowledge Extractor and a Knowledge Graph Extractor, and finally constructed a +reasoning strategy with all the references. We evaluated our system on the CRAG +dataset through the Meta CRAG KDD Cup 2024 Competition. Both the local and +online evaluations demonstrate that our system significantly enhances complex +reasoning capabilities. In local evaluations, we significantly improved +accuracy and reduced error rates compared to the baseline model, achieving a +notable increase in scores. Meanwhile, we attained outstanding results in online +assessments, demonstrating the performance and generalization capabilities of +the proposed system. The source code for our system is released at +\url{https://gitlab.aicrowd.com/shizueyy/crag-new}. +
+
+ comment: Technical report for 3rd prize in Task 1 of Meta CRAG KDD Cup 2024 +
+
+
+
+
+ + ♻ ☆ ReST-MCTS*: LLM Self-Training via Process Reward Guided Tree Search + + +
+ Recent methodologies in LLM self-training mostly rely on LLM generating +responses and filtering those with correct output answers as training data. +This approach often yields a low-quality fine-tuning training set (e.g., +incorrect plans or intermediate reasoning). In this paper, we develop a +reinforced self-training approach, called ReST-MCTS*, based on integrating +process reward guidance with tree search MCTS* for collecting higher-quality +reasoning traces as well as per-step value to train policy and reward models. +ReST-MCTS* circumvents the per-step manual annotation typically used to train +process rewards by tree-search-based reinforcement learning: Given oracle final +correct answers, ReST-MCTS* is able to infer the correct process rewards by +estimating the probability this step can help lead to the correct answer. These +inferred rewards serve dual purposes: they act as value targets for further +refining the process reward model and also facilitate the selection of +high-quality traces for policy model self-training. We first show that the +tree-search policy in ReST-MCTS* achieves higher accuracy compared with prior +LLM reasoning baselines such as Best-of-N and Tree-of-Thought, within the same +search budget. We then show that by using traces searched by this tree-search +policy as training data, we can continuously enhance the three language models +for multiple iterations, and outperform other self-training algorithms such as +ReST$^\text{EM}$ and Self-Rewarding LM. + +
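The core trick of inferring a per-step process reward from oracle final answers can be illustrated with a Monte Carlo rollout estimate: the value of a partial reasoning trace is the fraction of sampled continuations that reach the correct answer. The sketch below is a simplified stand-in, with `sample_completion` stubbed out rather than backed by a real LLM policy or the paper's MCTS* search.

```python
import random

def sample_completion(partial_trace: list[str]) -> str:
    """Placeholder: continue the reasoning and return a final answer.
    A real implementation would roll out an LLM policy from the trace."""
    return random.choice(["42", "41"])  # toy stand-in

def estimate_step_value(partial_trace: list[str], oracle_answer: str,
                        n_rollouts: int = 16) -> float:
    """Estimate P(correct final answer | trace so far) by Monte Carlo rollouts.
    Such estimates can serve as value targets for a process reward model and
    help select high-quality traces for self-training."""
    hits = sum(sample_completion(partial_trace) == oracle_answer
               for _ in range(n_rollouts))
    return hits / n_rollouts

if __name__ == "__main__":
    trace = ["Let x be the unknown.", "Then 2x + 2 = 86, so x = 42."]
    print(estimate_step_value(trace, oracle_answer="42"))
```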
+
+ comment: 30 pages +
+
+
+
+
+ + ♻ ☆ FastMem: Fast Memorization of Prompt Improves Context Awareness of Large + Language Models + + +
+ Large language models (LLMs) excel in generating coherent text, but they +often struggle with context awareness, leading to inaccuracies in tasks +requiring faithful adherence to provided information. We introduce FastMem, a +novel method designed to enhance instruction fine-tuned LLMs' context awareness +through fast memorization of the prompt. FastMem maximizes the likelihood of +the prompt before inference by fine-tuning only the last Feed-Forward Network +(FFN) module. This targeted approach ensures efficient optimization without +overfitting, significantly improving the model's ability to comprehend and +accurately follow the context. Our experiments demonstrate substantial gains in +reading comprehension, text summarization and adherence to output structures. +For instance, FastMem improves the accuracy of Llama 3-8B-Inst on the NQ-SWAP +dataset from 59.1% to 71.6%, and reduces the output structure failure rate of +Qwen 1.5-4B-Chat from 34.9% to 25.5%. Extensive experimental results highlight +FastMem's potential to offer a robust solution to enhance the reliability and +accuracy of LLMs in various applications. Our code is available at: +https://github.com/IAAR-Shanghai/FastMem + +
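A minimal sketch of the "memorize the prompt by tuning only the last FFN" idea, assuming a Llama-style Hugging Face checkpoint where the final feed-forward block is reachable at `model.model.layers[-1].mlp`; the exact module path, checkpoint name, learning rate, and step count are illustrative assumptions, not the authors' released recipe.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "meta-llama/Meta-Llama-3-8B-Instruct"  # illustrative checkpoint
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.bfloat16)

for p in model.parameters():            # freeze everything ...
    p.requires_grad = False
last_ffn = model.model.layers[-1].mlp   # path assumes a Llama-style architecture
for p in last_ffn.parameters():         # ... except the last FFN module
    p.requires_grad = True

prompt = "Context: The capital of the fictional country Arstotzka is Grestin."
ids = tok(prompt, return_tensors="pt").input_ids
opt = torch.optim.AdamW(last_ffn.parameters(), lr=1e-4)

model.train()
for _ in range(8):                      # a few steps of prompt memorization
    loss = model(input_ids=ids, labels=ids).loss  # maximize prompt likelihood
    loss.backward()
    opt.step()
    opt.zero_grad()
# Afterwards, generate answers conditioned on the (now memorized) prompt.
```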
+
+
+
+
+ + ♻ ☆ A Formal Perspective on Byte-Pair Encoding ACL 2023 + + +
+ Byte-Pair Encoding (BPE) is a popular algorithm used for tokenizing data in +NLP, despite being devised initially as a compression method. BPE appears to be +a greedy algorithm at face value, but the underlying optimization problem that +BPE seeks to solve has not yet been laid down. We formalize BPE as a +combinatorial optimization problem. Via submodular functions, we prove that the +iterative greedy version is a +$\frac{1}{{\sigma(\boldsymbol{\mu}^\star)}}(1-e^{-{\sigma(\boldsymbol{\mu}^\star)}})$-approximation +of an optimal merge sequence, where ${\sigma(\boldsymbol{\mu}^\star)}$ is the +total backward curvature with respect to the optimal merge sequence +$\boldsymbol{\mu}^\star$. Empirically the lower bound of the approximation is +$\approx 0.37$. + We provide a faster implementation of BPE which improves the runtime +complexity from $\mathcal{O}\left(N M\right)$ to $\mathcal{O}\left(N \log +M\right)$, where $N$ is the sequence length and $M$ is the merge count. +Finally, we optimize the brute-force algorithm for optimal BPE using +memoization. + +
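For concreteness, here is the plain greedy merge loop that the formalization above analyzes, written in its simple form; the paper's faster variant additionally maintains a priority queue of pair counts to reach the improved runtime, which is omitted in this sketch.

```python
from collections import Counter

def greedy_bpe(tokens: list[str], merge_count: int) -> list[str]:
    """Plain greedy BPE: repeatedly merge the most frequent adjacent pair.
    This is the simple O(N*M) variant; a faster version keeps a priority
    queue of pair counts instead of recounting every iteration."""
    for _ in range(merge_count):
        pairs = Counter(zip(tokens, tokens[1:]))
        if not pairs:
            break
        (a, b), _ = pairs.most_common(1)[0]
        merged, i = [], 0
        while i < len(tokens):
            if i + 1 < len(tokens) and tokens[i] == a and tokens[i + 1] == b:
                merged.append(a + b)   # apply the chosen merge left to right
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        tokens = merged
    return tokens

print(greedy_bpe(list("abababcab"), merge_count=2))
```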
+
+ comment: ACL 2023 +
+
+
+
+
+ + ♻ ☆ Pitfalls and Outlooks in Using COMET + + +
+ Since its introduction, the COMET metric has blazed a trail in the machine +translation community, given its strong correlation with human judgements of +translation quality. Its success stems from being a modified pre-trained +multilingual model finetuned for quality assessment. However, being a +machine learning model, it also gives rise to a new set of pitfalls that may not +be widely known. We investigate these unexpected behaviours from three aspects: +1) technical: obsolete software versions and compute precision; 2) data: empty +content, language mismatch, and translationese at test time as well as +distribution and domain biases in training; 3) usage and reporting: +multi-reference support and model referencing in the literature. All of these +problems imply that COMET scores are not comparable between papers or even +technical setups, and we put forward our perspective on fixing each issue. +Furthermore, we release the SacreCOMET package that can generate a signature +for the software and model configuration as well as an appropriate citation. +The goal of this work is to help the community make more sound use of the COMET +metric. +
+
+
+
+
+ + ♻ ☆ AudioBench: A Universal Benchmark for Audio Large Language Models + + +
+ We introduce AudioBench, a universal benchmark designed to evaluate Audio +Large Language Models (AudioLLMs). It encompasses 8 distinct tasks and 26 +datasets, among which 7 are newly proposed. The evaluation targets +three main aspects: speech understanding, audio scene understanding, and voice +understanding (paralinguistic). Despite recent advancements, a comprehensive +benchmark for AudioLLMs on instruction-following capabilities conditioned on +audio signals has been lacking. AudioBench addresses this gap by setting up +datasets as well as desired evaluation metrics. In addition, we evaluated the +capabilities of five popular models and found that no single model excels +consistently across all tasks. We outline the research outlook for AudioLLMs +and anticipate that our open-sourced evaluation toolkit, data, and leaderboard +will offer a robust testbed for future model development. +
+
+ comment: v3 - Abundant update on models and evaluation details; Code: + https://github.com/AudioLLMs/AudioBench +
+
+
+
+
+ + ♻ ☆ Contrasting Linguistic Patterns in Human and LLM-Generated News Text + + +
+ We conduct a quantitative analysis contrasting human-written English news +text with comparable large language model (LLM) output from six different LLMs +that cover three different families and four sizes in total. Our analysis spans +several measurable linguistic dimensions, including morphological, syntactic, +psychometric, and sociolinguistic aspects. The results reveal various +measurable differences between human and AI-generated texts. Human texts +exhibit more scattered sentence length distributions, more variety of +vocabulary, a distinct use of dependency and constituent types, shorter +constituents, and more optimized dependency distances. Humans tend to exhibit +stronger negative emotions (such as fear and disgust) and less joy compared to +text generated by LLMs, with the toxicity of these models increasing as their +size grows. LLM outputs use more numbers, symbols and auxiliaries (suggesting +objective language) than human texts, as well as more pronouns. The sexist bias +prevalent in human text is also expressed by LLMs, and even magnified in all of +them but one. Differences between LLMs and humans are larger than between LLMs. + +
+
+ comment: Published at Artificial Intelligence Review vol. 57, 265 +
+
+
+
+
+ + ♻ ☆ LLM-as-a-tutor in EFL Writing Education: Focusing on Evaluation of + Student-LLM Interaction + + +
+ In the context of English as a Foreign Language (EFL) writing education, +LLM-as-a-tutor can assist students by providing real-time feedback on their +essays. However, challenges arise in assessing LLM-as-a-tutor due to differing +standards between educational and general use cases. To bridge this gap, we +integrate pedagogical principles to assess student-LLM interaction. First, we +explore how LLMs can function as English tutors, providing effective essay +feedback tailored to students. Second, we propose three metrics to evaluate +LLM-as-a-tutor specifically designed for EFL writing education, emphasizing +pedagogical aspects. In this process, EFL experts evaluate the feedback from +LLM-as-a-tutor regarding quality and characteristics. On the other hand, EFL +learners assess their learning outcomes from interaction with LLM-as-a-tutor. +This approach lays the groundwork for developing LLMs-as-a-tutor tailored to +the needs of EFL learners, advancing the effectiveness of writing education in +this context. + +
+
+
+
+
+ + ♻ ☆ MLR-Copilot: Autonomous Machine Learning Research based on Large + Language Models Agents + + +
+ Machine learning research, crucial for technological advancements and +innovation, often faces significant challenges due to its inherent complexity, +slow pace of experimentation, and the necessity for specialized expertise. +Motivated by this, we present a new systematic framework, autonomous Machine +Learning Research with large language models (MLR-Copilot), designed to enhance +machine learning research productivity through the automatic generation and +implementation of research ideas using Large Language Model (LLM) agents. The +framework consists of three phases: research idea generation, experiment +implementation, and implementation execution. First, existing research papers +are used to generate hypotheses and experimental plans via IdeaAgent, powered by +LLMs. Next, the implementation generation phase translates these plans into +executables with ExperimentAgent. This phase leverages retrieved prototype code +and optionally retrieves candidate models and data. Finally, the execution +phase, also managed by ExperimentAgent, involves running experiments with +mechanisms for human feedback and iterative debugging to enhance the likelihood +of achieving executable research outcomes. We evaluate our framework on five +machine learning research tasks, and the experimental results show the +framework's potential to facilitate research progress and innovation. +
+
+
+
+
+ + ♻ ☆ Show Me the World in My Language: Establishing the First Baseline for + Scene-Text to Scene-Text Translation ICPR 2024 + + +
+ In this work, we study the task of ``visually'' translating scene text from a +source language (e.g., Hindi) to a target language (e.g., English). Visual +translation involves not just the recognition and translation of scene text but +also the generation of the translated image that preserves visual features of +the source scene text, such as font, size, and background. There are several +challenges associated with this task, such as translation with limited context, +deciding between translation and transliteration, accommodating varying text +lengths within fixed spatial boundaries, and preserving the font and background +styles of the source scene text in the target language. To address this +problem, we make the following contributions: (i) We study visual translation +as a standalone problem for the first time in the literature. (ii) We present a +cascaded framework for visual translation that combines state-of-the-art +modules for scene text recognition, machine translation, and scene text +synthesis as a baseline for the task. (iii) We propose a set of task-specific +design enhancements to design a variant of the baseline to obtain performance +improvements. (iv) Currently, the existing related literature lacks any +comprehensive performance evaluation for this novel task. To fill this gap, we +introduce several automatic and user-assisted evaluation metrics designed +explicitly for evaluating visual translation. Further, we evaluate presented +baselines for translating scene text between Hindi and English. Our experiments +demonstrate that although we can effectively perform visual translation over a +large collection of scene text images, the presented baseline only partially +addresses challenges posed by visual translation tasks. We firmly believe that +this new task and the limitations of existing models, as reported in this +paper, should encourage further research in visual translation. + +
+
+ comment: Accepted at ICPR 2024, Project Website: + https://vl2g.github.io/projects/visTrans/ +
+
+
+
+
+ + ♻ ☆ An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference + Acceleration for Large Vision-Language Models ECCV 2024 + + +
+ In this study, we identify inefficient attention phenomena in Large +Vision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5, +QwenVL-Chat and Video-LLaVA. We find that the attention computation over +visual tokens is extremely inefficient in the deep layers of popular LVLMs, +suggesting a need for a sparser approach compared to textual data handling. To +this end, we introduce FastV, a versatile plug-and-play method designed to +optimize computational efficiency by learning adaptive attention patterns in +early layers and pruning visual tokens in subsequent ones. Our evaluations +demonstrate FastV's ability to dramatically reduce computational costs (e.g., a +45% reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a +wide range of image and video understanding tasks. The computational efficiency +and performance trade-off of FastV is highly customizable and +Pareto-efficient. It can compress the FLOPs of a 13B-parameter model to achieve +a lower budget than that of a 7B-parameter model, while still maintaining +superior performance. We believe FastV has practical value for the deployment of +LVLMs on edge devices and in commercial models. Code is released at +https://github.com/pkunlp-icler/FastV. +
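A simplified sketch of the pruning step in the spirit of FastV (not the released implementation): after an early layer, rank visual tokens by the attention they receive and keep only the top fraction for subsequent layers. Tensor shapes, the keep ratio, and the position of the visual tokens in the toy sequence are illustrative assumptions.

```python
import torch

def prune_visual_tokens(hidden, attn, visual_idx, keep_ratio=0.5):
    """hidden: (seq, dim); attn: (heads, seq, seq) attention weights from one
    early layer; visual_idx: LongTensor of visual-token positions."""
    # Average attention each token receives, over heads and query positions,
    # then restrict to the visual tokens.
    received = attn.mean(dim=0).mean(dim=0)[visual_idx]          # (num_visual,)
    k = max(1, int(keep_ratio * visual_idx.numel()))
    keep_visual = visual_idx[received.topk(k).indices]           # best-attended patches
    visual_set = set(visual_idx.tolist())
    text_idx = torch.tensor([i for i in range(hidden.size(0)) if i not in visual_set])
    keep = torch.sort(torch.cat([text_idx, keep_visual])).values
    return hidden[keep], keep

# Toy usage with random tensors.
seq, dim, heads = 12, 8, 2
hidden = torch.randn(seq, dim)
attn = torch.softmax(torch.randn(heads, seq, seq), dim=-1)
visual_idx = torch.arange(2, 10)     # pretend positions 2..9 hold image tokens
pruned, kept = prune_visual_tokens(hidden, attn, visual_idx)
print(pruned.shape, kept.tolist())   # half of the visual tokens were dropped
```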
+
+ comment: Accepted to ECCV 2024 (Oral), code is released at + https://github.com/pkunlp-icler/FastV, +
+
+
+
+
+ + ♻ ☆ CHiSafetyBench: A Chinese Hierarchical Safety Benchmark for Large + Language Models + + +
+ With the profound development of large language models (LLMs), their safety +concerns have garnered increasing attention. However, there is a scarcity of +Chinese safety benchmarks for LLMs, and the existing safety taxonomies are +inadequate, lacking comprehensive safety detection capabilities in authentic +Chinese scenarios. In this work, we introduce CHiSafetyBench, a dedicated +safety benchmark for evaluating LLMs' capabilities in identifying risky content +and refusing to answer risky questions in Chinese contexts. CHiSafetyBench +incorporates a dataset that covers a hierarchical Chinese safety taxonomy +consisting of 5 risk areas and 31 categories. This dataset comprises two types +of tasks, multiple-choice questions and question answering, evaluating LLMs +from the perspectives of risky content identification and the ability to refuse +to answer risky questions, respectively. Utilizing this benchmark, we validate +the feasibility of automatic evaluation as a substitute for human evaluation +and conduct comprehensive automatic safety assessments on mainstream Chinese +LLMs. Our experiments reveal the varying performance of different models across +various safety domains, indicating that all models possess considerable +potential for improvement in Chinese safety capabilities. Our dataset is +publicly available at +https://github.com/UnicomAI/UnicomBenchmark/tree/main/CHiSafetyBench. +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ ACORN: Aspect-wise Commonsense Reasoning Explanation Evaluation + + +
+ Evaluating the quality of free-text explanations is a multifaceted, +subjective, and labor-intensive task. Large language models (LLMs) present an +appealing alternative due to their potential for consistency, scalability, and +cost-efficiency. In this work, we present ACORN, a new dataset of 3,500 +free-text explanations and aspect-wise quality ratings, and use it to evaluate +how LLMs rate explanations. We observed that larger models outputted labels +that maintained or increased the inter-annotator agreement, suggesting that +they are within the expected variance between human raters. However, their +correlation with majority-voted human ratings varied across different quality +aspects, indicating that they are not a complete replacement. In turn, using +LLMs as a supplement to a smaller group of human raters in some cases improved +the correlation with the original majority labels. However, the effect was +limited to cases where human raters were scarce, and an additional human rater +had a more pronounced effect in all cases. Overall, we recommend against using +LLMs as a complete replacement for human raters but encourage using them in +configurations that end with targeted human involvement. Data available here: +https://github.com/a-brassard/ACORN + +
+
+ comment: 18 pages, 7 figures, accepted to COLM 2024. Data available here: + https://github.com/a-brassard/ACORN +
+
+
+
+
+ + ♻ ☆ MM-Soc: Benchmarking Multimodal Large Language Models in Social Media + Platforms ACL 2024 + + +
+ Social media platforms are hubs for multimodal information exchange, +encompassing text, images, and videos, making it challenging for machines to +comprehend the information or emotions associated with interactions in online +spaces. Multimodal Large Language Models (MLLMs) have emerged as a promising +solution to these challenges, yet they struggle to accurately interpret human +emotions and complex content such as misinformation. This paper introduces +MM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of +multimodal social media content. MM-Soc compiles prominent multimodal datasets +and incorporates a novel large-scale YouTube tagging dataset, targeting a range +of tasks from misinformation detection, hate speech detection, and social +context generation. Through our exhaustive evaluation on ten size-variants of +four open-source MLLMs, we have identified significant performance disparities, +highlighting the need for advancements in models' social understanding +capabilities. Our analysis reveals that, in a zero-shot setting, various types +of MLLMs generally exhibit difficulties in handling social media tasks. +However, MLLMs demonstrate performance improvements post fine-tuning, +suggesting potential pathways for improvement. Our code and data are available +at https://github.com/claws-lab/MMSoc.git. + +
+
+ comment: In Proceedings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Persuasion Games using Large Language Models + + +
+ Large Language Models (LLMs) have emerged as formidable instruments capable +of comprehending and producing human-like text. This paper explores the +potential of LLMs to shape user perspectives and subsequently influence their +decisions on particular tasks. This capability finds applications in diverse +domains such as investment, credit cards, insurance, and retail, wherein LLMs +assist users in selecting appropriate insurance policies, investment plans, and +credit cards, as well as in Behavioral Change Support Systems (BCSS). + We present a sophisticated multi-agent framework wherein a consortium of +agents operates in a collaborative manner. The primary agent engages directly +with user agents through persuasive dialogue, while the auxiliary agents perform +tasks such as information retrieval, response analysis, development of +persuasion strategies, and validation of facts. Empirical evidence from our +experiments demonstrates that this collaborative methodology significantly +enhances the persuasive efficacy of the LLM. We continuously analyze the +resistance of the user agent to persuasive efforts and counteract it by +employing a combination of rule-based and LLM-based resistance-persuasion +mapping techniques. + We employ simulated personas and generate conversations in the insurance, +banking, and retail domains to evaluate the proficiency of large language +models (LLMs) in recognizing, adjusting to, and influencing various personality +types. Concurrently, we examine the resistance mechanisms employed by +LLM-simulated personas. Persuasion is quantified via measurable surveys before +and after interaction, LLM-generated scores on conversation, and user decisions +(purchase or non-purchase). +
+
+
+
+
+ + ♻ ☆ Cultural Compass: Predicting Transfer Learning Success in Offensive + Language Detection with Cultural Features EMNLP 2023 + + +
+ The increasing ubiquity of language technology necessitates a shift towards +considering cultural diversity in the machine learning realm, particularly for +subjective tasks that rely heavily on cultural nuances, such as Offensive +Language Detection (OLD). Current understanding underscores that these tasks +are substantially influenced by cultural values, however, a notable gap exists +in determining if cultural features can accurately predict the success of +cross-cultural transfer learning for such subjective tasks. Addressing this, +our study delves into the intersection of cultural features and transfer +learning effectiveness. The findings reveal that cultural value surveys indeed +possess a predictive power for cross-cultural transfer learning success in OLD +tasks and that it can be further improved using offensive word distance. Based +on these results, we advocate for the integration of cultural information into +datasets. Additionally, we recommend leveraging data sources rich in cultural +information, such as surveys, to enhance cultural adaptability. Our research +signifies a step forward in the quest for more inclusive, culturally sensitive +language technologies. + +
+
+ comment: Findings of EMNLP 2023 (update) +
+
+
+
+
+ + ♻ ☆ From Wide to Deep: Dimension Lifting Network for Parameter-efficient + Knowledge Graph Embedding + + +
+ Knowledge graph embedding (KGE), which maps entities and relations into vector +representations, is essential for downstream applications. Conventional KGE +methods require high-dimensional representations to learn the complex structure +of knowledge graphs, but this leads to oversized model parameters. Recent +advances reduce parameters via low-dimensional entity representations, while +developing techniques (e.g., knowledge distillation or reinvented representation +forms) to compensate for the reduced dimension. However, such operations +introduce complicated computations and model designs that may not benefit large +knowledge graphs. To seek a simple strategy to improve the parameter efficiency +of conventional KGE models, we take inspiration from the observation that deeper +neural networks require exponentially fewer parameters than wider networks to +achieve comparable expressiveness for compositional structures. We view all +entity representations as a single-layer embedding network; conventional KGE +methods that adopt high-dimensional entity representations thus amount to +widening the embedding network to gain expressiveness. To achieve parameter +efficiency, we instead propose a deeper embedding network for entity +representations, i.e., a narrow entity embedding layer plus a multi-layer +dimension lifting network (LiftNet). Experiments on three public datasets show +that by integrating LiftNet, four conventional KGE methods with 16-dimensional +representations achieve link prediction accuracy comparable to that of the +original models with 512-dimensional representations, saving 68.4% to 96.9% of +parameters. +
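A minimal sketch of the narrow-embedding-plus-lifting idea: entities receive a low-dimensional embedding that a small MLP lifts to the dimension expected by the scoring function. Layer sizes and depth here are illustrative assumptions, not the paper's exact LiftNet configuration.

```python
import torch
import torch.nn as nn

class LiftedEntityEmbedding(nn.Module):
    """Narrow entity embedding followed by a dimension-lifting MLP."""
    def __init__(self, num_entities: int, narrow_dim: int = 16,
                 lifted_dim: int = 512, hidden_dim: int = 64):
        super().__init__()
        self.embed = nn.Embedding(num_entities, narrow_dim)   # narrow layer
        self.lift = nn.Sequential(                            # deeper lifting net
            nn.Linear(narrow_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, lifted_dim),
        )

    def forward(self, entity_ids: torch.Tensor) -> torch.Tensor:
        return self.lift(self.embed(entity_ids))              # (batch, lifted_dim)

enc = LiftedEntityEmbedding(num_entities=10_000)
vecs = enc(torch.tensor([3, 42, 999]))
print(vecs.shape)  # torch.Size([3, 512]) -- usable by any KGE scoring function
```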
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 44 + +
+
+
+ + ♻ ☆ Inter-Frame Compression for Dynamic Point Cloud Geometry Coding + + +
+ Efficient point cloud compression is essential for applications like virtual +and mixed reality, autonomous driving, and cultural heritage. This paper +proposes a deep learning-based inter-frame encoding scheme for dynamic point +cloud geometry compression. We propose a lossy geometry compression scheme that +predicts the latent representation of the current frame using the previous +frame by employing a novel feature space inter-prediction network. The proposed +network utilizes sparse convolutions with hierarchical multiscale 3D feature +learning to encode the current frame using the previous frame. The proposed +method introduces a novel predictor network for motion compensation in the +feature domain to map the latent representation of the previous frame to the +coordinates of the current frame to predict the current frame's feature +embedding. The framework transmits the residual of the predicted features and +the actual features by compressing them using a learned probabilistic +factorized entropy model. At the receiver, the decoder hierarchically +reconstructs the current frame by progressively rescaling the feature +embedding. The proposed framework is compared to the state-of-the-art +Video-based Point Cloud Compression (V-PCC) and Geometry-based Point Cloud +Compression (G-PCC) schemes standardized by the Moving Picture Experts Group +(MPEG). The proposed method achieves more than 88% BD-Rate (Bjontegaard Delta +Rate) reduction against G-PCCv20 Octree, more than 56% BD-Rate savings against +G-PCCv20 Trisoup, more than 62% BD-Rate reduction against V-PCC intra-frame +encoding mode, and more than 52% BD-Rate savings against V-PCC P-frame-based +inter-frame encoding mode using HEVC. These significant performance gains are +cross-checked and verified in the MPEG working group. + +
+
+
+
+
+ + ♻ ☆ SEDMamba: Enhancing Selective State Space Modelling with Bottleneck + Mechanism and Fine-to-Coarse Temporal Fusion for Efficient Error Detection in + Robot-Assisted Surgery + + +
+ Automated detection of surgical errors can improve robotic-assisted surgery. +Despite promising progress, existing methods still face challenges in capturing +rich temporal context to establish long-term dependencies while maintaining +computational efficiency. In this paper, we propose a novel hierarchical model +named SEDMamba, which incorporates the selective state space model (SSM) into +surgical error detection, facilitating efficient long sequence modelling with +linear complexity. SEDMamba enhances selective SSM with a bottleneck mechanism +and fine-to-coarse temporal fusion (FCTF) to detect and temporally localize +surgical errors in long videos. The bottleneck mechanism compresses and +restores features within their spatial dimension, thereby reducing +computational complexity. FCTF utilizes multiple dilated 1D convolutional +layers to merge temporal information across diverse scale ranges, accommodating +errors of varying duration. Our work also contributes the first-of-its-kind, +frame-level, in-vivo surgical error dataset to support error detection in real +surgical cases. Specifically, we deploy the clinically validated observational +clinical human reliability assessment tool (OCHRA) to annotate the errors +during suturing tasks in an open-source radical prostatectomy dataset +(SAR-RARP50). Experimental results demonstrate that our SEDMamba outperforms +state-of-the-art methods with at least 1.82% AUC and 3.80% AP performance gains +with significantly reduced computational complexity. The corresponding error +annotations, code and models will be released at +https://github.com/wzjialang/SEDMamba. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Efficient Video Object Segmentation via Modulated Cross-Attention Memory WACV 2025 + + +
+ Recently, transformer-based approaches have shown promising results for +semi-supervised video object segmentation. However, these approaches typically +struggle on long videos due to increased GPU memory demands, as they frequently +expand the memory bank every few frames. We propose a transformer-based +approach, named MAVOS, that introduces an optimized and dynamic long-term +modulated cross-attention (MCA) memory to model temporal smoothness without +requiring frequent memory expansion. The proposed MCA effectively encodes both +local and global features at various levels of granularity while efficiently +maintaining consistent speed regardless of the video length. Extensive +experiments on multiple benchmarks, LVOS, Long-Time Video, and DAVIS 2017, +demonstrate the effectiveness of our proposed contributions leading to +real-time inference and markedly reduced memory demands without any degradation +in segmentation accuracy on long videos. Compared to the best existing +transformer-based approach, our MAVOS increases the speed by 7.6x, while +significantly reducing the GPU memory by 87% with comparable segmentation +performance on short and long video datasets. Notably on the LVOS dataset, our +MAVOS achieves a J&F score of 63.3% while operating at 37 frames per second +(FPS) on a single V100 GPU. Our code and models will be publicly available at: +https://github.com/Amshaker/MAVOS. + +
+
+ comment: WACV 2025 +
+
+
+
+
+ + ♻ ☆ RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation + and Retrieval-Guidance + + +
+ Diffusion-based models demonstrate impressive generation capabilities. +However, they also have a massive number of parameters, resulting in enormous +model sizes, thus making them unsuitable for deployment on resource-constrained +devices. Block-wise generation can be a promising alternative for designing +compact-sized (parameter-efficient) deep generative models since the model can +generate one block at a time instead of generating the whole image at once. +However, block-wise generation is also considerably challenging because +ensuring coherence across generated blocks can be non-trivial. To this end, we +design a retrieval-augmented generation (RAG) approach and leverage the +corresponding blocks of the images retrieved by the RAG module to condition the +training and generation stages of a block-wise denoising diffusion model. Our +conditioning schemes ensure coherence across the different blocks during +training and, consequently, during generation. While we showcase our approach +using the latent diffusion model (LDM) as the base model, it can be used with +other variants of denoising diffusion models. We validate our solution to the +coherence problem with substantive experiments that demonstrate the approach's +effectiveness in terms of compact model size and excellent generation quality. +
+
+
+
+
+ + ♻ ☆ Multi-Visual-Inertial System: Analysis, Calibration and Estimation + + +
+ In this paper, we study state estimation of multi-visual-inertial systems +(MVIS) and develop sensor fusion algorithms to optimally fuse an arbitrary +number of asynchronous inertial measurement units (IMUs) or gyroscopes and +global and/or rolling shutter cameras. We are especially interested in the +full calibration of the associated visual-inertial sensors, including the IMU +or camera intrinsics and the IMU-IMU (or camera) spatiotemporal extrinsics, as +well as the image readout time of rolling-shutter cameras (if used). To this +end, we develop a new analytic combined IMU integration with intrinsics, termed +ACI3, to preintegrate IMU measurements, which is leveraged to fuse auxiliary +IMUs and/or gyroscopes alongside a base IMU. We model the multi-inertial +measurements to include all the necessary inertial intrinsic and IMU-IMU +spatiotemporal extrinsic parameters, while leveraging IMU-IMU rigid-body +constraints to eliminate the necessity of auxiliary inertial poses and thus +reduce computational complexity. By performing observability analysis of +MVIS, we prove that the standard four unobservable directions remain, no +matter how many inertial sensors are used, and also identify, for the first +time, degenerate motions for IMU-IMU spatiotemporal extrinsics and auxiliary +inertial intrinsics. In addition to the extensive simulations that validate our +analysis and algorithms, we have built our own MVIS sensor rig and collected +over 25 real-world datasets to experimentally verify the proposed calibration +against state-of-the-art calibration methods such as Kalibr. We show that +the proposed MVIS calibration is able to achieve competitive accuracy with +improved convergence and repeatability, and it is open sourced to better benefit +the community. +
+
+
+
+
+ + ♻ ☆ On Evaluating Adversarial Robustness of Volumetric Medical Segmentation + Models + + +
+ Volumetric medical segmentation models have achieved significant success on +organ and tumor-based segmentation tasks in recent years. However, their +vulnerability to adversarial attacks remains largely unexplored, raising +serious concerns regarding the real-world deployment of tools employing such +models in the healthcare sector. This underscores the importance of +investigating the robustness of existing models. In this context, our work aims +to empirically examine the adversarial robustness across current volumetric +segmentation architectures, encompassing Convolutional, Transformer, and +Mamba-based models. We extend this investigation across four volumetric +segmentation datasets, evaluating robustness under both white box and black box +adversarial attacks. Overall, we observe that while both pixel and +frequency-based attacks perform reasonably well under \emph{white box} setting, +the latter performs significantly better under transfer-based black box +attacks. Across our experiments, we observe transformer-based models show +higher robustness than convolution-based models with Mamba-based models being +the most vulnerable. Additionally, we show that large-scale training of +volumetric segmentation models improves the model's robustness against +adversarial attacks. The code and robust models are available at +https://github.com/HashmatShadab/Robustness-of-Volumetric-Medical-Segmentation-Models. + +
+
+ comment: Accepted at British Machine Vision Conference 2024 +
+
+
+
+
+ + ♻ ☆ Training-free Long Video Generation with Chain of Diffusion Model + Experts + + +
+ Video generation models hold substantial potential in areas such as +filmmaking. However, current video diffusion models incur high computational +costs and produce suboptimal results due to the high complexity of the video +generation task. In this paper, we propose \textbf{ConFiner}, an efficient +high-quality video generation framework that decouples video generation into +easier subtasks: structure \textbf{con}trol and spatial-temporal +re\textbf{fine}ment. It can generate high-quality videos with a chain of +off-the-shelf diffusion model experts, each expert responsible for a decoupled +subtask. During refinement, we introduce coordinated denoising, which can merge +multiple diffusion experts' capabilities into a single sampling pass. +Furthermore, we design the ConFiner-Long framework, which can generate long, +coherent videos by applying three constraint strategies on top of ConFiner. +Experimental results indicate that with only 10\% of the inference cost, +ConFiner surpasses representative models like Lavie and Modelscope across all +objective and subjective metrics. Moreover, ConFiner-Long can generate +high-quality and coherent videos with up to 600 frames. +
+
+
+
+
+ + ♻ ☆ TRAM: Global Trajectory and Motion of 3D Humans from in-the-wild Videos + + +
+ We propose TRAM, a two-stage method to reconstruct a human's global +trajectory and motion from in-the-wild videos. TRAM robustifies SLAM to recover +the camera motion in the presence of dynamic humans and uses the scene +background to derive the motion scale. Using the recovered camera as a +metric-scale reference frame, we introduce a video transformer model (VIMO) to +regress the kinematic body motion of a human. By composing the two motions, we +achieve accurate recovery of 3D humans in the world space, reducing global +motion errors by a large margin from prior work. +https://yufu-wang.github.io/tram4d/ + +
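The final composition step, placing the camera-frame body motion into the world frame using the SLAM-recovered, metric-scaled camera pose, amounts to composing two rigid transforms. The toy sketch below illustrates that composition with made-up values; it is not code from the TRAM release.

```python
import numpy as np

def compose_world_pose(R_wc, t_wc, R_cb, t_cb):
    """world_from_body = world_from_camera composed with camera_from_body."""
    R_wb = R_wc @ R_cb
    t_wb = R_wc @ t_cb + t_wc
    return R_wb, t_wb

# Camera pose in the world (from SLAM, metric scale) and body pose in the camera.
R_wc = np.eye(3)
t_wc = np.array([0.0, 0.0, 2.0])
R_cb = np.eye(3)
t_cb = np.array([0.5, -0.2, 3.0])
print(compose_world_pose(R_wc, t_wc, R_cb, t_cb)[1])  # body position in world frame
```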
+
+ comment: The project website: https://yufu-wang.github.io/tram4d/ +
+
+
+
+
+ + ♻ ☆ Privacy-Aware Document Visual Question Answering ICDAR 2024 + + +
+ Document Visual Question Answering (DocVQA) has quickly grown into a central +task of document understanding. But despite the fact that documents contain +sensitive or copyrighted information, none of the current DocVQA methods offers +strong privacy guarantees. In this work, we explore privacy in the domain of +DocVQA for the first time, highlighting privacy issues in state-of-the-art +multi-modal LLMs used for DocVQA, and explore possible solutions. +Specifically, we focus on invoice processing as a realistic document +understanding scenario, and propose a large-scale DocVQA dataset comprising +invoice documents and associated questions and answers. We employ a federated +learning scheme that reflects the real-life distribution of documents in +different businesses, and we explore the use case where the data of the invoice +provider is the sensitive information to be protected. We demonstrate that +non-private models tend to memorise, a behaviour that can lead to exposing +private information. We then evaluate baseline training schemes employing +federated learning and differential privacy in this multi-modal scenario, where +the sensitive information might be exposed through either or both of the two +input modalities: vision (document image) or language (OCR tokens). Finally, we +design attacks exploiting the memorisation effect of the model, and demonstrate +their effectiveness in probing representative DocVQA models. +
+
+ comment: 35 pages, 12 figures, accepted for publication at the 18th + International Conference on Document Analysis and Recognition, ICDAR 2024 +
+
+
+
+
+ + ♻ ☆ Does Data-Efficient Generalization Exacerbate Bias in Foundation Models? ECCV 2024 + + +
+ Foundation models have emerged as robust models with label efficiency in +diverse domains. In medical imaging, these models contribute to the advancement +of medical diagnoses due to the difficulty in obtaining labeled data. However, +it is unclear whether using a large amount of unlabeled data, biased by the +presence of sensitive attributes during pre-training, influences the fairness +of the model. This research examines the bias in the Foundation model +(RetFound) when it is applied to fine-tune the Brazilian Multilabel +Ophthalmological Dataset (BRSET), which has a different population than the +pre-training dataset. The model evaluation, in comparison with supervised +learning, shows that the Foundation Model has the potential to reduce the gap +between the maximum AUC and minimum AUC evaluations across gender and age +groups. However, in a data-efficient generalization, the model increases the +bias when the data amount decreases. These findings suggest that when deploying +a Foundation Model in real-life scenarios with limited data, the possibility of +fairness issues should be considered. + +
+
+ comment: Preprint of paper to be presented at Fairness and Ethics Towards + Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during + ECCV 2024 +
+
+
+
+
+ + ♻ ☆ LSMS: Language-guided Scale-aware MedSegmentor for Medical Image + Referring Segmentation + + +
+ Conventional medical image segmentation methods have been found inadequate in +facilitating physicians with the identification of specific lesions for +diagnosis and treatment. Given the utility of text as an instructional format, +we introduce a novel task termed Medical Image Referring Segmentation (MIRS), +which requires segmenting specified lesions in images based on the given +language expressions. Due to the varying object scales in medical images, MIRS +demands robust vision-language modeling and comprehensive multi-scale +interaction for precise localization and segmentation under linguistic +guidance. However, existing medical image segmentation methods fall short in +meeting these demands, resulting in insufficient segmentation accuracy. In +response, we propose an approach named Language-guided Scale-aware MedSegmentor +(LSMS), incorporating two appealing designs: (1)~a Scale-aware Vision-Language +Attention module that leverages diverse convolutional kernels to acquire rich +visual knowledge and interact closely with linguistic features, thereby +enhancing lesion localization capability; (2)~a Full-Scale Decoder that +globally models multi-modal features across various scales, capturing +complementary information between scales to accurately outline lesion +boundaries. Addressing the lack of suitable datasets for MIRS, we constructed a +vision-language medical dataset called Reference Hepatic Lesion Segmentation +(RefHL-Seg). This dataset comprises 2,283 abdominal CT slices from 231 cases, +with corresponding textual annotations and segmentation masks for various liver +lesions in images. We validated the performance of LSMS for MIRS and +conventional medical image segmentation tasks across various datasets. Our LSMS +consistently outperforms on all datasets with lower computational costs. The +code and datasets will be released. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Implicit Concept Removal of Diffusion Models ECCV2024 + + +
+ Text-to-image (T2I) diffusion models often inadvertently generate unwanted concepts such as watermarks and unsafe images. These concepts, termed "implicit concepts", can be unintentionally learned during training and then generated uncontrollably during inference. Existing removal methods still struggle to eliminate implicit concepts, primarily because they depend on the model's ability to recognize concepts it actually cannot discern. To address this, we exploit the intrinsic geometric characteristics of implicit concepts and present Geom-Erasing, a novel concept removal method based on geometric-driven control. Specifically, once an unwanted implicit concept is identified, we integrate the existence and geometric information of the concept into the text prompts with the help of an accessible classifier or detector model. Subsequently, the model is optimized to identify and disentangle this information, which is then adopted as negative prompts during generation. Moreover, we introduce the Implicit Concept Dataset (ICD), a novel image-text dataset imbued with three typical implicit concepts (i.e., QR codes, watermarks, and text), reflecting real-life situations where implicit concepts are easily injected. Geom-Erasing effectively mitigates the generation of implicit concepts, achieving state-of-the-art results on the Inappropriate Image Prompts (I2P) and our challenging Implicit Concept Dataset (ICD) benchmarks. + +
+
+ comment: Accepted by ECCV2024. Project Page: + https://kaichen1998.github.io/projects/geom-erasing/ +
+
+
+
+
+ + ♻ ☆ GUing: A Mobile GUI Search Engine using a Vision-Language Model + + +
+ App developers use the Graphical User Interface (GUI) of other apps as a source of inspiration for designing and improving their own apps. Recent research has thus suggested retrieving relevant GUI designs that match a certain text query from screenshot datasets acquired through crowdsourced or automated exploration of GUIs. However, such text-to-GUI retrieval approaches only leverage the textual information of the GUI elements, neglecting visual information such as icons or background images. In addition, retrieved screenshots are not steered by app developers and often lack important app features that require particular input data. To overcome these limitations, this paper proposes GUing, a GUI search engine based on a vision-language model called GUIClip, which we trained specifically for the problem of designing app GUIs. For this, we first collected app introduction images from Google Play, which usually display the most representative screenshots and are often captioned (i.e., labeled) by app vendors. Then, we developed an automated pipeline to classify, crop, and extract the captions from these images. This resulted in a large dataset, which we share with this paper, comprising 303k app screenshots, of which 135k have captions. We used this dataset to train a novel vision-language model, which is, to the best of our knowledge, the first of its kind in GUI retrieval. We evaluated our approach on various datasets from related work and in a manual experiment. The results demonstrate that our model outperforms previous approaches in text-to-GUI retrieval, achieving a Recall@10 of up to 0.69 and a HIT@10 of 0.91. We also explored the performance of GUIClip for other GUI tasks, including GUI classification and sketch-to-GUI retrieval, with encouraging results. + +
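Since the evaluation hinges on embedding-based retrieval metrics, here is a minimal sketch of text-to-GUI retrieval with Recall@k over dual-encoder embeddings; the random embeddings stand in for GUIClip outputs and are purely illustrative.

```python
# Minimal sketch of text-to-screenshot retrieval: cosine similarity between
# caption and screenshot embeddings, evaluated with Recall@k.
import numpy as np

def recall_at_k(text_emb, image_emb, k=10):
    """text_emb[i] is the query whose ground-truth match is image_emb[i]."""
    t = text_emb / np.linalg.norm(text_emb, axis=1, keepdims=True)
    v = image_emb / np.linalg.norm(image_emb, axis=1, keepdims=True)
    sims = t @ v.T                                  # (num_queries, num_images)
    topk = np.argsort(-sims, axis=1)[:, :k]         # indices of the top-k images per query
    hits = (topk == np.arange(len(t))[:, None]).any(axis=1)
    return hits.mean()

# Toy usage with random embeddings in place of real encoder outputs
rng = np.random.default_rng(0)
print(recall_at_k(rng.normal(size=(100, 64)), rng.normal(size=(100, 64)), k=10))
```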
+
+
+
+
+ + ♻ ☆ An open dataset for oracle bone script recognition and decipherment + + +
+ Oracle bone script, one of the earliest known forms of ancient Chinese +writing, presents invaluable research materials for scholars studying the +humanities and geography of the Shang Dynasty, dating back 3,000 years. The +immense historical and cultural significance of these writings cannot be +overstated. However, the passage of time has obscured much of their meaning, +presenting a significant challenge in deciphering these ancient texts. With the +advent of Artificial Intelligence (AI), employing AI to assist in deciphering +Oracle Bone Characters (OBCs) has become a feasible option. Yet, progress in +this area has been hindered by a lack of high-quality datasets. To address this +issue, this paper details the creation of the HUST-OBC dataset. This dataset +encompasses 77,064 images of 1,588 individual deciphered characters and 62,989 +images of 9,411 undeciphered characters, with a total of 140,053 images, +compiled from diverse sources. The hope is that this dataset could inspire and +assist future research in deciphering those unknown OBCs. All the codes and +datasets are available at https://github.com/Yuliang-Liu/Open-Oracle. + +
+
+
+
+
+ + ♻ ☆ SABER-6D: Shape Representation Based Implicit Object Pose Estimation ECCV 2024 + + +
+ In this paper, we propose a novel encoder-decoder architecture, named SABER, to learn the 6D pose of an object in an embedding space by learning a shape representation at a given pose from an RGB image input. Shape representation is performed as an auxiliary task that helps in learning the rotation space of an object from 2D images. An image encoder predicts the rotation in the embedding space, and the DeepSDF-based decoder learns to represent the object's shape at the given pose. As our approach is shape-based, the pipeline is suitable for any type of object irrespective of symmetry. Moreover, we need only a CAD model of the object to train SABER. Our pipeline is trained on synthetic data and can handle symmetric objects without symmetry labels; thus, no additional labeled training data is needed. The experimental evaluation shows that our method achieves results close to the benchmark for both symmetric and asymmetric objects on the Occlusion-LineMOD and T-LESS datasets. + +
+
+ comment: ECCV 2024 R6D workshop +
+
+
+
+
+ + ♻ ☆ A Deep-Learning-Based Label-free No-Reference Image Quality Assessment + Metric: Application in Sodium MRI Denoising + + +
+ New multinuclear MRI techniques, such as sodium MRI, generally suffer from low image quality due to an inherently low signal. Postprocessing methods, such as image denoising, have been developed for image enhancement. However, assessing these enhanced images is challenging, especially when high-resolution, high-signal reference images are lacking, as in sodium MRI. No-reference Image Quality Assessment (NR-IQA) metrics are one approach to this problem. Existing learning-based NR-IQA metrics rely on labels derived from subjective human opinions or metrics like Signal-to-Noise Ratio (SNR), which are either time-consuming to obtain or lack accurate ground truths, resulting in unreliable assessment. We note that deep learning (DL) models are specialized to their training set, meaning that deviations of the input test data from the training data will reduce prediction accuracy. Therefore, we propose a novel DL-based NR-IQA metric, the Model Specialization Metric (MSM), which does not depend on ground-truth images or labels. MSM measures the difference between the input image and the model's prediction to evaluate the quality of the input image. Experiments conducted on both simulated distorted proton T1-weighted MR images and denoised sodium MR images demonstrate that MSM exhibits superior evaluation performance on various simulated noises and distortions. MSM also shows substantial agreement with expert evaluations, achieving an average Cohen's Kappa coefficient of 0.6528, outperforming existing NR-IQA metrics. + +
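The core idea, scoring an input by how far it deviates from what a model specialized on clean data predicts, can be sketched in a few lines; the placeholder denoiser and the MSE distance below are illustrative assumptions, not the paper's exact metric.

```python
# Minimal sketch of a model-specialization score: a larger input-vs-prediction
# discrepancy suggests the input deviates from the training distribution
# (i.e., lower quality). The "denoiser" here is a placeholder module.
import torch

def model_specialization_metric(model, image):
    """Return a scalar discrepancy between the image and the model's prediction."""
    model.eval()
    with torch.no_grad():
        pred = model(image)
    return torch.mean((pred - image) ** 2).item()  # MSE; other distances also work

# Toy usage with a placeholder convolutional "denoiser"
denoiser = torch.nn.Conv2d(1, 1, kernel_size=3, padding=1)
img = torch.rand(1, 1, 64, 64)
print(model_specialization_metric(denoiser, img))
```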
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State + Space Model ACM MM 2024 + + +
+ Existing Transformer-based models for point cloud analysis suffer from +quadratic complexity, leading to compromised point cloud resolution and +information loss. In contrast, the newly proposed Mamba model, based on state +space models (SSM), outperforms Transformer in multiple areas with only linear +complexity. However, the straightforward adoption of Mamba does not achieve +satisfactory performance on point cloud tasks. In this work, we present +Mamba3D, a state space model tailored for point cloud learning to enhance local +feature extraction, achieving superior performance, high efficiency, and +scalability potential. Specifically, we propose a simple yet effective Local +Norm Pooling (LNP) block to extract local geometric features. Additionally, to +obtain better global features, we introduce a bidirectional SSM (bi-SSM) with +both a token forward SSM and a novel backward SSM that operates on the feature +channel. Extensive experimental results show that Mamba3D surpasses +Transformer-based counterparts and concurrent works in multiple tasks, with or +without pre-training. Notably, Mamba3D achieves multiple SoTA, including an +overall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1% +(with single-modal pre-training) on the ModelNet40 classification task, with +only linear complexity. Our code and weights are available at +https://github.com/xhanxu/Mamba3D. + +
+
+ comment: ACM MM 2024. Code and weights are available at + https://github.com/xhanxu/Mamba3D +
+
+
+
+
+ + ♻ ☆ Enabling Local Editing in Diffusion Models by Joint and Individual + Component Analysis BMVC2024 + + +
+ Recent advances in Diffusion Models (DMs) have led to significant progress in visual synthesis and editing tasks, establishing them as a strong competitor to Generative Adversarial Networks (GANs). However, the latent space of DMs is not as well understood as that of GANs. Recent research has focused on unsupervised semantic discovery in the latent space of DMs by leveraging the bottleneck layer of the denoising network, which has been shown to exhibit properties of a semantic latent space. However, these approaches are limited to discovering global attributes. In this paper, we address the challenge of local image manipulation in DMs and introduce an unsupervised method to factorize the latent semantics learned by the denoising network of pre-trained DMs. Given an arbitrary image and defined regions of interest, we utilize the Jacobian of the denoising network to establish a relation between the regions of interest and their corresponding subspaces in the latent space. Furthermore, we disentangle the joint and individual components of these subspaces to identify latent directions that enable local image manipulation. Once discovered, these directions can be applied to different images to produce semantically consistent edits, making our method suitable for practical applications. Experimental results on various datasets demonstrate that our method can produce semantic edits that are more localized and have better fidelity compared to the state-of-the-art. + +
+
+ comment: Accepted at BMVC2024 +
+
+
+
+
+ + ♻ ☆ Structured Generative Models for Scene Understanding + + +
+ This position paper argues for the use of structured generative models (SGMs) for the understanding of static scenes. This requires the reconstruction of a 3D scene from an input image (or a set of multi-view images), whereby the contents of the image(s) are causally explained in terms of models of instantiated objects, each with their own type, shape, appearance and pose, along with global variables like scene lighting and camera parameters. This approach also requires scene models which account for the co-occurrences and inter-relationships of objects in a scene. The SGM approach has the merits that it is compositional and generative, which lead to interpretability and editability. To pursue the SGM agenda, we need models for objects and scenes, and approaches to carry out inference. We first review models for objects, which include "things" (object categories that have a well defined shape) and "stuff" (categories which have amorphous spatial extent). We then move on to review scene models which describe the inter-relationships of objects. Perhaps the most challenging problem for SGMs is inference of the objects, lighting and camera parameters, and scene inter-relationships from input consisting of a single or multiple images. We conclude with a discussion of issues that need addressing to advance the SGM agenda. + +
+
+ comment: 32 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ MAPL: Memory Augmentation and Pseudo-Labeling for Semi-Supervised + Anomaly Detection + + +
+ Large amounts of unlabeled data and difficult-to-identify anomalies are urgent issues to overcome in most industrial scenes. In order to address this issue, a new methodology for detecting surface defects in industrial settings is introduced, referred to as Memory Augmentation and Pseudo-Labeling (MAPL). The methodology first introduces an anomaly simulation strategy, which significantly improves the model's ability to recognize rare or unknown anomaly types by generating simulated anomaly samples. To cope with the lack of labels for the simulated anomaly samples, a pseudo-labeler method based on a one-classifier ensemble is employed in this study, which enhances the robustness of the model in the case of limited labeled data by automatically selecting key pseudo-labeling hyperparameters. Meanwhile, a memory-enhanced learning mechanism is introduced to effectively predict abnormal regions by analyzing the difference between the input samples and the normal samples in the memory pool. An end-to-end learning framework is employed by MAPL to identify abnormal regions directly from the input data, which optimizes the efficiency and real-time performance of detection. In extensive trials on the recently developed BHAD dataset (including MVTec AD [1], Visa [2], and MDPP [3]), MAPL achieves an average image-level AUROC score of 86.2%, a 5.1% improvement over the original MemSeg [4] model. The source code is available at https://github.com/jzc777/MAPL. + +
+
+
+
+
+ + ♻ ☆ MCDubber: Multimodal Context-Aware Expressive Video Dubbing SC2024 + + +
+ Automatic Video Dubbing (AVD) aims to take the given script and generate +speech that aligns with lip motion and prosody expressiveness. Current AVD +models mainly utilize visual information of the current sentence to enhance the +prosody of synthesized speech. However, it is crucial to consider whether the +prosody of the generated dubbing aligns with the multimodal context, as the +dubbing will be combined with the original context in the final video. This +aspect has been overlooked in previous studies. To address this issue, we +propose a Multimodal Context-aware video Dubbing model, termed +\textbf{MCDubber}, to convert the modeling object from a single sentence to a +longer sequence with context information to ensure the consistency of the +global context prosody. MCDubber comprises three main components: (1) A context +duration aligner aims to learn the context-aware alignment between the text and +lip frames; (2) A context prosody predictor seeks to read the global context +visual sequence and predict the context-aware global energy and pitch; (3) A +context acoustic decoder ultimately predicts the global context mel-spectrogram +with the assistance of adjacent ground-truth mel-spectrograms of the target +sentence. Through this process, MCDubber fully considers the influence of +multimodal context on the prosody expressiveness of the current sentence when +dubbing. The extracted mel-spectrogram belonging to the target sentence from +the output context mel-spectrograms is the final required dubbing audio. +Extensive experiments on the Chem benchmark dataset demonstrate that our +MCDubber significantly improves dubbing expressiveness compared to all advanced +baselines. The code and demos are available at +https://github.com/XiaoYuanJun-zy/MCDubber. + +
+
+ comment: Accepted by NCMMSC2024 +
+
+
+
+
+ + ♻ ☆ UniUSNet: A Promptable Framework for Universal Ultrasound Disease + Prediction and Tissue Segmentation + + +
+ Ultrasound is widely used in clinical practice due to its affordability, +portability, and safety. However, current AI research often overlooks combined +disease prediction and tissue segmentation. We propose UniUSNet, a universal +framework for ultrasound image classification and segmentation. This model +handles various ultrasound types, anatomical positions, and input formats, +excelling in both segmentation and classification tasks. Trained on a +comprehensive dataset with over 9.7K annotations from 7 distinct anatomical +positions, our model matches state-of-the-art performance and surpasses +single-dataset and ablated models. Zero-shot and fine-tuning experiments show +strong generalization and adaptability with minimal fine-tuning. We plan to +expand our dataset and refine the prompting mechanism, with model weights and +code available at (https://github.com/Zehui-Lin/UniUSNet). + +
+
+ comment: Accepted to BIBM 2024 +
+
+
+
+
+ + ♻ ☆ GuidedNet: Semi-Supervised Multi-Organ Segmentation via Labeled Data + Guide Unlabeled Data ACM MM2024 + + +
+ Semi-supervised multi-organ medical image segmentation aids physicians in improving disease diagnosis and treatment planning and reduces the time and effort required for organ annotation. Existing state-of-the-art methods train the labeled data with ground truths and train the unlabeled data with pseudo-labels. However, the two training flows are separate, which does not reflect the interrelationship between labeled and unlabeled data. To address this issue, we propose a semi-supervised multi-organ segmentation method called GuidedNet, which leverages the knowledge from labeled data to guide the training of unlabeled data. The primary goals of this study are to improve the quality of pseudo-labels for unlabeled data and to enhance the network's learning capability for both small and complex organs. A key concept is that voxel features from labeled and unlabeled data that are close to each other in the feature space are more likely to belong to the same class. On this basis, a 3D Consistent Gaussian Mixture Model (3D-CGMM) is designed to leverage the feature distributions from labeled data to rectify the generated pseudo-labels. Furthermore, we introduce a Knowledge Transfer Cross Pseudo Supervision (KT-CPS) strategy, which leverages the prior knowledge obtained from the labeled data to guide the training of the unlabeled data, thereby improving the segmentation accuracy for both small and complex organs. Extensive experiments on two public datasets, FLARE22 and AMOS, demonstrated that GuidedNet is capable of achieving state-of-the-art performance. The source code and our proposed model are available at https://github.com/kimjisoo12/GuidedNet. + +
+
+ comment: Accepted by ACM MM2024, 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ PD-APE: A Parallel Decoding Framework with Adaptive Position Encoding + for 3D Visual Grounding + + +
+ 3D visual grounding aims to identify objects in 3D point cloud scenes that +match specific natural language descriptions. This requires the model to not +only focus on the target object itself but also to consider the surrounding +environment to determine whether the descriptions are met. Most previous works +attempt to accomplish both tasks within the same module, which can easily lead +to a distraction of attention. To this end, we propose PD-APE, a dual-branch +decoding framework that separately decodes target object attributes and +surrounding layouts. Specifically, in the target object branch, the decoder +processes text tokens that describe features of the target object (e.g., +category and color), guiding the queries to pay attention to the target object +itself. In the surrounding branch, the queries align with other text tokens +that carry surrounding environment information, making the attention maps +accurately capture the layout described in the text. Benefiting from the +proposed dual-branch design, the queries are allowed to focus on points +relevant to each branch's specific objective. Moreover, we design an adaptive +position encoding method for each branch respectively. In the target object +branch, the position encoding relies on the relative positions between seed +points and predicted 3D boxes. In the surrounding branch, the attention map is +additionally guided by the confidence between visual and text features, +enabling the queries to focus on points that have valuable layout information. +Extensive experiments demonstrate that we surpass the state-of-the-art on two +widely adopted 3D visual grounding datasets, ScanRefer and Nr3D. + +
+
+
+
+
+ + ♻ ☆ DORec: Decomposed Object Reconstruction and Segmentation Utilizing 2D + Self-Supervised Features + + +
+ Recovering the 3D geometry and textures of individual objects is crucial for many robotics applications, such as manipulation, pose estimation, and autonomous driving. However, decomposing a target object from a complex background is challenging. Most existing approaches rely on costly manual labels to acquire object instance perception. Recent advancements in 2D self-supervised learning offer new prospects for identifying objects of interest, yet leveraging such noisy 2D features for clean decomposition remains difficult. In this paper, we propose a Decomposed Object Reconstruction (DORec) network based on neural implicit representations. Our key idea is to use 2D self-supervised features to create two levels of masks for supervision: a binary mask for foreground regions and a K-cluster mask for semantically similar regions. These complementary masks result in robust decomposition. Experimental results on different datasets show DORec's superiority in segmenting and reconstructing diverse foreground objects from varied backgrounds, enabling downstream tasks such as pose estimation. + +
+
+
+
+
+ + ♻ ☆ FRDiff : Feature Reuse for Universal Training-free Acceleration of + Diffusion Models ECCV 2024 + + +
+ The substantial computational costs of diffusion models, especially due to +the repeated denoising steps necessary for high-quality image generation, +present a major obstacle to their widespread adoption. While several studies +have attempted to address this issue by reducing the number of score function +evaluations (NFE) using advanced ODE solvers without fine-tuning, the decreased +number of denoising iterations misses the opportunity to update fine details, +resulting in noticeable quality degradation. In our work, we introduce an +advanced acceleration technique that leverages the temporal redundancy inherent +in diffusion models. Reusing feature maps with high temporal similarity opens +up a new opportunity to save computation resources without compromising output +quality. To realize the practical benefits of this intuition, we conduct an +extensive analysis and propose a novel method, FRDiff. FRDiff is designed to +harness the advantages of both reduced NFE and feature reuse, achieving a +Pareto frontier that balances fidelity and latency trade-offs in various +generative tasks. + +
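The feature-reuse intuition, skipping recomputation of a block when its input barely changes between adjacent denoising steps, can be sketched as a small caching wrapper; the similarity threshold and the wrapped block are illustrative assumptions, not FRDiff's actual reuse schedule.

```python
# Minimal sketch of temporal feature reuse: reuse the cached output of an
# expensive block when its current input is very similar to the previous one.
import torch

class FeatureReuseBlock(torch.nn.Module):
    def __init__(self, block, sim_threshold=0.99):
        super().__init__()
        self.block = block
        self.sim_threshold = sim_threshold
        self._prev_in = None
        self._prev_out = None

    def forward(self, x):
        if self._prev_in is not None:
            sim = torch.nn.functional.cosine_similarity(
                x.flatten(1), self._prev_in.flatten(1), dim=1).mean()
            if sim > self.sim_threshold:          # temporally redundant -> reuse cache
                return self._prev_out
        out = self.block(x)
        self._prev_in, self._prev_out = x.detach(), out.detach()
        return out

# Toy usage: wrap a block and call it across "denoising steps"
block = FeatureReuseBlock(torch.nn.Sequential(torch.nn.Conv2d(4, 4, 3, padding=1)))
x = torch.rand(1, 4, 32, 32)
for _ in range(5):
    x = block(x + 0.001 * torch.randn_like(x))
```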
+
+ comment: Accepted at ECCV 2024. Code : + https://github.com/ECoLab-POSTECH/FRDiff +
+
+
+
+
+ + ♻ ☆ Evolution-aware VAriance (EVA) Coreset Selection for Medical Image + Classification + + +
+ In the medical field, managing high-dimensional, massive medical imaging data and performing reliable medical analysis from it is a critical challenge, especially in resource-limited environments such as remote medical facilities and mobile devices. This necessitates effective dataset compression techniques to reduce storage, transmission, and computational cost. However, existing coreset selection methods are primarily designed for natural image datasets and exhibit questionable effectiveness when applied to medical image datasets, due to challenges such as intra-class variation and inter-class similarity. In this paper, we propose a novel coreset selection strategy termed Evolution-aware VAriance (EVA), which captures the evolutionary process of model training through a dual-window approach and reflects the fluctuation of sample importance more precisely through variance measurement. Extensive experiments on medical image datasets demonstrate the effectiveness of our strategy over previous SOTA methods, especially at high compression rates. EVA achieves 98.27% accuracy with only 10% of the training data, compared to 97.20% for the full training set. None of the compared baseline methods exceeds Random at a 5% selection rate, while EVA outperforms Random by 5.61%, showcasing its potential for efficient medical image analysis. + +
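A minimal sketch of the underlying idea, scoring each sample by the variance of a per-sample training signal over a recent window of epochs and keeping the highest-variance samples, is shown below; the window size, keep ratio, and the use of the raw loss as the importance signal are illustrative assumptions rather than the paper's exact dual-window formulation.

```python
# Minimal sketch of variance-based coreset selection over a training window.
import numpy as np

def select_coreset(loss_history, keep_ratio=0.1, window=5):
    """loss_history: (num_epochs, num_samples) array of per-sample losses."""
    recent = loss_history[-window:]                    # last `window` epochs
    scores = recent.var(axis=0)                        # per-sample variance of the signal
    k = max(1, int(keep_ratio * loss_history.shape[1]))
    return np.argsort(-scores)[:k]                     # indices of the selected coreset

# Toy usage: 20 epochs, 1000 samples
rng = np.random.default_rng(0)
history = rng.random((20, 1000))
coreset_idx = select_coreset(history, keep_ratio=0.1)
print(len(coreset_idx))
```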
+
+ comment: Accepted by ACM Multimedia 2024 (oral), see: + https://openreview.net/forum?id=m1qrB9KSYD +
+
+
+
+
+ + ♻ ☆ Biometrics and Behavior Analysis for Detecting Distractions in + e-Learning + + +
+ In this article, we explore computer vision approaches to detect abnormal head pose during e-learning sessions, and we introduce a study on the effects of mobile phone usage during these sessions. We utilize behavioral data collected from 120 learners monitored while participating in MOOC learning sessions. Our study focuses on the influence of phone-usage events on behavior and physiological responses, specifically attention, heart rate, and meditation, before, during, and after phone usage. Additionally, we propose an approach for estimating head pose events using images taken by the webcam during the MOOC learning sessions to detect phone-usage events. Our hypothesis suggests that head posture undergoes significant changes when learners interact with a mobile phone, contrasting with the typical behavior seen when learners face a computer during e-learning sessions. We propose an approach designed to detect deviations in head posture from the average observed during a learner's session, operating as a semi-supervised method. This system flags events indicating alterations in head posture for subsequent human review, and selects mobile phone usage occurrences with a sensitivity of over 90%. + +
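The deviation-from-session-average rule described above can be sketched as a simple per-axis z-score test on estimated head-pose angles; the threshold and the synthetic "looking down" episode in the toy usage are illustrative assumptions.

```python
# Minimal sketch of flagging head-pose deviations from a learner's session average
# for subsequent human review. The angles would come from a head-pose estimator.
import numpy as np

def flag_pose_events(yaw_pitch_roll, z_threshold=2.5):
    """yaw_pitch_roll: (num_frames, 3) head-pose angles in degrees."""
    mean = yaw_pitch_roll.mean(axis=0)
    std = yaw_pitch_roll.std(axis=0) + 1e-8
    z = np.abs((yaw_pitch_roll - mean) / std)          # per-axis z-scores
    return np.where(z.max(axis=1) > z_threshold)[0]    # frames flagged for review

# Toy usage: a session with a downward-glance episode around frames 200-220
rng = np.random.default_rng(0)
poses = rng.normal(0, 3, size=(600, 3))
poses[200:220, 1] -= 35                                # sharp pitch change (looking down)
print(flag_pose_events(poses)[:10])
```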
+
+ comment: Published in IEEE Intl. Symposium on Computers in Education (SIIE) + 2024 +
+
+
+
+
+ + ♻ ☆ VAAD: Visual Attention Analysis Dashboard applied to e-Learning + + +
+ In this paper, we present an approach in the Multimodal Learning Analytics +field. Within this approach, we have developed a tool to visualize and analyze +eye movement data collected during learning sessions in online courses. The +tool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These +eye movement data have been gathered using an eye-tracker and subsequently +processed and visualized for interpretation. The purpose of the tool is to +conduct a descriptive analysis of the data by facilitating its visualization, +enabling the identification of differences and learning patterns among various +learner populations. Additionally, it integrates a predictive module capable of +anticipating learner activities during a learning session. Consequently, VAAD +holds the potential to offer valuable insights into online learning behaviors +from both descriptive and predictive perspectives. + +
+
+ comment: Published in IEEE Intl. Symposium on Computers in Education (SIIE) + 2024 +
+
+
+
+
+ + ♻ ☆ Dual-scale Enhanced and Cross-generative Consistency Learning for + Semi-supervised Medical Image Segmentation + + +
+ Medical image segmentation plays a crucial role in computer-aided diagnosis. +However, existing methods heavily rely on fully supervised training, which +requires a large amount of labeled data with time-consuming pixel-wise +annotations. Moreover, accurately segmenting lesions poses challenges due to +variations in shape, size, and location. To address these issues, we propose a +novel Dual-scale Enhanced and Cross-generative consistency learning framework +for semi-supervised medical image Segmentation (DEC-Seg). First, we propose a +Cross-level Feature Aggregation (CFA) module that integrates cross-level +adjacent layers to enhance the feature representation ability across different +resolutions. To address scale variation, we present a scale-enhanced +consistency constraint, which ensures consistency in the segmentation maps +generated from the same input image at different scales. This constraint helps +handle variations in lesion sizes and improves the robustness of the model. +Furthermore, we propose a cross-generative consistency scheme, in which the +original and perturbed images can be reconstructed using cross-segmentation +maps. This consistency constraint allows us to mine effective feature +representations and boost the segmentation performance. To further exploit the +scale information, we propose a Dual-scale Complementary Fusion (DCF) module +that integrates features from two scale-specific decoders operating at +different scales to help produce more accurate segmentation maps. Extensive +experimental results on multiple medical segmentation tasks (polyp, skin +lesion, and brain glioma) demonstrate the effectiveness of our DEC-Seg against +other state-of-the-art semi-supervised segmentation approaches. The +implementation code will be released at https://github.com/taozh2017/DECSeg. + +
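A scale consistency constraint of the kind described above can be written as a loss between the full-resolution prediction and the upsampled prediction from a downscaled copy of the same image; the placeholder one-channel segmentation head and MSE penalty are illustrative assumptions, not DEC-Seg's exact formulation.

```python
# Minimal sketch of a scale consistency loss: predictions from a downscaled copy,
# once upsampled, should agree with the full-resolution prediction.
import torch
import torch.nn.functional as F

def scale_consistency_loss(model, image, scale=0.5):
    pred_full = model(image)                                        # (B, C, H, W) logits
    small = F.interpolate(image, scale_factor=scale, mode="bilinear", align_corners=False)
    pred_small = model(small)
    pred_small_up = F.interpolate(pred_small, size=pred_full.shape[-2:],
                                  mode="bilinear", align_corners=False)
    return F.mse_loss(torch.sigmoid(pred_small_up), torch.sigmoid(pred_full))

# Toy usage with a placeholder single-class segmentation head
model = torch.nn.Conv2d(3, 1, kernel_size=3, padding=1)
img = torch.rand(2, 3, 128, 128)
loss = scale_consistency_loss(model, img)
loss.backward()
```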
+
+ comment: 12 pages 10 figures +
+
+
+
+
+ + ♻ ☆ Pedestrian Attribute Recognition via CLIP based Prompt Vision-Language + Fusion + + +
+ Existing pedestrian attribute recognition (PAR) algorithms adopt a pre-trained CNN (e.g., ResNet) as their backbone network for visual feature learning, which may yield sub-optimal results due to insufficient use of the relations between pedestrian images and attribute labels. In this paper, we formulate PAR as a vision-language fusion problem and fully exploit the relations between pedestrian images and attribute labels. Specifically, the attribute phrases are first expanded into sentences, and then the pre-trained vision-language model CLIP is adopted as our backbone for feature embedding of visual images and attribute descriptions. The contrastive learning objective connects the vision and language modalities well in the CLIP-based feature space, and the Transformer layers used in CLIP can capture the long-range relations between pixels. Then, a multi-modal Transformer is adopted to fuse the dual features effectively, and a feed-forward network is used to predict attributes. To optimize our network efficiently, we propose a region-aware prompt tuning technique that adjusts very few parameters (i.e., only the prompt vectors and classification heads) and fixes both the pre-trained VL model and the multi-modal Transformer. Our proposed PAR algorithm adjusts only 0.75% of the learnable parameters compared with the fine-tuning strategy. It also achieves new state-of-the-art performance in both standard and zero-shot settings for PAR, including the RAPv1, RAPv2, WIDER, PA100K, PETA-ZS, and RAP-ZS datasets. The source code and pre-trained models will be released at https://github.com/Event-AHU/OpenPAR. + +
+
+ comment: Accepted by IEEE TCSVT 2024, Camera Ready Version +
+
+
+
+
+ + ♻ ☆ Disease Classification and Impact of Pretrained Deep Convolution Neural + Networks on Diverse Medical Imaging Datasets across Imaging Modalities + + +
+ Imaging techniques such as chest X-rays, whole-slide images, and optical coherence tomography serve as the initial screening and detection tools for a wide variety of medical conditions, including pulmonary and ophthalmic diseases. This paper investigates the intricacies of using pretrained deep convolutional neural networks with transfer learning across diverse medical imaging datasets with varying modalities for binary and multiclass classification. We conducted a comprehensive performance analysis with ten network architectures and model families, each with pretraining and random initialization. Our findings show that using pretrained models as fixed feature extractors yields poor performance irrespective of the dataset. In contrast, histopathology microscopy whole-slide images achieve better performance. It is also found that deeper and more complex architectures did not necessarily result in the best performance. This observation implies that improvements on ImageNet do not carry over directly to medical imaging tasks. Within a medical domain, the performance of the network architectures varies within model families as datasets shift. This indicates that the performance of models within a specific modality may not be conclusive for another modality within the same domain. This study provides a deeper understanding of the applications of deep learning techniques in medical imaging and highlights the impact of pretrained networks across different medical imaging datasets under five different experimental settings. + +
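The two transfer-learning settings compared above, a frozen pretrained backbone used as a fixed feature extractor versus end-to-end fine-tuning, can be set up in a few lines of PyTorch; the ResNet-18 backbone and two-class head below are illustrative choices, not the specific architectures studied in the paper.

```python
# Minimal sketch of "fixed feature extractor" vs. "fine-tuning" transfer learning.
import torch
import torchvision

def build_model(num_classes=2, freeze_backbone=True):
    # Loads ImageNet-pretrained weights (downloaded on first use).
    model = torchvision.models.resnet18(weights="IMAGENET1K_V1")
    if freeze_backbone:                      # fixed feature extractor setting
        for p in model.parameters():
            p.requires_grad = False
    model.fc = torch.nn.Linear(model.fc.in_features, num_classes)  # new task head
    return model

fixed_extractor = build_model(freeze_backbone=True)    # only the head is trained
finetuned = build_model(freeze_backbone=False)         # all layers are updated
```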
+
+ comment: 15 pages, 3 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Instant Adversarial Purification with Adversarial Consistency + Distillation + + +
+ Neural networks, despite their remarkable performance in widespread applications, including image classification, are also known to be vulnerable to subtle adversarial noise. Although some diffusion-based purification methods have been proposed, for example, DiffPure, those methods are time-consuming. In this paper, we propose One Step Control Purification (OSCP), a diffusion-based purification model that can purify an adversarial image in one Neural Function Evaluation (NFE) of the diffusion model. We use a Latent Consistency Model (LCM) and ControlNet for our one-step purification. OSCP is computationally friendly and time-efficient compared to other diffusion-based purification methods; we achieve a defense success rate of 74.19% on ImageNet, requiring only 0.1s for each purification. Moreover, there is a fundamental incongruence between consistency distillation and adversarial perturbation. To address this dissonance, we propose Gaussian Adversarial Noise Distillation (GAND), a novel consistency distillation framework that facilitates a more nuanced reconciliation of the latent space dynamics, effectively bridging the natural and adversarial manifolds. Our experiments show that GAND does not require full fine-tuning (FFT); parameter-efficient fine-tuning (PEFT), e.g., LoRA, is sufficient. + +
+
+
+
+
+ + ♻ ☆ Show Me the World in My Language: Establishing the First Baseline for + Scene-Text to Scene-Text Translation ICPR 2024 + + +
+ In this work, we study the task of ``visually'' translating scene text from a +source language (e.g., Hindi) to a target language (e.g., English). Visual +translation involves not just the recognition and translation of scene text but +also the generation of the translated image that preserves visual features of +the source scene text, such as font, size, and background. There are several +challenges associated with this task, such as translation with limited context, +deciding between translation and transliteration, accommodating varying text +lengths within fixed spatial boundaries, and preserving the font and background +styles of the source scene text in the target language. To address this +problem, we make the following contributions: (i) We study visual translation +as a standalone problem for the first time in the literature. (ii) We present a +cascaded framework for visual translation that combines state-of-the-art +modules for scene text recognition, machine translation, and scene text +synthesis as a baseline for the task. (iii) We propose a set of task-specific +design enhancements to design a variant of the baseline to obtain performance +improvements. (iv) Currently, the existing related literature lacks any +comprehensive performance evaluation for this novel task. To fill this gap, we +introduce several automatic and user-assisted evaluation metrics designed +explicitly for evaluating visual translation. Further, we evaluate presented +baselines for translating scene text between Hindi and English. Our experiments +demonstrate that although we can effectively perform visual translation over a +large collection of scene text images, the presented baseline only partially +addresses challenges posed by visual translation tasks. We firmly believe that +this new task and the limitations of existing models, as reported in this +paper, should encourage further research in visual translation. + +
+
+ comment: Accepted at ICPR 2024, Project Website: + https://vl2g.github.io/projects/visTrans/ +
+
+
+
+
+ + ♻ ☆ An Image is Worth 1/2 Tokens After Layer 2: Plug-and-Play Inference + Acceleration for Large Vision-Language Models ECCV 2024 + + +
+ In this study, we identify an inefficient attention phenomenon in Large Vision-Language Models (LVLMs), notably within prominent models like LLaVA-1.5, QwenVL-Chat and Video-LLaVA. We find that the attention computation over visual tokens is extremely inefficient in the deep layers of popular LVLMs, suggesting a need for a sparser approach than is used for textual data. To this end, we introduce FastV, a versatile plug-and-play method designed to optimize computational efficiency by learning adaptive attention patterns in early layers and pruning visual tokens in subsequent ones. Our evaluations demonstrate FastV's ability to dramatically reduce computational costs (e.g., a 45% reduction in FLOPs for LLaVA-1.5-13B) without sacrificing performance in a wide range of image and video understanding tasks. The trade-off between computational efficiency and performance in FastV is highly customizable and Pareto-efficient. It can compress the FLOPs of a 13B-parameter model to below the budget of a 7B-parameter model while still maintaining superior performance. We believe FastV has practical value for the deployment of LVLMs on edge devices and in commercial models. Code is released at https://github.com/pkunlp-icler/FastV. + +
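The pruning step can be pictured as ranking image tokens by the attention they receive after an early layer and keeping only the top fraction; the sketch below is a generic illustration of that idea with placeholder shapes and keep ratio, not the released FastV implementation.

```python
# Minimal sketch of attention-based visual token pruning after an early layer.
import torch

def prune_visual_tokens(hidden_states, attn_weights, visual_slice, keep_ratio=0.5):
    """
    hidden_states: (B, T, D) token embeddings after some layer
    attn_weights:  (B, heads, T, T) attention weights from that layer
    visual_slice:  slice marking the image-token positions in the sequence
    """
    vis_start, vis_stop = visual_slice.start, visual_slice.stop
    # Average attention received by each visual token over heads and query positions.
    received = attn_weights.mean(dim=1).mean(dim=1)[:, vis_start:vis_stop]  # (B, V)
    k = max(1, int(keep_ratio * received.shape[1]))
    keep = received.topk(k, dim=1).indices.sort(dim=1).values + vis_start   # keep order
    batch_idx = torch.arange(hidden_states.size(0)).unsqueeze(1)
    kept_visual = hidden_states[batch_idx, keep]                            # (B, k, D)
    return torch.cat([hidden_states[:, :vis_start],
                      kept_visual,
                      hidden_states[:, vis_stop:]], dim=1)

# Toy usage: 8 text tokens, 16 visual tokens, 8 more text tokens
B, T, D, H = 2, 32, 64, 4
h = torch.rand(B, T, D)
attn = torch.softmax(torch.rand(B, H, T, T), dim=-1)
print(prune_visual_tokens(h, attn, slice(8, 24)).shape)   # (2, 24, 64) with keep_ratio=0.5
```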
+
+ comment: Accepted to ECCV 2024 (Oral), code is released at + https://github.com/pkunlp-icler/FastV, +
+
+
+
+
+ + ♻ ☆ Video Diffusion Models are Strong Video Inpainter + + +
+ Propagation-based video inpainting using optical flow at the pixel or feature +level has recently garnered significant attention. However, it has limitations +such as the inaccuracy of optical flow prediction and the propagation of noise +over time. These issues result in non-uniform noise and time consistency +problems throughout the video, which are particularly pronounced when the +removed area is large and involves substantial movement. To address these +issues, we propose a novel First Frame Filling Video Diffusion Inpainting model +(FFF-VDI). We design FFF-VDI inspired by the capabilities of pre-trained +image-to-video diffusion models that can transform the first frame image into a +highly natural video. To apply this to the video inpainting task, we propagate +the noise latent information of future frames to fill the masked areas of the +first frame's noise latent code. Next, we fine-tune the pre-trained +image-to-video diffusion model to generate the inpainted video. The proposed +model addresses the limitations of existing methods that rely on optical flow +quality, producing much more natural and temporally consistent videos. This +proposed approach is the first to effectively integrate image-to-video +diffusion models into video inpainting tasks. Through various comparative +experiments, we demonstrate that the proposed model can robustly handle diverse +inpainting types with high quality. + +
+
+
+
+
+ + ♻ ☆ A Grey-box Attack against Latent Diffusion Model-based Image Editing by + Posterior Collapse + + +
+ Recent advancements in generative AI, particularly Latent Diffusion Models (LDMs), have revolutionized image synthesis and manipulation. However, these generative techniques raise concerns about data misappropriation and intellectual property infringement. Adversarial attacks on machine learning models have been extensively studied, and a well-established body of research has extended these techniques as a benign means of preventing the underlying misuse of generative AI. Current approaches to safeguarding images from manipulation by LDMs are limited by their reliance on model-specific knowledge and their inability to significantly degrade the semantic quality of generated images. In response to these shortcomings, we propose the Posterior Collapse Attack (PCA), based on the observation that VAEs suffer from posterior collapse during training. Our method minimizes dependence on white-box information about target models, removing the implicit reliance on model-specific knowledge. By accessing only a small subset of LDM parameters, specifically the VAE encoder, our method causes a substantial semantic collapse in generation quality, particularly in perceptual consistency, and demonstrates strong transferability across various model architectures. Experimental results show that PCA achieves superior perturbation effects on the image generation of LDMs with lower runtime and VRAM usage. Our method outperforms existing techniques, offering a more robust and generalizable solution that helps alleviate the socio-technical challenges posed by the rapidly evolving landscape of generative AI. + +
+
+ comment: 21 pages, 7 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Event Voxel Set Transformer for Spatiotemporal Representation Learning + on Event Streams + + +
+ Event cameras are neuromorphic vision sensors that record a scene as sparse +and asynchronous event streams. Most event-based methods project events into +dense frames and process them using conventional vision models, resulting in +high computational complexity. A recent trend is to develop point-based +networks that achieve efficient event processing by learning sparse +representations. However, existing works may lack robust local information +aggregators and effective feature interaction operations, thus limiting their +modeling capabilities. To this end, we propose an attention-aware model named +Event Voxel Set Transformer (EVSTr) for efficient spatiotemporal representation +learning on event streams. It first converts the event stream into voxel sets +and then hierarchically aggregates voxel features to obtain robust +representations. The core of EVSTr is an event voxel transformer encoder that +consists of two well-designed components, including the Multi-Scale Neighbor +Embedding Layer (MNEL) for local information aggregation and the Voxel +Self-Attention Layer (VSAL) for global feature interaction. Enabling the +network to incorporate a long-range temporal structure, we introduce a segment +modeling strategy (S$^{2}$TM) to learn motion patterns from a sequence of +segmented voxel sets. The proposed model is evaluated on two recognition tasks, +including object classification and action recognition. To provide a convincing +model evaluation, we present a new event-based action recognition dataset +(NeuroHAR) recorded in challenging scenarios. Comprehensive experiments show +that EVSTr achieves state-of-the-art performance while maintaining low model +complexity. + +
+
+ comment: Accepted by IEEE Transactions on Circuits and Systems for Video + Technology (TCSVT) +
+
+
+
+
+ + ♻ ☆ MM-Soc: Benchmarking Multimodal Large Language Models in Social Media + Platforms ACL 2024 + + +
+ Social media platforms are hubs for multimodal information exchange, encompassing text, images, and videos, making it challenging for machines to comprehend the information or emotions associated with interactions in online spaces. Multimodal Large Language Models (MLLMs) have emerged as a promising solution to these challenges, yet they struggle to accurately interpret human emotions and complex content such as misinformation. This paper introduces MM-Soc, a comprehensive benchmark designed to evaluate MLLMs' understanding of multimodal social media content. MM-Soc compiles prominent multimodal datasets and incorporates a novel large-scale YouTube tagging dataset, targeting a range of tasks including misinformation detection, hate speech detection, and social context generation. Through our exhaustive evaluation of ten size variants of four open-source MLLMs, we have identified significant performance disparities, highlighting the need for advancements in models' social understanding capabilities. Our analysis reveals that, in a zero-shot setting, various types of MLLMs generally exhibit difficulties in handling social media tasks. However, MLLMs demonstrate performance improvements after fine-tuning, suggesting potential pathways for improvement. Our code and data are available at https://github.com/claws-lab/MMSoc.git. + +
+
+ comment: In Proceedings of ACL 2024 +
+
+
+
+
+ + ♻ ☆ Adapting Segment Anything Model to Multi-modal Salient Object Detection + with Semantic Feature Fusion Guidance + + +
+ Although most existing multi-modal salient object detection (SOD) methods demonstrate effectiveness through training models from scratch, the limited multi-modal data hinders these methods from reaching optimality. In this paper, we propose a novel framework to explore and exploit the powerful feature representation and zero-shot generalization ability of the pre-trained Segment Anything Model (SAM) for multi-modal SOD. Despite serving as a recent vision foundation model, driving the class-agnostic SAM to comprehend and detect salient objects accurately is non-trivial, especially in challenging scenes. To this end, we develop SAM with semantic feature fusion guidance (Sammese), which incorporates multi-modal saliency-specific knowledge into SAM to adapt it to multi-modal SOD tasks. However, it is difficult for SAM, trained on single-modal data, to directly mine the complementary benefits of multi-modal inputs and comprehensively utilize them to achieve accurate saliency prediction. To address these issues, we first design a multi-modal complementary fusion module to extract robust multi-modal semantic features by integrating information from visible and thermal or depth image pairs. Then, we feed the extracted multi-modal semantic features into both the SAM image encoder and mask decoder for fine-tuning and prompting, respectively. Specifically, in the image encoder, a multi-modal adapter is proposed to adapt the single-modal SAM to multi-modal information. In the mask decoder, a semantic-geometric prompt generation strategy is proposed to produce corresponding embeddings with various saliency cues. Extensive experiments on both RGB-D and RGB-T SOD benchmarks show the effectiveness of the proposed framework. The code will be available at https://github.com/Angknpng/Sammese. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ S-NeRF++: Autonomous Driving Simulation via Neural Reconstruction and + Generation + + +
+ Autonomous driving simulation systems play a crucial role in enhancing self-driving data and simulating complex and rare traffic scenarios, ensuring navigation safety. However, traditional simulation systems, which often rely heavily on manual modeling and 2D image editing, struggle to scale to extensive scenes and to generate realistic simulation data. In this study, we present S-NeRF++, an innovative autonomous driving simulation system based on neural reconstruction. Trained on widely used self-driving datasets such as nuScenes and Waymo, S-NeRF++ can generate a large number of realistic street scenes and foreground objects with high rendering quality, as well as offering considerable flexibility in manipulation and simulation. Specifically, S-NeRF++ is an enhanced neural radiance field for synthesizing large-scale scenes and moving vehicles, with improved scene parameterization and camera pose learning. The system effectively utilizes noisy and sparse LiDAR data to refine training and address depth outliers, ensuring high-quality reconstruction and novel-view rendering. It also provides a diverse foreground asset bank by reconstructing and generating different foreground vehicles to support comprehensive scenario creation. Moreover, we have developed an advanced foreground-background fusion pipeline that skillfully integrates illumination and shadow effects, further enhancing the realism of our simulations. With the high-quality simulated data provided by S-NeRF++, we find that perception methods enjoy performance boosts on several autonomous driving downstream tasks, further demonstrating the effectiveness of our proposed simulator. + +
+
+
+
+
+ + ♻ ☆ NutritionVerse: Empirical Study of Various Dietary Intake Estimation + Approaches + + +
+ Accurate dietary intake estimation is critical for informing policies and +programs to support healthy eating, as malnutrition has been directly linked to +decreased quality of life. However self-reporting methods such as food diaries +suffer from substantial bias. Other conventional dietary assessment techniques +and emerging alternative approaches such as mobile applications incur high time +costs and may necessitate trained personnel. Recent work has focused on using +computer vision and machine learning to automatically estimate dietary intake +from food images, but the lack of comprehensive datasets with diverse +viewpoints, modalities and food annotations hinders the accuracy and realism of +such methods. To address this limitation, we introduce NutritionVerse-Synth, +the first large-scale dataset of 84,984 photorealistic synthetic 2D food images +with associated dietary information and multimodal annotations (including depth +images, instance masks, and semantic masks). Additionally, we collect a real +image dataset, NutritionVerse-Real, containing 889 images of 251 dishes to +evaluate realism. Leveraging these novel datasets, we develop and benchmark +NutritionVerse, an empirical study of various dietary intake estimation +approaches, including indirect segmentation-based and direct prediction +networks. We further fine-tune models pretrained on synthetic data with real +images to provide insights into the fusion of synthetic and real data. Finally, +we release both datasets (NutritionVerse-Synth, NutritionVerse-Real) on +https://www.kaggle.com/nutritionverse/datasets as part of an open initiative to +accelerate machine learning for dietary sensing. + +
+
+ comment: Corrections made to Tables 6, 7, and 8, and corrections made to + Experiments Part C. Additional clarification made in Section 4 +
+
+
+
+
+ + ♻ ☆ DarkGS: Learning Neural Illumination and 3D Gaussians Relighting for + Robotic Exploration in the Dark + + +
+ Humans have the remarkable ability to construct consistent mental models of +an environment, even under limited or varying levels of illumination. We wish +to endow robots with this same capability. In this paper, we tackle the +challenge of constructing a photorealistic scene representation under poorly +illuminated conditions and with a moving light source. We approach the task of +modeling illumination as a learning problem, and utilize the developed +illumination model to aid in scene reconstruction. We introduce an innovative +framework that uses a data-driven approach, Neural Light Simulators (NeLiS), to +model and calibrate the camera-light system. Furthermore, we present DarkGS, a +method that applies NeLiS to create a relightable 3D Gaussian scene model +capable of real-time, photorealistic rendering from novel viewpoints. We show +the applicability and robustness of our proposed simulator and system in a +variety of real-world environments. + +
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Towards Non-invasive and Personalized Management of Breast Cancer + Patients from Multiparametric MRI via A Large Mixture-of-Modality-Experts + Model + + +
+ Breast magnetic resonance imaging (MRI) is the imaging technique with the +highest sensitivity for detecting breast cancer and is routinely used for women +at high risk. Despite the comprehensive multiparametric protocol of breast MRI, +existing artificial intelligence-based studies predominantly rely on single +sequences and have limited validation. Here we report a large +mixture-of-modality-experts model (MOME) that integrates multiparametric MRI +information within a unified structure, offering a noninvasive method for +personalized breast cancer management. We have curated the largest +multiparametric breast MRI dataset, involving 5,205 patients from three +hospitals in the north, southeast, and southwest of China, for the development +and extensive evaluation of our model. MOME demonstrated accurate and robust +identification of breast cancer. It achieved comparable performance for +malignancy recognition to that of four senior radiologists and significantly +outperformed a junior radiologist, with 0.913 AUROC, 0.948 AUPRC, 0.905 F1 +score, and 0.723 MCC. Our findings suggest that MOME could reduce the need for +biopsies in BI-RADS 4 patients with a ratio of 7.3%, classify triple-negative +breast cancer with an AUROC of 0.709, and predict pathological complete +response to neoadjuvant chemotherapy with an AUROC of 0.694. The model further +supports scalable and interpretable inference, adapting to missing modalities +and providing decision explanations by highlighting lesions and measuring +modality contributions. MOME exemplifies a discriminative, robust, scalable, +and interpretable multimodal model, paving the way for noninvasive, +personalized management of breast cancer patients based on multiparametric +breast imaging data. + +
+
+ comment: 27 pages, 8 figures, 10 tables +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ Sync from the Sea: Retrieving Alignable Videos from Large-Scale Datasets ECCV 2024 + + +
+ Temporal video alignment aims to synchronize the key events like object +interactions or action phase transitions in two videos. Such methods could +benefit various video editing, processing, and understanding tasks. However, +existing approaches operate under the restrictive assumption that a suitable +video pair for alignment is given, significantly limiting their broader +applicability. To address this, we re-pose temporal alignment as a search +problem and introduce the task of Alignable Video Retrieval (AVR). Given a +query video, our approach can identify well-alignable videos from a large +collection of clips and temporally synchronize them to the query. To achieve +this, we make three key contributions: 1) we introduce DRAQ, a video +alignability indicator to identify and re-rank the best alignable video from a +set of candidates; 2) we propose an effective and generalizable frame-level +video feature design to improve the alignment performance of several +off-the-shelf feature representations, and 3) we propose a novel benchmark and +evaluation protocol for AVR using cycle-consistency metrics. Our experiments on +3 datasets, including large-scale Kinetics700, demonstrate the effectiveness of +our approach in identifying alignable video pairs from diverse datasets. +Project Page: https://daveishan.github.io/avr-webpage/. + +
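A cycle-consistency check for alignment, as used in the evaluation protocol above, can be sketched as a nearest-neighbour round trip between the per-frame features of two videos; the random features and the shifted-copy toy example are illustrative, not the paper's benchmark protocol.

```python
# Minimal sketch of a cycle-consistency error: map each query frame to its nearest
# frame in the other video and back, and measure how far it lands from its start.
import numpy as np

def cycle_consistency_error(feat_a, feat_b):
    """feat_a, feat_b: (Ta, D) and (Tb, D) per-frame features of two videos."""
    nn_ab = (feat_a @ feat_b.T).argmax(axis=1)    # a -> b nearest neighbours
    nn_ba = (feat_b @ feat_a.T).argmax(axis=1)    # b -> a nearest neighbours
    returned = nn_ba[nn_ab]                       # where each frame of A comes back to
    return np.abs(returned - np.arange(len(feat_a))).mean()

# Toy usage: video B is a shifted copy of A, so the cycle error stays near zero
rng = np.random.default_rng(0)
a = rng.normal(size=(50, 16))
b = np.roll(a, 3, axis=0)
print(cycle_consistency_error(a, b))
```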
+
+ comment: ECCV 2024 Oral +
+
+
+
+
+ + ☆ Know When to Fuse: Investigating Non-English Hybrid Retrieval in the + Legal Domain + + +
+ Hybrid search has emerged as an effective strategy to offset the limitations +of different matching paradigms, especially in out-of-domain contexts where +notable improvements in retrieval quality have been observed. However, existing +research predominantly focuses on a limited set of retrieval methods, evaluated +in pairs on domain-general datasets exclusively in English. In this work, we +study the efficacy of hybrid search across a variety of prominent retrieval +models within the unexplored field of law in the French language, assessing +both zero-shot and in-domain scenarios. Our findings reveal that in a zero-shot +context, fusing different domain-general models consistently enhances +performance compared to using a standalone model, regardless of the fusion +method. Surprisingly, when models are trained in-domain, we find that fusion +generally diminishes performance relative to using the best single system, +unless fusing scores with carefully tuned weights. These novel insights, among +others, expand the applicability of prior findings across a new field and +language, and contribute to a deeper understanding of hybrid search in +non-English specialized domains. + +
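Weighted score fusion with carefully tuned weights, as discussed in the findings above, can be sketched as a per-query min-max normalisation followed by a convex combination of lexical and dense scores; the document IDs and the weight alpha below are illustrative.

```python
# Minimal sketch of weighted score fusion for hybrid retrieval (one query).
def fuse_scores(lexical, dense, alpha=0.5):
    """lexical, dense: dicts mapping doc_id -> score for one query."""
    def normalise(scores):
        lo, hi = min(scores.values()), max(scores.values())
        span = (hi - lo) or 1.0
        return {d: (s - lo) / span for d, s in scores.items()}
    lex_n, den_n = normalise(lexical), normalise(dense)
    docs = set(lex_n) | set(den_n)
    fused = {d: alpha * lex_n.get(d, 0.0) + (1 - alpha) * den_n.get(d, 0.0) for d in docs}
    return sorted(fused.items(), key=lambda kv: kv[1], reverse=True)

# Toy usage: BM25-style scores vs. dense cosine similarities for one query
bm25 = {"doc1": 12.3, "doc2": 9.8, "doc3": 4.1}
dense = {"doc2": 0.82, "doc3": 0.80, "doc4": 0.55}
print(fuse_scores(bm25, dense, alpha=0.4))
```

Setting alpha to 0 or 1 recovers the standalone dense or lexical ranking, which makes the fusion weight a natural knob to tune on a validation set.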
+
+ comment: Under review +
+
+
+
+
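+ As a concrete illustration of the score fusion discussed above, here is a minimal sketch (not the paper's code) of two common hybrid-retrieval fusion rules: a weighted sum of min-max-normalised lexical and dense scores, and reciprocal rank fusion. The document IDs, scores, and the weight alpha are made-up examples.
+
+from collections import defaultdict
+
+def minmax(scores: dict) -> dict:
+    lo, hi = min(scores.values()), max(scores.values())
+    return {d: (s - lo) / (hi - lo) if hi > lo else 0.0 for d, s in scores.items()}
+
+def weighted_fusion(lexical: dict, dense: dict, alpha: float = 0.5) -> dict:
+    """Convex combination of normalised scores; alpha weights the lexical system."""
+    lex_n, den_n = minmax(lexical), minmax(dense)
+    docs = set(lexical) | set(dense)
+    return {d: alpha * lex_n.get(d, 0.0) + (1 - alpha) * den_n.get(d, 0.0) for d in docs}
+
+def reciprocal_rank_fusion(rankings: list, k: int = 60) -> dict:
+    """RRF: each document scores 1 / (k + rank), summed over the input rankings."""
+    fused = defaultdict(float)
+    for ranking in rankings:
+        for rank, doc in enumerate(ranking, start=1):
+            fused[doc] += 1.0 / (k + rank)
+    return dict(fused)
+
+bm25 = {"doc1": 12.3, "doc2": 9.1, "doc3": 4.0}
+dense = {"doc2": 0.82, "doc3": 0.80, "doc4": 0.55}
+print(sorted(weighted_fusion(bm25, dense).items(), key=lambda x: -x[1]))
+print(sorted(reciprocal_rank_fusion([["doc1", "doc2", "doc3"], ["doc2", "doc4", "doc3"]]).items(), key=lambda x: -x[1]))
+
+ In the zero-shot setting described in the abstract, even an untuned alpha tends to help; the in-domain finding above suggests alpha needs careful tuning before fusion pays off.
+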
+ + ☆ SSD4Rec: A Structured State Space Duality Model for Efficient Sequential + Recommendation + + +
+ Sequential recommendation methods are crucial in modern recommender systems +for their remarkable capability to understand a user's changing interests based +on past interactions. However, a significant challenge faced by current methods +(e.g., RNN- or Transformer-based models) is to effectively and efficiently +capture users' preferences by modeling long behavior sequences, which impedes +their various applications like short video platforms where user interactions +are numerous. Recently, an emerging architecture named Mamba, built on state +space models (SSM) with efficient hardware-aware designs, has showcased the +tremendous potential for sequence modeling, presenting a compelling avenue for +addressing the challenge effectively. Inspired by this, we propose a novel +generic and efficient sequential recommendation backbone, SSD4Rec, which +explores the seamless adaptation of Mamba for sequential recommendations. +Specifically, SSD4Rec marks the variable- and long-length item sequences with +sequence registers and processes the item representations with bidirectional +Structured State Space Duality (SSD) blocks. This not only allows for +hardware-aware matrix multiplication but also empowers outstanding capabilities +in variable-length and long-range sequence modeling. Extensive evaluations on +four benchmark datasets demonstrate that the proposed model achieves +state-of-the-art performance while maintaining near-linear scalability with +user sequence length. Our code is publicly available at +https://github.com/ZhangYifeng1995/SSD4Rec. + +
+
+
+
+
+ + ☆ Real World Conversational Entity Linking Requires More Than Zeroshots + + +
+ Entity linking (EL) in conversations faces notable challenges in practical +applications, primarily due to the scarcity of entity-annotated conversational +datasets and sparse knowledge bases (KB) containing domain-specific, long-tail +entities. We designed targeted evaluation scenarios to measure the efficacy of +EL models under resource constraints. Our evaluation employs two KBs: Fandom, +exemplifying real-world EL complexities, and the widely used Wikipedia. First, +we assess EL models' ability to generalize to a new unfamiliar KB using Fandom +and a novel zero-shot conversational entity linking dataset that we curated +based on Reddit discussions on Fandom entities. We then evaluate the +adaptability of EL models to conversational settings without prior training. +Our results indicate that current zero-shot EL models falter when introduced to +new, domain-specific KBs without prior training, significantly dropping in +performance. Our findings reveal that previous evaluation approaches fall short +of capturing real-world complexities for zero-shot EL, highlighting the +necessity for new approaches to design and assess conversational EL models to +adapt to limited resources. The evaluation setup and the dataset proposed in +this research are made publicly available. + +
+
+
+
+
+ + ☆ LLM-PQA: LLM-enhanced Prediction Query Answering CIKM 2024 + + +
+ The advent of Large Language Models (LLMs) provides an opportunity to change the way queries are processed, moving beyond the constraints of conventional SQL-based database systems. However, using an LLM to answer a prediction query is still challenging, since an external ML model has to be employed and inference has to be performed in order to provide an answer. This paper introduces LLM-PQA, a novel tool that addresses prediction queries formulated in natural language. LLM-PQA is the first to combine the capabilities of LLMs and a retrieval-augmented mechanism for the needs of prediction queries by integrating data lakes and model zoos. This integration provides users with access to a vast spectrum of heterogeneous data and diverse ML models, facilitating dynamic prediction query answering. In addition, LLM-PQA can dynamically train models on demand, based on specific query requirements, ensuring reliable and relevant results even when no pre-trained model in the model zoo is available for the task. + +
+
+ comment: This paper is accepted as a demo at CIKM 2024 +
+
+
+
+
+ + ☆ Evidential Transformers for Improved Image Retrieval ECCV 2024 + + +
+ We introduce the Evidential Transformer, an uncertainty-driven transformer +model for improved and robust image retrieval. In this paper, we make several +contributions to content-based image retrieval (CBIR). We incorporate +probabilistic methods into image retrieval, achieving robust and reliable +results, with evidential classification surpassing traditional training based +on multiclass classification as a baseline for deep metric learning. +Furthermore, we improve the state-of-the-art retrieval results on several +datasets by leveraging the Global Context Vision Transformer (GC ViT) +architecture. Our experimental results consistently demonstrate the reliability +of our approach, setting a new benchmark in CBIR in all test settings on the +Stanford Online Products (SOP) and CUB-200-2011 datasets. + +
+
+ comment: 6 pages, 6 figures, To be presented at the 3rd Workshop on + Uncertainty Quantification for Computer Vision, at the ECCV 2024 conference + in Milan, Italy +
+
+
+
+
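+ The evidential classification mentioned above is commonly implemented in the style of Sensoy et al. (2018): the network outputs non-negative evidence per class, which parameterises a Dirichlet distribution whose total strength yields an uncertainty estimate. The PyTorch sketch below shows that generic recipe (head and loss only, without the KL regulariser usually added in practice); it is an assumption about the standard formulation, not the authors' exact model.
+
+import torch
+import torch.nn.functional as F
+
+def dirichlet_from_logits(logits: torch.Tensor):
+    """Map raw logits to Dirichlet parameters, expected probabilities, and uncertainty."""
+    evidence = F.softplus(logits)                 # non-negative evidence per class
+    alpha = evidence + 1.0                        # Dirichlet concentration parameters
+    strength = alpha.sum(dim=-1, keepdim=True)
+    probs = alpha / strength                      # expected class probabilities
+    uncertainty = logits.shape[-1] / strength.squeeze(-1)  # K / S, in (0, 1]
+    return alpha, probs, uncertainty
+
+def evidential_mse_loss(alpha: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
+    """Evidential MSE loss: squared error of expected probabilities plus a variance term."""
+    one_hot = F.one_hot(targets, num_classes=alpha.shape[-1]).float()
+    strength = alpha.sum(dim=-1, keepdim=True)
+    probs = alpha / strength
+    err = ((one_hot - probs) ** 2).sum(dim=-1)
+    var = (probs * (1 - probs) / (strength + 1)).sum(dim=-1)
+    return (err + var).mean()
+
+logits = torch.randn(8, 10)        # e.g. pooled backbone features passed through a linear head
+alpha, probs, u = dirichlet_from_logits(logits)
+loss = evidential_mse_loss(alpha, torch.randint(0, 10, (8,)))
+print(loss.item(), u.mean().item())
+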
+ + ☆ Improved Diversity-Promoting Collaborative Metric Learning for + Recommendation + + +
+ Collaborative Metric Learning (CML) has recently emerged as a popular method in recommendation systems (RS), closing the gap between metric learning and collaborative filtering. Following the convention of RS, existing practices exploit a unique user representation in their model design. This paper focuses on a challenging scenario where a user has multiple categories of interests. Under this setting, the unique user representation might induce preference bias, especially when the item category distribution is imbalanced. To address this issue, we propose a novel method called \textit{Diversity-Promoting Collaborative Metric Learning} (DPCML), with the hope of considering the commonly ignored minority interest of the user. The key idea behind DPCML is to introduce a set of multiple representations for each user in the system, where a user's preference toward an item is aggregated by taking the minimum item-user distance among their embedding set. Specifically, we instantiate two effective assignment strategies to explore a proper quantity of vectors for each user. Meanwhile, a \textit{Diversity Control Regularization Scheme} (DCRS) is developed to accommodate the multi-vector representation strategy better. Theoretically, we show that DPCML could induce a smaller generalization error than traditional CML. Furthermore, we notice that CML-based approaches usually require \textit{negative sampling} to reduce the heavy computational burden caused by the pairwise objective therein. In this paper, we reveal the fundamental limitation of the widely adopted hard-aware sampling from the One-Way Partial AUC (OPAUC) perspective and then develop an effective sampling alternative for the CML-based paradigm. Finally, comprehensive experiments over a range of benchmark datasets speak to the efficacy of DPCML. Code is available at \url{https://github.com/statusrank/LibCML}. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2209.15292 +
+
+
+
+
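+ A minimal PyTorch sketch of the multi-vector scoring rule described above: each user keeps C embedding vectors, and the preference score for an item is derived from the minimum user-vector-to-item distance. This is an illustrative reimplementation of the stated idea, not the released LibCML code.
+
+import torch
+
+def dpcml_style_scores(user_vecs: torch.Tensor, item_vecs: torch.Tensor) -> torch.Tensor:
+    """user_vecs: (B, C, d) interest vectors per user; item_vecs: (B, N, d) candidate items.
+
+    Returns (B, N) scores where higher is better (negative minimum Euclidean distance).
+    """
+    # Pairwise distances between every interest vector and every candidate item: (B, C, N)
+    dists = torch.cdist(user_vecs, item_vecs)
+    # A user's distance to an item is the distance from the closest interest vector.
+    min_dists, _ = dists.min(dim=1)
+    return -min_dists
+
+users = torch.randn(4, 3, 16)   # 4 users, 3 interest vectors each, 16-dim metric space
+items = torch.randn(4, 10, 16)  # 10 candidate items per user
+print(dpcml_style_scores(users, items).shape)  # torch.Size([4, 10])
+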
+ + ☆ Towards Investigating Biases in Spoken Conversational Search + + +
+ Voice-based systems like Amazon Alexa, Google Assistant, and Apple Siri, +along with the growing popularity of OpenAI's ChatGPT and Microsoft's Copilot, +serve diverse populations, including visually impaired and low-literacy +communities. This reflects a shift in user expectations from traditional search +to more interactive question-answering models. However, presenting information +effectively in voice-only channels remains challenging due to their linear +nature. This limitation can impact the presentation of complex queries +involving controversial topics with multiple perspectives. Failing to present +diverse viewpoints may perpetuate or introduce biases and affect user +attitudes. Balancing information load and addressing biases is crucial in +designing a fair and effective voice-based system. To address this, we (i) +review how biases and user attitude changes have been studied in screen-based +web search, (ii) address challenges in studying these changes in voice-based +settings like SCS, (iii) outline research questions, and (iv) propose an +experimental setup with variables, data, and instruments to explore biases in a +voice-based setting like Spoken Conversational Search. + +
+
+ comment: Accepted Late-Breaking Results at ACM ICMI Companion 2024 +
+
+
+
+
+ + ♻ ☆ Manipulating Large Language Models to Increase Product Visibility + + +
+ Large language models (LLMs) are increasingly being integrated into search +engines to provide natural language responses tailored to user queries. +Customers and end-users are also becoming more dependent on these models for +quick and easy purchase decisions. In this work, we investigate whether +recommendations from LLMs can be manipulated to enhance a product's visibility. +We demonstrate that adding a strategic text sequence (STS) -- a carefully +crafted message -- to a product's information page can significantly increase +its likelihood of being listed as the LLM's top recommendation. To understand +the impact of STS, we use a catalog of fictitious coffee machines and analyze +its effect on two target products: one that seldom appears in the LLM's +recommendations and another that usually ranks second. We observe that the +strategic text sequence significantly enhances the visibility of both products +by increasing their chances of appearing as the top recommendation. This +ability to manipulate LLM-generated search responses provides vendors with a +considerable competitive advantage and has the potential to disrupt fair market +competition. Just as search engine optimization (SEO) revolutionized how +webpages are customized to rank higher in search engine results, influencing +LLM recommendations could profoundly impact content optimization for AI-driven +search services. Code for our experiments is available at +https://github.com/aounon/llm-rank-optimizer. + +
+
+
+
+
+ + ♻ ☆ A multi-language toolkit for supporting automated checking of research + outputs + + +
+ This article presents the automatic checking of research outputs package +acro, which assists researchers and data governance teams by automatically +applying best-practice principles-based statistical disclosure control (SDC) +techniques on-the-fly as researchers conduct their analyses. acro distinguishes +between: research output that is safe to publish; output that requires further +analysis; and output that cannot be published because it creates substantial +risk of disclosing private data. This is achieved through the use of a +lightweight Python wrapper that sits over well-known analysis tools that +produce outputs such as tables, plots, and statistical models. This adds +functionality to (i) identify potentially disclosive outputs against a range of +commonly used disclosure tests; (ii) apply disclosure mitigation strategies +where required; (iii) report reasons for applying SDC; and (iv) produce simple +summary documents trusted research environment staff can use to streamline +their workflow. The major analytical programming languages used by researchers +are supported: Python, R, and Stata. The acro code and documentation are +available under an MIT license at https://github.com/AI-SDC/ACRO + +
+
+
+
+
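+ To make the principles-based checking concrete, here is a small, hypothetical example of the kind of rule such a wrapper applies; it is not the acro API. A frequency table is flagged for review if any cell falls below a minimum-count threshold, one of the standard statistical disclosure control tests.
+
+import pandas as pd
+
+def check_crosstab(df: pd.DataFrame, row: str, col: str, min_cell_count: int = 10):
+    """Build a frequency table and flag cells that could disclose small groups."""
+    table = pd.crosstab(df[row], df[col])
+    unsafe = table < min_cell_count
+    verdict = "review required" if unsafe.to_numpy().any() else "safe to publish"
+    return table, unsafe, verdict
+
+data = pd.DataFrame({
+    "region": ["north", "north", "south", "south", "south", "east"],
+    "outcome": ["yes", "no", "yes", "yes", "no", "no"],
+})
+table, unsafe, verdict = check_crosstab(data, "region", "outcome", min_cell_count=3)
+print(table, unsafe, verdict, sep="\n")
+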
+ + ♻ ☆ VM-Rec: A Variational Mapping Approach for Cold-start User + Recommendation + + +
+ The cold-start problem is a common challenge for most recommender systems. +The practical application of most cold-start methods is hindered by the +deficiency in auxiliary content information for users. Moreover, most methods +necessitate simultaneous updates to the extensive parameters of recommender +models, leading to significant training costs, particularly in large-scale +industrial scenarios. We observe that the model can generate expressive +embeddings for warm users with relatively more interactions. Initially, these +users were cold-start users, and after transitioning to warm users, they +exhibit clustering patterns in their embeddings with consistent initial +interactions. Based on this motivation, we propose a Variational Mapping +approach for cold-start user Recommendation (VM-Rec), mapping from few initial +interactions to expressive embeddings for cold-start users. Specifically, we +encode the initial interactions into a latent representation, where each +dimension disentangledly signifies the degree of association with each warm +user. Subsequently, we utilize this latent representation as the parameters for +the mapping function, mapping (decoding) it into an expressive embedding, which +can be integrated into a pre-trained recommender model directly. Our method is +evaluated on three datasets using the same base model, demonstrating superior +performance compared to other popular cold-start methods. + +
+
+
+
+
+ + ♻ ☆ A Hybrid RAG System with Comprehensive Enhancement on Complex Reasoning KDD + + +
+ Retrieval-augmented generation (RAG) is a framework enabling large language models (LLMs) to enhance their accuracy and reduce hallucinations by integrating external knowledge bases. In this paper, we introduce a hybrid RAG system enhanced through a comprehensive suite of optimizations that significantly improve retrieval quality, augment reasoning capabilities, and refine numerical computation ability. We refined the text chunks and tables in web pages, added attribute predictors to reduce hallucinations, built an LLM Knowledge Extractor and a Knowledge Graph Extractor, and finally constructed a reasoning strategy over all the references. We evaluated our system on the CRAG dataset through the Meta CRAG KDD Cup 2024 Competition. Both the local and online evaluations demonstrate that our system significantly enhances complex reasoning capabilities. In local evaluations, we significantly improved accuracy and reduced error rates compared to the baseline model, achieving a notable increase in scores. Meanwhile, we attained outstanding results in the online assessments, demonstrating the performance and generalization capabilities of the proposed system. The source code for our system is released at \url{https://gitlab.aicrowd.com/shizueyy/crag-new}. + +
+
+ comment: Technical report for 3rd prize in Task 1 of Meta CRAG KDD Cup 2024 +
+
+
+
+
+ + ♻ ☆ PEPT: Expert Finding Meets Personalized Pre-training + + +
+ Finding experts is essential in Community Question Answering (CQA) platforms as it enables the effective routing of questions to potential users who can provide relevant answers. The key is to learn personalized expert representations based on their historically answered questions and to accurately match them with target questions. There have been some preliminary works exploring the usability of PLMs in expert finding, such as pre-training expert or question representations. However, these models usually learn pure text representations of experts from histories, disregarding personalized and fine-grained expert modeling. To alleviate this, we present a personalized pre-training and fine-tuning paradigm, which can effectively learn expert interest and expertise simultaneously. Specifically, in our pre-training framework, we integrate the historically answered questions of one expert with one target question and regard it as a candidate-aware expert-level input unit. Then, we fuse expert IDs into pre-training to guide the model toward personalized expert representations, which helps capture the unique characteristics and expertise of each individual expert. Additionally, in our pre-training task, we design: 1) a question-level masked language model task to learn the relatedness between histories, enabling the modeling of question-level expert interest; 2) a vote-oriented task to capture question-level expert expertise by predicting the vote score the expert would receive. Through our pre-training framework and tasks, our approach can holistically learn expert representations including interests and expertise. Our method has been extensively evaluated on six real-world CQA datasets, and the experimental results consistently demonstrate the superiority of our approach over competitive baseline methods. + +
+
+
+
+
+
+
+
+ + Machine Learning 52 + +
+
+
+ + ♻ ☆ Advanced Predictive Modeling for Enhanced Mortality Prediction in ICU + Stroke Patients Using Clinical Data + + +
+ Background: Stroke is the second-leading cause of disability and death among adults. Approximately 17 million people suffer from a stroke annually, with about 85% being ischemic strokes. Predicting the mortality of ischemic stroke patients in the intensive care unit (ICU) is crucial for optimizing treatment strategies, allocating resources, and improving survival rates. Methods: We acquired data on ICU ischemic stroke patients from the MIMIC-IV database, including diagnoses, vital signs, laboratory tests, medications, procedures, treatments, and clinical notes. Stroke patients were randomly divided into training (70%, n=2441), test (15%, n=523), and validation (15%, n=523) sets. To address data imbalances, we applied the Synthetic Minority Over-sampling Technique (SMOTE). We selected 30 features for model development, significantly reducing the number of features from the 1,095 used in the best previous study. We developed a deep learning model to assess mortality risk and implemented several baseline machine learning models for comparison. Results: The XGB-DL model, which combines XGBoost for feature selection with deep learning, effectively minimized false positives. The model's AUROC improved from 0.865 (95% CI: 0.821 - 0.905) on the first day to 0.903 (95% CI: 0.868 - 0.936) by the fourth day, using data from 3,646 ICU patients in the MIMIC-IV database, with an AUROC of 0.945 (95% CI: 0.944 - 0.947) during training. Although other ML models also performed well in terms of AUROC, we chose deep learning for its higher specificity. Conclusions: Through enhanced feature selection and data cleaning, the proposed model demonstrates a 13% AUROC improvement compared to existing models while reducing the number of features from the 1,095 used in previous studies to 30. + +
+
+
+
+
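+ A schematic version of the workflow described above (SMOTE for class imbalance, a boosted-tree model for feature selection, then a neural classifier), run on synthetic data. The real study uses MIMIC-IV features and an XGBoost + deep learning combination; here scikit-learn's GradientBoostingClassifier stands in for XGBoost, and imbalanced-learn provides SMOTE.
+
+import numpy as np
+from imblearn.over_sampling import SMOTE                  # pip install imbalanced-learn
+from sklearn.datasets import make_classification
+from sklearn.ensemble import GradientBoostingClassifier   # stand-in for XGBoost in this sketch
+from sklearn.model_selection import train_test_split
+from sklearn.neural_network import MLPClassifier
+from sklearn.metrics import roc_auc_score
+
+# Synthetic stand-in for imbalanced tabular ICU features and a mortality label.
+X, y = make_classification(n_samples=3000, n_features=100, n_informative=20,
+                           weights=[0.9, 0.1], random_state=0)
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
+
+# 1) Oversample the minority class in the training split only.
+X_bal, y_bal = SMOTE(random_state=0).fit_resample(X_tr, y_tr)
+
+# 2) Rank features with a boosted-tree model and keep the top 30.
+booster = GradientBoostingClassifier(random_state=0).fit(X_bal, y_bal)
+top30 = np.argsort(booster.feature_importances_)[-30:]
+
+# 3) Train a small neural network on the selected features and evaluate on held-out data.
+clf = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=0)
+clf.fit(X_bal[:, top30], y_bal)
+print("AUROC:", roc_auc_score(y_te, clf.predict_proba(X_te[:, top30])[:, 1]))
+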
+ + ♻ ☆ AlphaFold Meets Flow Matching for Generating Protein Ensembles ICML 2024 + + +
+ The biological functions of proteins often depend on dynamic structural ensembles. In this work, we develop a flow-based generative modeling approach for learning and sampling the conformational landscapes of proteins. We repurpose highly accurate single-state predictors such as AlphaFold and ESMFold and fine-tune them under a custom flow matching framework to obtain sequence-conditioned generative models of protein structure called AlphaFlow and ESMFlow. When trained and evaluated on the PDB, our method provides a superior combination of precision and diversity compared to AlphaFold with MSA subsampling. When further trained on ensembles from all-atom MD, our method accurately captures conformational flexibility, positional distributions, and higher-order ensemble observables for unseen proteins. Moreover, our method can diversify a static PDB structure with faster wall-clock convergence to certain equilibrium properties than replicate MD trajectories, demonstrating its potential as a proxy for expensive physics-based simulations. Code is available at https://github.com/bjing2016/alphaflow. + +
+
+ comment: ICML 2024 +
+
+
+
+
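+ The flow matching objective underlying the method above has a simple generic form: sample an interpolation time t, form x_t on the straight path between a noise sample x0 and a data sample x1, and regress the network's predicted velocity onto x1 - x0. The sketch below shows that objective on toy 2-D data with a small MLP; the paper applies it to protein structures with AlphaFold/ESMFold backbones, which this illustration does not attempt.
+
+import torch
+import torch.nn as nn
+
+class VelocityNet(nn.Module):
+    """Tiny stand-in for the structure model: predicts a velocity given (x_t, t)."""
+    def __init__(self, dim: int = 2):
+        super().__init__()
+        self.net = nn.Sequential(nn.Linear(dim + 1, 128), nn.SiLU(),
+                                 nn.Linear(128, 128), nn.SiLU(),
+                                 nn.Linear(128, dim))
+    def forward(self, x, t):
+        return self.net(torch.cat([x, t], dim=-1))
+
+model = VelocityNet()
+opt = torch.optim.Adam(model.parameters(), lr=1e-3)
+
+for step in range(200):
+    x1 = torch.randn(256, 2) * 0.3 + torch.tensor([2.0, -1.0])  # toy "data" samples
+    x0 = torch.randn(256, 2)                                     # noise samples
+    t = torch.rand(256, 1)
+    x_t = (1 - t) * x0 + t * x1                                  # linear interpolant
+    target_v = x1 - x0                                           # conditional velocity target
+    loss = ((model(x_t, t) - target_v) ** 2).mean()
+    opt.zero_grad(); loss.backward(); opt.step()
+
+# Sampling: integrate dx/dt = v(x, t) from t=0 to 1 with Euler steps.
+x = torch.randn(5, 2)
+with torch.no_grad():
+    for i in range(50):
+        t = torch.full((5, 1), i / 50)
+        x = x + model(x, t) / 50
+print(x)
+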
+ + ♻ ☆ Automatic Differentiation is Essential in Training Neural Networks for + Solving Differential Equations + + +
+ Neural network-based approaches have recently shown significant promise in solving partial differential equations (PDEs) in science and engineering, especially in scenarios featuring complex domains or the incorporation of empirical data. One advantage of neural network methods for PDEs lies in their use of automatic differentiation (AD), which requires only the sample points themselves, unlike traditional finite difference (FD) approximations, which require nearby local points to compute derivatives. In this paper, we quantitatively demonstrate the advantage of AD in training neural networks. The concept of truncated entropy is introduced to characterize the training property. Specifically, through comprehensive experimental and theoretical analyses conducted on random feature models and two-layer neural networks, we discover that the defined truncated entropy serves as a reliable metric for quantifying the residual loss of random feature models and the training speed of neural networks for both AD and FD methods. Our experimental and theoretical analyses demonstrate that, from a training perspective, AD outperforms FD in solving PDEs. + +
+
+
+
+
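+ The AD-versus-FD distinction drawn above is easy to see in code: automatic differentiation returns the derivative of the network at the sample points themselves, while a finite difference needs nearby evaluation points and carries a step-size error. A small illustrative PyTorch sketch:
+
+import torch
+
+net = torch.nn.Sequential(torch.nn.Linear(1, 64), torch.nn.Tanh(), torch.nn.Linear(64, 1))
+x = torch.linspace(-1.0, 1.0, 11).reshape(-1, 1).requires_grad_(True)
+
+# Automatic differentiation: exact derivative of the network at the sample points.
+u = net(x)
+du_dx_ad = torch.autograd.grad(u, x, grad_outputs=torch.ones_like(u), create_graph=True)[0]
+
+# Central finite difference: needs nearby points and carries an O(h^2) truncation error.
+h = 1e-3
+with torch.no_grad():
+    du_dx_fd = (net(x + h) - net(x - h)) / (2 * h)
+
+print((du_dx_ad.detach() - du_dx_fd).abs().max())
+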
+ + ♻ ☆ On the limits of neural network explainability via descrambling + + +
+ We characterize the exact solutions to neural network descrambling--a +mathematical model for explaining the fully connected layers of trained neural +networks (NNs). By reformulating the problem to the minimization of the +Brockett function arising in graph matching and complexity theory we show that +the principal components of the hidden layer preactivations can be +characterized as the optimal explainers or descramblers for the layer weights, +leading to descrambled weight matrices. We show that in typical deep learning +contexts these descramblers take diverse and interesting forms including (1) +matching largest principal components with the lowest frequency modes of the +Fourier basis for isotropic hidden data, (2) discovering the semantic +development in two-layer linear NNs for signal recovery problems, and (3) +explaining CNNs by optimally permuting the neurons. Our numerical experiments +indicate that the eigendecompositions of the hidden layer data--now understood +as the descramblers--can also reveal the layer's underlying transformation. +These results illustrate that the SVD is more directly related to the +explainability of NNs than previously thought and offers a promising avenue for +discovering interpretable motifs for the hidden action of NNs, especially in +contexts of operator learning or physics-informed NNs, where the input/output +data has limited human readability. + +
+
+
+
+
+ + ♻ MAP: Low-compute Model Merging with Amortized Pareto Fronts via + Quadratic Approximation + + +
+ Model merging has emerged as an effective approach to combine multiple +single-task models, fine-tuned from the same pre-trained model, into a +multitask model. This process typically involves computing a weighted average +of the model parameters without any additional training. Existing model-merging +methods focus on enhancing average task accuracy. However, interference and +conflicts between the objectives of different tasks can lead to trade-offs +during model merging. In real-world applications, a set of solutions with +various trade-offs can be more informative, helping practitioners make +decisions based on diverse preferences. In this paper, we introduce a novel +low-compute algorithm, Model Merging with Amortized Pareto Front (MAP). MAP +identifies a Pareto set of scaling coefficients for merging multiple models to +reflect the trade-offs. The core component of MAP is approximating the +evaluation metrics of the various tasks using a quadratic approximation +surrogate model derived from a pre-selected set of scaling coefficients, +enabling amortized inference. Experimental results on vision and natural +language processing tasks show that MAP can accurately identify the Pareto +front. To further reduce the required computation of MAP, we propose (1) a +Bayesian adaptive sampling algorithm and (2) a nested merging scheme with +multiple stages. + +
+
+
+
+
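+ To make the amortisation idea above concrete: evaluate the merged model at a handful of scaling coefficients, fit a quadratic surrogate per task metric, and read the Pareto front off the cheap surrogate instead of running further evaluations. The numpy sketch below uses synthetic "metrics"; the actual method operates on real task accuracies of merged networks.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def quad_features(c):
+    """Quadratic design matrix in two scaling coefficients: [1, c1, c2, c1^2, c2^2, c1*c2]."""
+    c1, c2 = c[:, 0], c[:, 1]
+    return np.stack([np.ones_like(c1), c1, c2, c1**2, c2**2, c1 * c2], axis=1)
+
+# Pre-selected scaling coefficients and noisy synthetic per-task metrics at those points.
+coeffs = rng.uniform(0, 1, size=(30, 2))
+task_a = 1 - (coeffs[:, 0] - 0.8) ** 2 + 0.01 * rng.normal(size=30)
+task_b = 1 - (coeffs[:, 1] - 0.7) ** 2 + 0.01 * rng.normal(size=30)
+
+# Fit one quadratic surrogate per task metric by least squares.
+w_a, *_ = np.linalg.lstsq(quad_features(coeffs), task_a, rcond=None)
+w_b, *_ = np.linalg.lstsq(quad_features(coeffs), task_b, rcond=None)
+
+# Amortised inference: score a dense grid of coefficients with the surrogates only.
+g = np.stack(np.meshgrid(np.linspace(0, 1, 51), np.linspace(0, 1, 51)), axis=-1).reshape(-1, 2)
+preds = np.stack([quad_features(g) @ w_a, quad_features(g) @ w_b], axis=1)
+
+# Keep the non-dominated points: the approximate Pareto front of task trade-offs.
+pareto = [i for i, p in enumerate(preds)
+          if not np.any(np.all(preds >= p, axis=1) & np.any(preds > p, axis=1))]
+print(len(pareto), "non-dominated scaling coefficients, e.g.", g[pareto[:3]])
+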
+ + ♻ ☆ RISSOLE: Parameter-efficient Diffusion Models via Block-wise Generation + and Retrieval-Guidance + + +
+ Diffusion-based models demonstrate impressive generation capabilities. +However, they also have a massive number of parameters, resulting in enormous +model sizes, thus making them unsuitable for deployment on resource-constraint +devices. Block-wise generation can be a promising alternative for designing +compact-sized (parameter-efficient) deep generative models since the model can +generate one block at a time instead of generating the whole image at once. +However, block-wise generation is also considerably challenging because +ensuring coherence across generated blocks can be non-trivial. To this end, we +design a retrieval-augmented generation (RAG) approach and leverage the +corresponding blocks of the images retrieved by the RAG module to condition the +training and generation stages of a block-wise denoising diffusion model. Our +conditioning schemes ensure coherence across the different blocks during +training and, consequently, during generation. While we showcase our approach +using the latent diffusion model (LDM) as the base model, it can be used with +other variants of denoising diffusion models. We validate the solution of the +coherence problem through the proposed approach by reporting substantive +experiments to demonstrate our approach's effectiveness in compact model size +and excellent generation quality. + +
+
+
+
+
+ + ♻ ☆ Uplift Modeling Under Limited Supervision + + +
+ Estimating causal effects in e-commerce tends to involve costly treatment +assignments which can be impractical in large-scale settings. Leveraging +machine learning to predict such treatment effects without actual intervention +is a standard practice to diminish the risk. However, existing methods for +treatment effect prediction tend to rely on training sets of substantial size, +which are built from real experiments and are thus inherently risky to create. +In this work we propose a graph neural network to diminish the required +training set size, relying on graphs that are common in e-commerce data. +Specifically, we view the problem as node regression with a restricted number +of labeled instances, develop a two-model neural architecture akin to previous +causal effect estimators, and test varying message-passing layers for encoding. +Furthermore, as an extra step, we combine the model with an acquisition +function to guide the creation of the training set in settings with extremely +low experimental budget. The framework is flexible since each step can be used +separately with other models or treatment policies. The experiments on real +large-scale networks indicate a clear advantage of our methodology over the +state of the art, which in many cases performs close to random, underlining the +need for models that can generalize with limited supervision to reduce +experimental risks. + +
+
+
+
+
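+ The two-model architecture mentioned above follows the classic T-learner pattern: fit one outcome model on treated units and one on control units, and take the difference of their predictions as the estimated uplift. A compact scikit-learn sketch on synthetic data; the paper's version uses graph neural network encoders and an acquisition function, which are not reproduced here.
+
+import numpy as np
+from sklearn.ensemble import GradientBoostingRegressor
+
+rng = np.random.default_rng(0)
+n = 5000
+X = rng.normal(size=(n, 5))
+treated = rng.integers(0, 2, size=n).astype(bool)
+# Synthetic outcome with a heterogeneous treatment effect that depends on feature 0.
+true_effect = 0.5 + 0.5 * X[:, 0]
+y = X[:, 1] + treated * true_effect + 0.1 * rng.normal(size=n)
+
+# Two-model (T-learner) estimator: separate outcome models for treated and control units.
+mu1 = GradientBoostingRegressor().fit(X[treated], y[treated])
+mu0 = GradientBoostingRegressor().fit(X[~treated], y[~treated])
+uplift = mu1.predict(X) - mu0.predict(X)
+
+print("mean absolute error vs. true effect:", np.abs(uplift - true_effect).mean())
+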
+ + ♻ ☆ Globally Stable Neural Imitation Policies + + +
+ Imitation learning presents an effective approach to alleviate the +resource-intensive and time-consuming nature of policy learning from scratch in +the solution space. Even though the resulting policy can mimic expert +demonstrations reliably, it often lacks predictability in unexplored regions of +the state-space, giving rise to significant safety concerns in the face of +perturbations. To address these challenges, we introduce the Stable Neural +Dynamical System (SNDS), an imitation learning regime which produces a policy +with formal stability guarantees. We deploy a neural policy architecture that +facilitates the representation of stability based on Lyapunov theorem, and +jointly train the policy and its corresponding Lyapunov candidate to ensure +global stability. We validate our approach by conducting extensive experiments +in simulation and successfully deploying the trained policies on a real-world +manipulator arm. The experimental results demonstrate that our method overcomes +the instability, accuracy, and computational intensity problems associated with +previous imitation learning methods, making our method a promising solution for +stable policy learning in complex planning scenarios. + +
+
+
+
+
+ + ♻ AI-Assisted Generation of Difficult Math Questions + + +
+ Current LLM training positions mathematical reasoning as a core capability. With publicly available sources fully tapped, there is unmet demand for diverse and challenging math questions. Relying solely on human experts is both time-consuming and costly, while LLM-generated questions often lack the requisite diversity and difficulty. We present a design framework that combines the strengths of LLMs with a human-in-the-loop approach to generate a diverse array of challenging math questions. We leverage the metacognition skills [Didolkar et al., 2024] of a strong LLM to extract core "skills" from existing math datasets. These skills serve as the basis for generating novel and difficult questions by prompting the LLM with random pairs of core skills. The use of two different skills within each question makes finding such questions an "out of distribution" task for both LLMs and humans. Our pipeline employs LLMs to iteratively generate and refine questions and solutions through multiturn prompting. Human annotators then verify and further refine the questions, with their efficiency enhanced via further LLM interactions. Applying this pipeline on skills extracted from the MATH dataset [Hendrycks et al., 2021] resulted in MATH$^2$ - a dataset of higher-quality math questions, as evidenced by: (a) lower performance of all models on MATH$^2$ than on MATH; (b) higher performance on MATH when using MATH$^2$ questions as in-context examples. Although focused on mathematics, our methodology seems applicable to other domains requiring structured reasoning, and potentially as a component of scalable oversight. Also of interest is a striking relationship observed between models' performance on the new and original datasets: the success rate on MATH$^2$ is the square of the success rate on MATH, suggesting that successfully solving a question in MATH$^2$ requires a nontrivial combination of two distinct math skills. + +
+
+
+
+
+ + ♻ ☆ Online Detection of Anomalies in Temporal Knowledge Graphs with + Interpretability SIGMOD 2025 + + +
+ Temporal knowledge graphs (TKGs) are valuable resources for capturing evolving relationships among entities, yet they are often plagued by noise, necessitating robust anomaly detection mechanisms. Existing dynamic graph anomaly detection approaches struggle to capture the rich semantics introduced by node and edge categories within TKGs, while TKG embedding methods lack interpretability, undermining the credibility of anomaly detection. Moreover, these methods falter in adapting to pattern changes and semantic drifts resulting from knowledge updates. To tackle these challenges, we introduce AnoT, an efficient TKG summarization method tailored for interpretable online anomaly detection in TKGs. AnoT begins by summarizing a TKG into a novel rule graph, enabling flexible inference of complex patterns in TKGs. When new knowledge emerges, AnoT maps it onto a node in the rule graph and traverses the rule graph recursively to derive the anomaly score of the knowledge. The traversal yields reachable nodes that furnish interpretable evidence for the validity or anomalousness of the new knowledge. Overall, AnoT embodies a detector-updater-monitor architecture, encompassing a detector for offline TKG summarization and online scoring, an updater for real-time rule graph updates based on emerging knowledge, and a monitor for estimating the approximation error of the rule graph. Experimental results on four real-world datasets demonstrate that AnoT surpasses existing methods significantly in terms of accuracy and interpretability. All of the raw datasets and the implementation of AnoT are provided at https://github.com/zjs123/ANoT. + +
+
+ comment: 26 pages, 10 figures. Accepted by SIGMOD 2025 +
+
+
+
+
+ + ♻ ☆ NeuFair: Neural Network Fairness Repair with Dropout ISSTA 2024 + + +
+ This paper investigates neuron dropout as a post-processing bias mitigation +for deep neural networks (DNNs). Neural-driven software solutions are +increasingly applied in socially critical domains with significant fairness +implications. While neural networks are exceptionally good at finding +statistical patterns from data, they may encode and amplify existing biases +from the historical data. Existing bias mitigation algorithms often require +modifying the input dataset or the learning algorithms. We posit that the +prevalent dropout methods that prevent over-fitting during training by randomly +dropping neurons may be an effective and less intrusive approach to improve the +fairness of pre-trained DNNs. However, finding the ideal set of neurons to drop +is a combinatorial problem. We propose NeuFair, a family of post-processing +randomized algorithms that mitigate unfairness in pre-trained DNNs via dropouts +during inference after training. Our randomized search is guided by an +objective to minimize discrimination while maintaining the model's utility. We +show that our design of randomized algorithms is effective and efficient in +improving fairness (up to 69%) with minimal or no model performance +degradation. We provide intuitive explanations of these phenomena and carefully +examine the influence of various hyperparameters of search algorithms on the +results. Finally, we empirically and conceptually compare NeuFair to different +state-of-the-art bias mitigators. + +
+
+ comment: Paper accepted at ACM ISSTA 2024 +
+
+
+
+
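+ A highly simplified sketch of the post-processing idea described above: randomly sample sets of hidden neurons to drop at inference time and keep the mask that best reduces a group-fairness gap without losing too much accuracy. The fairness metric (a demographic-parity gap), the untrained toy model, and the acceptance rule are stand-in assumptions, not the NeuFair algorithm itself.
+
+import torch
+import torch.nn as nn
+
+torch.manual_seed(0)
+# Toy (untrained) MLP standing in for a pre-trained model; `group` is a protected attribute.
+model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 2))
+X = torch.randn(1000, 10)
+y = (X[:, 0] + 0.3 * X[:, 1] > 0).long()
+group = (X[:, 1] > 0).long()
+
+def evaluate(mask: torch.Tensor):
+    """Run the frozen model with the selected hidden units zeroed out at inference time."""
+    with torch.no_grad():
+        h = torch.relu(model[0](X)) * mask   # apply the dropout mask to the hidden layer
+        pred = model[2](h).argmax(dim=1)
+    acc = (pred == y).float().mean().item()
+    parity_gap = abs(pred[group == 0].float().mean() - pred[group == 1].float().mean()).item()
+    return acc, parity_gap
+
+baseline_acc, baseline_gap = evaluate(torch.ones(32))
+best_mask, best_gap = torch.ones(32), baseline_gap
+for _ in range(200):                              # randomized search over dropout masks
+    mask = (torch.rand(32) > 0.1).float()         # drop roughly 10% of hidden neurons
+    acc, gap = evaluate(mask)
+    if gap < best_gap and acc >= baseline_acc - 0.02:  # fairer, with bounded utility loss
+        best_mask, best_gap = mask, gap
+print("baseline gap:", baseline_gap, "best gap after search:", best_gap)
+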
+ + ♻ ☆ Privacy-Aware Document Visual Question Answering ICDAR 2024 + + +
+ Document Visual Question Answering (DocVQA) has quickly grown into a central task of document understanding. But despite the fact that documents contain sensitive or copyrighted information, none of the current DocVQA methods offers strong privacy guarantees. In this work, we explore privacy in the domain of DocVQA for the first time, highlighting privacy issues in state-of-the-art multi-modal LLMs used for DocVQA, and explore possible solutions. Specifically, we focus on invoice processing as a realistic document understanding scenario, and propose a large-scale DocVQA dataset comprising invoice documents and associated questions and answers. We employ a federated learning scheme that reflects the real-life distribution of documents in different businesses, and we explore the use case where the data of the invoice provider is the sensitive information to be protected. We demonstrate that non-private models tend to memorise, a behaviour that can lead to exposing private information. We then evaluate baseline training schemes employing federated learning and differential privacy in this multi-modal scenario, where the sensitive information might be exposed through either or both of the two input modalities: vision (document image) or language (OCR tokens). Finally, we design attacks exploiting the memorisation effect of the model, and demonstrate their effectiveness in probing representative DocVQA models. + +
+
+ comment: 35 pages, 12 figures, accepted for publication at the 18th + International Conference on Document Analysis and Recognition, ICDAR 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Bias and Prediction Metrics to Characterise the Fairness of + Machine Learning for Equity-Centered Public Health Decision-Making: A + Narrative Review + + +
+ Background: The rapid advancement of Machine Learning (ML) represents novel opportunities to enhance public health research, surveillance, and decision-making. However, there is a lack of comprehensive understanding of algorithmic bias, that is, systematic errors in predicted population health outcomes resulting from the public health application of ML. The objective of this narrative review is to explore the types of bias generated by ML and the quantitative metrics used to assess these biases. Methods: We performed a search of PubMed, MEDLINE, IEEE (Institute of Electrical and Electronics Engineers), the ACM (Association for Computing Machinery) Digital Library, Science Direct, and Springer Nature. We used keywords to identify studies describing types of bias and metrics to measure these in the domain of ML and public and population health, published in English between 2008 and 2023, inclusive. Results: A total of 72 articles met the inclusion criteria. Our review identified the commonly described types of bias and the quantitative metrics used to assess these biases from an equity perspective. Conclusion: This review will help formalize the evaluation framework for ML in public health from an equity perspective. + +
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Does Data-Efficient Generalization Exacerbate Bias in Foundation Models? ECCV 2024 + + +
+ Foundation models have emerged as robust models with label efficiency in +diverse domains. In medical imaging, these models contribute to the advancement +of medical diagnoses due to the difficulty in obtaining labeled data. However, +it is unclear whether using a large amount of unlabeled data, biased by the +presence of sensitive attributes during pre-training, influences the fairness +of the model. This research examines the bias in the Foundation model +(RetFound) when it is applied to fine-tune the Brazilian Multilabel +Ophthalmological Dataset (BRSET), which has a different population than the +pre-training dataset. The model evaluation, in comparison with supervised +learning, shows that the Foundation Model has the potential to reduce the gap +between the maximum AUC and minimum AUC evaluations across gender and age +groups. However, in a data-efficient generalization, the model increases the +bias when the data amount decreases. These findings suggest that when deploying +a Foundation Model in real-life scenarios with limited data, the possibility of +fairness issues should be considered. + +
+
+ comment: Preprint of paper to be presented at Fairness and Ethics Towards + Transparent AI: Facing the Challenge through Model Debiasing (FAILED) during + ECCV 2024 +
+
+
+
+
+ + ♻ ☆ Ancestral Reinforcement Learning: Unifying Zeroth-Order Optimization and + Genetic Algorithms for Reinforcement Learning + + +
+ Reinforcement Learning (RL) offers a fundamental framework for discovering optimal action strategies through interactions within unknown environments. Recent advances have shown that the performance and applicability of RL can be significantly enhanced by exploiting a population of agents in various ways. Zeroth-Order Optimization (ZOO) leverages an agent population to estimate the gradient of the objective function, enabling robust policy refinement even in non-differentiable scenarios. As another application, Genetic Algorithms (GA) boost the exploration of policy landscapes by mutational generation of policy diversity in an agent population and its refinement by selection. A natural question is whether we can get the best of both worlds from an agent population. In this work, we propose Ancestral Reinforcement Learning (ARL), which synergistically combines the robust gradient estimation of ZOO with the exploratory power of GA. The key idea in ARL is that each agent within a population infers a gradient by exploiting the history of its ancestors, i.e., the ancestor population in the past, while maintaining the diversity of policies in the current population as in GA. We also theoretically reveal that the populational search in ARL implicitly induces a KL-regularization of the objective function, resulting in enhanced exploration. Our results extend the applicability of populational algorithms for RL. + +
+
+ comment: 16pages, 3 figures +
+
+
+
+
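+ The ZOO ingredient referenced above is essentially Gaussian-smoothing gradient estimation from a population of perturbed parameter vectors, as in evolution strategies; ARL's contribution is to draw that population from ancestral history, which the toy sketch below does not attempt. Shown instead is the plain population-based estimator on a simple objective, as an assumption about the underlying mechanic:
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def objective(theta: np.ndarray) -> float:
+    """Toy stand-in for a (possibly non-differentiable) policy return."""
+    return -np.sum((theta - 1.0) ** 2)
+
+def zoo_gradient(theta: np.ndarray, pop_size: int = 64, sigma: float = 0.1) -> np.ndarray:
+    """Antithetic Gaussian-smoothing gradient estimate from a population of perturbations."""
+    eps = rng.normal(size=(pop_size, theta.size))
+    rewards = np.array([objective(theta + sigma * e) - objective(theta - sigma * e) for e in eps])
+    return (rewards[:, None] * eps).mean(axis=0) / (2 * sigma)
+
+theta = np.zeros(5)
+for step in range(200):              # plain gradient ascent using the ZOO estimate
+    theta += 0.05 * zoo_gradient(theta)
+print(theta)                         # approaches the optimum at all-ones
+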
+ + ♻ ☆ Analysis of Failures and Risks in Deep Learning Model Converters: A Case + Study in the ONNX Ecosystem ISSTA'24 + + +
+ Software engineers develop, fine-tune, and deploy deep learning (DL) models using a variety of development frameworks and runtime environments. DL model converters move models between frameworks and to runtime environments. Conversion errors compromise model quality and disrupt deployment. However, the failure characteristics of DL model converters are unknown, adding risk when using DL interoperability technologies. This paper analyzes failures in DL model converters. We survey software engineers about DL interoperability tools, use cases, and pain points (N=92). Then, we characterize failures in model converters associated with the main interoperability tool, ONNX (N=200 issues in PyTorch and TensorFlow). Finally, we formulate and test two hypotheses about structural causes for the failures we studied. We find that the node conversion stage of a model converter accounts for ~75% of the defects and that 33% of reported failures are related to semantically incorrect models. The cause of semantically incorrect models is elusive, but models with behaviour inconsistencies share operator sequences. Our results motivate future research on making DL interoperability software simpler to maintain, extend, and validate. Research into behavioural tolerances and architectural coverage metrics could be fruitful. + +
+
+ comment: [ISSTA'24] Proceedings of the 33rd ACM SIGSOFT International + Symposium on Software Testing and Analysis (ISSTA) 2024 +
+
+
+
+
+ + ♻ ☆ HyperInterval: Hypernetwork approach to training weight interval regions + in continual learning + + +
+ Recently, a new Continual Learning (CL) paradigm was presented to control catastrophic forgetting, called Interval Continual Learning (InterContiNet), which relies on enforcing interval constraints on the neural network parameter space. Unfortunately, InterContiNet training is challenging due to the high dimensionality of the weight space, making intervals difficult to manage. To address this issue, we introduce HyperInterval (source code available at https://github.com/gmum/HyperInterval), a technique that employs interval arithmetic within the embedding space and utilizes a hypernetwork to map these intervals to the target network parameter space. We train interval embeddings for consecutive tasks and train a hypernetwork to transform these embeddings into weights of the target network. An embedding for a given task is trained along with the hypernetwork, preserving the response of the target network for the previous task embeddings. Interval arithmetic works with a more manageable, lower-dimensional embedding space rather than directly preparing intervals in a high-dimensional weight space. Our model allows faster and more efficient training. Furthermore, HyperInterval maintains the guarantee of not forgetting. At the end of training, we can choose one universal embedding to produce a single network dedicated to all tasks. In such a framework, the hypernetwork is used only for training and, in the end, we can utilize one set of weights. HyperInterval obtains significantly better results than InterContiNet and achieves SOTA results on several benchmarks. + +
+
+
+
+
+ + ♻ ☆ ResQuNNs:Towards Enabling Deep Learning in Quantum Convolution Neural + Networks + + +
+ In this paper, we present a novel framework for enhancing the performance of +Quanvolutional Neural Networks (QuNNs) by introducing trainable quanvolutional +layers and addressing the critical challenges associated with them. Traditional +quanvolutional layers, although beneficial for feature extraction, have largely +been static, offering limited adaptability. Unlike state-of-the-art, our +research overcomes this limitation by enabling training within these layers, +significantly increasing the flexibility and potential of QuNNs. However, the +introduction of multiple trainable quanvolutional layers induces complexities +in gradient-based optimization, primarily due to the difficulty in accessing +gradients across these layers. To resolve this, we propose a novel +architecture, Residual Quanvolutional Neural Networks (ResQuNNs), leveraging +the concept of residual learning, which facilitates the flow of gradients by +adding skip connections between layers. By inserting residual blocks between +quanvolutional layers, we ensure enhanced gradient access throughout the +network, leading to improved training performance. Moreover, we provide +empirical evidence on the strategic placement of these residual blocks within +QuNNs. Through extensive experimentation, we identify an efficient +configuration of residual blocks, which enables gradients across all the layers +in the network that eventually results in efficient training. Our findings +suggest that the precise location of residual blocks plays a crucial role in +maximizing the performance gains in QuNNs. Our results mark a substantial step +forward in the evolution of quantum deep learning, offering new avenues for +both theoretical development and practical quantum computing applications. + +
+
+
+
+
+ + ♻ ☆ Stabilizing Extreme Q-learning by Maclaurin Expansion + + +
+ In offline reinforcement learning, in-sample learning methods have been +widely used to prevent performance degradation caused by evaluating +out-of-distribution actions from the dataset. Extreme Q-learning (XQL) employs +a loss function based on the assumption that Bellman error follows a Gumbel +distribution, enabling it to model the soft optimal value function in an +in-sample manner. It has demonstrated strong performance in both offline and +online reinforcement learning settings. However, issues remain, such as the +instability caused by the exponential term in the loss function and the risk of +the error distribution deviating from the Gumbel distribution. Therefore, we +propose Maclaurin Expanded Extreme Q-learning to enhance stability. In this +method, applying Maclaurin expansion to the loss function in XQL enhances +stability against large errors. This approach involves adjusting the modeled +value function between the value function under the behavior policy and the +soft optimal value function, thus achieving a trade-off between stability and +optimality depending on the order of expansion. It also enables adjustment of +the error distribution assumption from a normal distribution to a Gumbel +distribution. Our method significantly stabilizes learning in online RL tasks +from DM Control, where XQL was previously unstable. Additionally, it improves +performance in several offline RL tasks from D4RL. + +
+
+ comment: Accepted at RLC 2024: The first Reinforcement Learning Conference +
+
+
+
+
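+ The stabilisation described above can be pictured with the Gumbel-regression-style loss used in XQL, which (up to scaling) has the form exp(z) - z - 1 in the scaled error z. Truncating its Maclaurin series at order 2 leaves a squared-error term, and higher orders move back toward the exponential loss, which is the stability/optimality dial the abstract mentions. A schematic PyTorch sketch; the loss form is assumed from the XQL literature rather than copied from the paper:
+
+import math
+import torch
+
+def gumbel_regression_loss(z: torch.Tensor) -> torch.Tensor:
+    """XQL-style loss exp(z) - z - 1: unstable when z takes large positive values."""
+    return (torch.exp(z) - z - 1).mean()
+
+def maclaurin_gumbel_loss(z: torch.Tensor, order: int = 2) -> torch.Tensor:
+    """Maclaurin truncation of exp(z) - z - 1: sum over k = 2..order of z^k / k!."""
+    loss = torch.zeros_like(z)
+    for k in range(2, order + 1):
+        loss = loss + z.pow(k) / math.factorial(k)
+    return loss.mean()
+
+z = torch.randn(4096) * 3.0   # scaled Bellman-style errors with a heavy spread
+print("exp form:", gumbel_regression_loss(z).item())
+print("order 2 :", maclaurin_gumbel_loss(z, order=2).item())   # behaves like 0.5 * MSE
+print("order 4 :", maclaurin_gumbel_loss(z, order=4).item())   # closer to the exp form
+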
+ + ♻ ☆ An Effective Information Theoretic Framework for Channel Pruning + + +
+ Channel pruning is a promising method for accelerating and compressing convolutional neural networks. However, current pruning algorithms leave two problems unsolved: how to assign layer-wise pruning ratios properly, and how to discard the least important channels according to a convincing criterion. In this paper, we present a novel channel pruning approach via information theory and the interpretability of neural networks. Specifically, we regard information entropy as the expected amount of information for convolutional layers. In addition, if we view a matrix as a system of linear equations, a higher-rank matrix means there exist more solutions to it, which indicates more uncertainty. From the point of view of information theory, the rank can also describe the amount of information. In a neural network, considering the rank and entropy as two information indicators of convolutional layers, we propose a fusion function to reach a compromise between them, where the fusion results are defined as ``information concentration''. When pre-defining layer-wise pruning ratios, we employ the information concentration as a reference instead of heuristic and engineering tuning to provide a more interpretable solution. Moreover, we leverage Shapley values, which are a potent tool in the interpretability of neural networks, to evaluate the channel contributions and discard the least important channels for model compression while maintaining its performance. Extensive experiments demonstrate the effectiveness and promising performance of our method. For example, our method improves accuracy by 0.21% while reducing FLOPs by 45.5% and removing 40.3% of the parameters for ResNet-56 on CIFAR-10. Moreover, our method loses only 0.43%/0.11% in Top-1/Top-5 accuracy while reducing FLOPs by 41.6% and removing 35.0% of the parameters for ResNet-50 on ImageNet. + +
+
+
+
+
+ + ♻ ☆ Mamba3D: Enhancing Local Features for 3D Point Cloud Analysis via State + Space Model ACM MM 2024 + + +
+ Existing Transformer-based models for point cloud analysis suffer from +quadratic complexity, leading to compromised point cloud resolution and +information loss. In contrast, the newly proposed Mamba model, based on state +space models (SSM), outperforms Transformer in multiple areas with only linear +complexity. However, the straightforward adoption of Mamba does not achieve +satisfactory performance on point cloud tasks. In this work, we present +Mamba3D, a state space model tailored for point cloud learning to enhance local +feature extraction, achieving superior performance, high efficiency, and +scalability potential. Specifically, we propose a simple yet effective Local +Norm Pooling (LNP) block to extract local geometric features. Additionally, to +obtain better global features, we introduce a bidirectional SSM (bi-SSM) with +both a token forward SSM and a novel backward SSM that operates on the feature +channel. Extensive experimental results show that Mamba3D surpasses +Transformer-based counterparts and concurrent works in multiple tasks, with or +without pre-training. Notably, Mamba3D achieves multiple SoTA, including an +overall accuracy of 92.6% (train from scratch) on the ScanObjectNN and 95.1% +(with single-modal pre-training) on the ModelNet40 classification task, with +only linear complexity. Our code and weights are available at +https://github.com/xhanxu/Mamba3D. + +
+
+ comment: ACM MM 2024. Code and weights are available at + https://github.com/xhanxu/Mamba3D +
+
+
+
+
+ + ♻ ☆ Identifying Weight-Variant Latent Causal Models + + +
+ The task of causal representation learning aims to uncover latent higher-level causal representations that affect lower-level observations. Identifying true latent causal representations from observed data, while allowing instantaneous causal relations among latent variables, remains a challenge, however. To this end, we start from the analysis of three intrinsic properties in identifying latent space from observations: transitivity, permutation indeterminacy, and scaling indeterminacy. We find that transitivity plays a key role in impeding the identifiability of latent causal representations. To address the unidentifiability issue caused by transitivity, we introduce a novel identifiability condition where the underlying latent causal model satisfies a linear-Gaussian model, in which the causal coefficients and the distribution of Gaussian noise are modulated by an additional observed variable. Under some mild assumptions, we can show that the latent causal representations can be identified up to trivial permutation and scaling. Furthermore, based on this theoretical result, we propose a novel method, termed Structural caUsAl Variational autoEncoder, which directly learns latent causal representations and causal relationships among them, together with the mapping from the latent causal variables to the observed ones. We show that the proposed method learns the true parameters asymptotically. Experimental results on synthetic and real data demonstrate the identifiability and consistency results and the efficacy of the proposed method in learning latent causal representations. + +
+
+
+
+
+ + ♻ ☆ Deep Learning-based Target-To-User Association in Integrated Sensing and + Communication Systems + + +
+ In Integrated Sensing and Communication (ISAC) systems, matching the radar +targets with communication user equipments (UEs) is functional to several +communication tasks, such as proactive handover and beam prediction. In this +paper, we consider a radar-assisted communication system where a base station +(BS) is equipped with a multiple-input-multiple-output (MIMO) radar that has a +double aim: (i) associate vehicular radar targets to vehicular equipments (VEs) +in the communication beamspace and (ii) predict the beamforming vector for each +VE from radar data. The proposed target-to-user (T2U) association consists of +two stages. First, vehicular radar targets are detected from range-angle +images, and, for each, a beamforming vector is estimated. Then, the inferred +per-target beamforming vectors are matched with the ones utilized at the BS for +communication to perform target-to-user (T2U) association. Joint multi-target +detection and beam inference is obtained by modifying the you only look once +(YOLO) model, which is trained over simulated range-angle radar images. +Simulation results over different urban vehicular mobility scenarios show that +the proposed T2U method provides a probability of correct association that +increases with the size of the BS antenna array, highlighting the respective +increase of the separability of the VEs in the beamspace. Moreover, we show +that the modified YOLO architecture can effectively perform both beam +prediction and radar target detection, with similar performance in mean average +precision on the latter over different antenna array sizes. + +
+
+
+
+
+ + ♻ ☆ Barlow Twins Deep Neural Network for Advanced 1D Drug-Target Interaction + Prediction + + +
+ Accurate prediction of drug-target interactions is critical for advancing +drug discovery. By reducing time and cost, machine learning and deep learning +can accelerate this laborious discovery process. In a novel approach, +BarlowDTI, we utilise the powerful Barlow Twins architecture for +feature-extraction while considering the structure of the target protein. Our +method achieves state-of-the-art predictive performance against multiple +established benchmarks using only one-dimensional input. The use of gradient +boosting machine as the underlying predictor ensures fast and efficient +predictions without the need for substantial computational resources. We also +investigate how the model reaches its decision based on individual training +samples. By comparing co-crystal structures, we find that BarlowDTI effectively +exploits catalytically active and stabilising residues, highlighting the +model's ability to generalise from one-dimensional input data. In addition, we +further benchmark new baselines against existing methods. Together, these +innovations improve the efficiency and effectiveness of drug-target interaction +predictions, providing robust tools for accelerating drug development and +deepening the understanding of molecular interactions. Therefore, we provide an +easy-to-use web interface that can be freely accessed at +https://www.bio.nat.tum.de/oc2/barlowdti . + +
+
+ comment: Refined model architecture, additional results added +
+
+
+
+
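+ For reference, the Barlow Twins objective that the model above builds on drives the cross-correlation matrix of two embedding views toward the identity: diagonal entries to 1 (invariance) and off-diagonal entries to 0 (redundancy reduction). A standard PyTorch rendering of that generic loss, not the BarlowDTI code:
+
+import torch
+
+def barlow_twins_loss(z1: torch.Tensor, z2: torch.Tensor, lambd: float = 5e-3) -> torch.Tensor:
+    """z1, z2: (N, D) embeddings of two views of the same batch."""
+    n, d = z1.shape
+    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)   # standardise each embedding dimension
+    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
+    c = (z1.T @ z2) / n                            # (D, D) cross-correlation matrix
+    on_diag = (torch.diagonal(c) - 1).pow(2).sum() # push matching-dimension correlations to 1
+    off_diag = (c - torch.diag_embed(torch.diagonal(c))).pow(2).sum()  # decorrelate the rest
+    return on_diag + lambd * off_diag
+
+print(barlow_twins_loss(torch.randn(128, 64), torch.randn(128, 64)))
+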
+ + ♻ ☆ TRNet: Two-level Refinement Network leveraging Speech Enhancement for + Noise Robust Speech Emotion Recognition + + +
+ One persistent challenge in Speech Emotion Recognition (SER) is the +ubiquitous environmental noise, which frequently results in deteriorating SER +performance in practice. In this paper, we introduce a Two-level Refinement +Network, dubbed TRNet, to address this challenge. Specifically, a pre-trained +speech enhancement module is employed for front-end noise reduction and noise +level estimation. Later, we utilize clean speech spectrograms and their +corresponding deep representations as reference signals to refine the +spectrogram distortion and representation shift of enhanced speech during model +training. Experimental results validate that the proposed TRNet substantially +promotes the robustness of the proposed system in both matched and unmatched +noisy environments, without compromising its performance in noise-free +environments. + +
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ DiffLoad: Uncertainty Quantification in Electrical Load Forecasting with + the Diffusion Model + + +
+ Electrical load forecasting plays a crucial role in decision-making for power +systems, including unit commitment and economic dispatch. The integration of +renewable energy sources and the occurrence of external events, such as the +COVID-19 pandemic, have rapidly increased uncertainties in load forecasting. +The uncertainties in load forecasting can be divided into two types: epistemic +uncertainty and aleatoric uncertainty. Separating these types of uncertainties +can help decision-makers better understand where and to what extent the +uncertainty is, thereby enhancing their confidence in the following +decision-making. This paper proposes a diffusion-based Seq2Seq structure to +estimate epistemic uncertainty and employs the robust additive Cauchy +distribution to estimate aleatoric uncertainty. Our method not only ensures the +accuracy of load forecasting but also demonstrates the ability to separate the +two types of uncertainties and be applicable to different levels of loads. The +relevant code can be found at +\url{https://anonymous.4open.science/r/DiffLoad-4714/}. + +
+
+ comment: Accepted by IEEE Transactions on Power Systems, 2024 +
+
+
+
+
+ + ♻ ☆ Near-Optimal Policy Identification in Robust Constrained Markov Decision + Processes via Epigraph Form + + +
+ Designing a safe policy for uncertain environments is crucial in real-world +control applications. However, this challenge remains inadequately addressed +within the Markov decision process (MDP) framework. This paper presents the +first algorithm capable of identifying a near-optimal policy in a robust +constrained MDP (RCMDP), where an optimal policy minimizes cumulative cost +while satisfying constraints in the worst-case scenario across a set of +environments. We first prove that the conventional Lagrangian max-min +formulation with policy gradient methods can become trapped in suboptimal +solutions by encountering a sum of conflicting gradients from the objective and +constraint functions during its inner minimization problem. To address this, we +leverage the epigraph form of the RCMDP problem, which resolves the conflict by +selecting a single gradient from either the objective or the constraints. +Building on the epigraph form, we propose a binary search algorithm with a +policy gradient subroutine and prove that it identifies an +$\varepsilon$-optimal policy in an RCMDP with +$\tilde{\mathcal{O}}(\varepsilon^{-4})$ policy evaluations. + +
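The epigraph-form idea can be read as an outer bisection over a cost threshold, with a policy-gradient subroutine asked only whether that threshold is achievable under the worst-case constraints. The skeleton below is one plausible rendering of that outer loop; solve_subproblem is a toy oracle standing in for the paper's policy-gradient subroutine, and the bounds and tolerance are arbitrary.

TOY_OPTIMUM = 3.7   # stand-in for the best achievable worst-case objective value

def solve_subproblem(b: float) -> bool:
    # In the real algorithm this would run robust constrained policy optimisation
    # and report whether some policy attains worst-case cost <= b while feasible;
    # here we simply compare against a known toy optimum.
    return b >= TOY_OPTIMUM

def epigraph_binary_search(b_lo=0.0, b_hi=10.0, eps=1e-3):
    while b_hi - b_lo > eps:
        b_mid = 0.5 * (b_lo + b_hi)
        if solve_subproblem(b_mid):
            b_hi = b_mid   # threshold achievable: tighten it
        else:
            b_lo = b_mid   # not achievable: relax it
    return b_hi

print(round(epigraph_binary_search(), 3))   # converges to ~3.7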
+
+
+
+
+ + ♻ ☆ EuroPED-NN: Uncertainty aware surrogate model + + +
+ This work successfully generates an uncertainty-aware surrogate model of the EuroPED plasma pedestal model using the Bayesian neural network with noise contrastive prior (BNN-NCP) technique. This model is trained using data from the JET-ILW pedestal database and subsequent model evaluations, forming EuroPED-NN. The BNN-NCP technique has been proven to be a suitable method for generating uncertainty-aware surrogate models. It matches the output results of a regular neural network while providing confidence estimates for predictions as uncertainties. Additionally, it highlights out-of-distribution (OOD) regions using surrogate model uncertainties. This provides critical insights into model robustness and reliability. EuroPED-NN has been physically validated, first by analyzing electron density $n_e\!\left(\psi_{\text{pol}}=0.94\right)$ with respect to increasing plasma current, $I_p$, and second by validating the $\Delta-\beta_{p,ped}$ relation associated with the EuroPED model. This affirms the robustness of the underlying physics learned by the surrogate model. On top of that, the method was used to develop a EuroPED-like model fed with experimental data, i.e., an uncertainty-aware experimental model, which is functional on the JET database. Both models have also been tested on $\sim 50$ AUG shots.
+
+
+
+
+ + ♻ ☆ The Initial Screening Order Problem + + +
+ We investigate the role of the initial screening order (ISO) in candidate screening tasks, such as employee hiring and academic admissions, in which a screener is tasked with selecting $k$ candidates from a candidate pool. The ISO refers to the order in which the screener searches the candidate pool. Today, it is common for the ISO to be the product of an information access system, such as an online platform or a database query. The ISO has been largely overlooked in the literature, despite its potential impact on the optimality and fairness of the chosen $k$ candidates, especially under a human screener. We define two problem formulations describing the search behavior of the screener under the ISO: the best-$k$, where the screener selects the $k$ best candidates; and the good-$k$, where the screener selects the first $k$ good-enough candidates. To study the impact of the ISO, we introduce a human-like screener and compare it to its algorithmic counterpart, where the human-like screener is conceived to be inconsistent over time due to fatigue. In particular, our analysis shows that the ISO, under a human-like screener solving for the good-$k$ problem, hinders individual fairness despite meeting group-level fairness, and hampers the optimality of the selected $k$ candidates. This is due to position bias, where a candidate's evaluation is affected by its position within the ISO. We report extensive simulated experiments exploring the parameters of the best-$k$ and good-$k$ problems for the algorithmic and human-like screeners. The simulation framework is flexible enough to account for multiple screening settings, being an alternative to running real-world candidate screening procedures. This work is motivated by a real-world candidate screening problem studied in collaboration with a European company.
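The best-$k$ / good-$k$ distinction is easy to picture with a toy simulation (this is not the paper's simulation framework; the scores, threshold, and pool size are invented): the best-$k$ selection ignores the ISO entirely, while the good-$k$ selection is determined by it.

import numpy as np

rng = np.random.default_rng(0)
scores = rng.uniform(size=100)        # latent candidate quality
iso = rng.permutation(100)            # initial screening order from some access system
k, threshold = 5, 0.7                 # pick 5; "good enough" means score >= 0.7

best_k = set(np.argsort(scores)[-k:])                          # order-independent
good_k = set([i for i in iso if scores[i] >= threshold][:k])   # depends on the ISO

print("overlap between best-k and good-k:", len(best_k & good_k))
print("mean score, best-k:", round(scores[list(best_k)].mean(), 3))
print("mean score, good-k:", round(scores[list(good_k)].mean(), 3))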
+
+
+
+
+ + ♻ ☆ Enabling Local Editing in Diffusion Models by Joint and Individual + Component Analysis BMVC2024 + + +
+ Recent advances in Diffusion Models (DMs) have led to significant progress in visual synthesis and editing tasks, establishing them as a strong competitor to Generative Adversarial Networks (GANs). However, the latent space of DMs is not as well understood as that of GANs. Recent research has focused on unsupervised semantic discovery in the latent space of DMs by leveraging the bottleneck layer of the denoising network, which has been shown to exhibit properties of a semantic latent space. However, these approaches are limited to discovering global attributes. In this paper, we address the challenge of local image manipulation in DMs and introduce an unsupervised method to factorize the latent semantics learned by the denoising network of pre-trained DMs. Given an arbitrary image and defined regions of interest, we utilize the Jacobian of the denoising network to establish a relation between the regions of interest and their corresponding subspaces in the latent space. Furthermore, we disentangle the joint and individual components of these subspaces to identify latent directions that enable local image manipulation. Once discovered, these directions can be applied to different images to produce semantically consistent edits, making our method suitable for practical applications. Experimental results on various datasets demonstrate that our method can produce semantic edits that are more localized and have better fidelity compared to the state-of-the-art.
+
+ comment: Accepted at BMVC2024 +
+
+
+
+
+ + ♻ ☆ Autonomous Payload Thermal Control SP + + +
+ In small satellites there is less room for heat control equipment, scientific instruments, and electronic components. Furthermore, the close proximity of electronic components makes power dissipation difficult, with the risk of not being able to control the temperature appropriately, reducing component lifetime and mission performance. To address this challenge, taking advantage of the increasing intelligence available on board satellites, an autonomous thermal control tool that uses deep reinforcement learning is proposed for learning the thermal control policy onboard. The tool was evaluated on a real space edge processing computer that will be used in a demonstration payload hosted on the International Space Station (ISS). The experimental results show that the proposed framework is able to learn to control the payload processing power to keep the temperature within operational ranges, complementing traditional thermal control systems.
+
+ comment: To be included in the proceedings of ESA's SPAICE conference at + ECSAT, UK, 2024 +
+
+
+
+
+ + ♻ ☆ Fast Robust Kernel Regression through Sign Gradient Descent with Early + Stopping + + +
+ Kernel ridge regression (KRR) is a generalization of linear ridge regression that is non-linear in the data, but linear in the model parameters. Here, we introduce an equivalent formulation of the objective function of KRR, which opens up both for replacing the ridge penalty with the $\ell_\infty$ and $\ell_1$ penalties and for studying kernel ridge regression from the perspective of gradient descent. Using the $\ell_\infty$ and $\ell_1$ penalties, we obtain robust and sparse kernel regression, respectively. We further study the similarities between explicitly regularized kernel regression and the solutions obtained by early stopping of iterative gradient-based methods, where we connect $\ell_\infty$ regularization to sign gradient descent, $\ell_1$ regularization to forward stagewise regression (also known as coordinate descent), and $\ell_2$ regularization to gradient descent, and, in the last case, theoretically bound the differences. We exploit the close relations between $\ell_\infty$ regularization and sign gradient descent, and between $\ell_1$ regularization and coordinate descent, to propose computationally efficient methods for robust and sparse kernel regression. We finally compare robust kernel regression through sign gradient descent to existing methods for robust kernel regression on five real data sets, demonstrating that our method is one to two orders of magnitude faster, without compromising accuracy.
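A rough numerical illustration of the $\ell_\infty$ connection drawn above (my own toy example, not the authors' implementation): under sign gradient descent every kernel coefficient moves by the same fixed amount per step, so stopping after $T$ steps caps $\|\alpha\|_\infty$ at the step size times $T$, mimicking an $\ell_\infty$ constraint.

import numpy as np

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, size=(60, 1))
y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=60)

K = np.exp(-0.5 * (X - X.T) ** 2)      # RBF kernel matrix (1-D inputs)

alpha = np.zeros(60)                   # kernel coefficients
lr, n_steps = 0.01, 200                # step size and early-stopping horizon
for _ in range(n_steps):
    grad = K @ (K @ alpha - y)         # gradient of 0.5 * ||y - K @ alpha||^2
    alpha -= lr * np.sign(grad)        # sign GD: identical step magnitude on every coordinate

print("max |alpha| bounded by lr * n_steps:", bool(np.abs(alpha).max() <= lr * n_steps + 1e-12))
print("training MSE:", round(float(np.mean((K @ alpha - y) ** 2)), 4))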
+
+ comment: Article arXiv:2306.16838v1 has been updated and split into two + articles: this article and arXiv:2311.01762. Thus, some of the content in + arXiv:2306.16838v1 is not a part of arXiv:2306.16838v2, but of + arXiv:2311.01762 +
+
+
+
+
+ + ♻ ☆ Simplifying the Theory on Over-Smoothing + + +
+ Graph convolutions have gained popularity due to their ability to efficiently operate on data with an irregular geometric structure. However, graph convolutions cause over-smoothing, which refers to representations becoming more similar with increased depth. Unfortunately, many different definitions and intuitions currently coexist, leading to research efforts focusing on incompatible directions. This paper attempts to align these directions by showing that over-smoothing is merely a special case of power iteration. This greatly simplifies the existing theory on over-smoothing, making it more accessible. Based on the theory, we provide a novel comprehensive definition of rank collapse as a generalized form of over-smoothing and introduce the rank-one distance as a corresponding metric. Our empirical evaluation of 14 commonly used methods shows that more models than were previously known suffer from this issue.
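The power-iteration view can be seen in a few lines (a toy demonstration, not the paper's formal argument): repeatedly applying a fixed normalised propagation matrix drives the node-feature matrix towards a rank-one object, which a rank-one distance computed from the singular values makes visible.

import numpy as np

rng = np.random.default_rng(0)
n = 30
A = (rng.uniform(size=(n, n)) < 0.2).astype(float)
A = np.maximum(A, A.T)                           # undirected toy graph
np.fill_diagonal(A, 1.0)                         # self-loops
P = A / A.sum(axis=1, keepdims=True)             # row-normalised propagation matrix

def rank_one_distance(H):
    # Relative distance of H to its best rank-one approximation.
    s = np.linalg.svd(H, compute_uv=False)
    return np.sqrt((s[1:] ** 2).sum() / (s ** 2).sum())

H = rng.normal(size=(n, 8))                      # initial node features
for depth in [1, 2, 4, 8, 16, 32]:
    print(depth, round(rank_one_distance(np.linalg.matrix_power(P, depth) @ H), 4))
# The distance shrinks as depth grows: stacked propagation acts as a power
# iteration towards a dominant direction, i.e. an over-smoothed, rank-one state.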
+
+
+
+
+ + ♻ ☆ BadMerging: Backdoor Attacks Against Model Merging CCS + + +
+ Fine-tuning pre-trained models for downstream tasks has led to a +proliferation of open-sourced task-specific models. Recently, Model Merging +(MM) has emerged as an effective approach to facilitate knowledge transfer +among these independently fine-tuned models. MM directly combines multiple +fine-tuned task-specific models into a merged model without additional +training, and the resulting model shows enhanced capabilities in multiple +tasks. Although MM provides great utility, it may come with security risks +because an adversary can exploit MM to affect multiple downstream tasks. +However, the security risks of MM have barely been studied. In this paper, we +first find that MM, as a new learning paradigm, introduces unique challenges +for existing backdoor attacks due to the merging process. To address these +challenges, we introduce BadMerging, the first backdoor attack specifically +designed for MM. Notably, BadMerging allows an adversary to compromise the +entire merged model by contributing as few as one backdoored task-specific +model. BadMerging comprises a two-stage attack mechanism and a novel +feature-interpolation-based loss to enhance the robustness of embedded +backdoors against the changes of different merging parameters. Considering that +a merged model may incorporate tasks from different domains, BadMerging can +jointly compromise the tasks provided by the adversary (on-task attack) and +other contributors (off-task attack) and solve the corresponding unique +challenges with novel attack designs. Extensive experiments show that +BadMerging achieves remarkable attacks against various MM algorithms. Our +ablation study demonstrates that the proposed attack designs can progressively +contribute to the attack performance. Finally, we show that prior defense +mechanisms fail to defend against our attacks, highlighting the need for more +advanced defense. + +
+
+ comment: To appear in ACM Conference on Computer and Communications Security + (CCS), 2024 +
+
+
+
+
+ + ♻ ☆ OriGen: Enhancing RTL Code Generation with Code-to-Code Augmentation and Self-Reflection + + +
+ Recent studies have demonstrated the significant potential of Large Language Models (LLMs) in generating Register Transfer Level (RTL) code, with notable advancements showcased by commercial models such as GPT-4 and Claude3-Opus. However, these proprietary LLMs often raise concerns regarding privacy and security. While open-source LLMs offer solutions to these concerns, they typically underperform commercial models in RTL code generation tasks, primarily due to the scarcity of high-quality open-source RTL datasets. To address this challenge, we introduce OriGen, a fully open-source framework that incorporates self-reflection capabilities and a novel dataset augmentation methodology for generating high-quality, large-scale RTL code. Our approach employs a code-to-code augmentation technique to enhance the quality of open-source RTL code datasets. Furthermore, OriGen can rectify syntactic errors through a self-reflection process that leverages compiler feedback. Experimental results demonstrate that OriGen significantly outperforms other open-source alternatives in RTL code generation. It surpasses the previous best-performing open-source LLM by 12.8% and even exceeds GPT-4 Turbo in the pass@1 metric on the VerilogEval-Human benchmark. Moreover, OriGen exhibits superior capabilities in self-reflection and error correction, outperforming GPT-4 by 19.9% on a benchmark designed to evaluate self-reflection capabilities.
+
+
+
+
+ + ♻ ☆ Biometrics and Behavior Analysis for Detecting Distractions in + e-Learning + + +
+ In this article, we explore computer vision approaches to detect abnormal head pose during e-learning sessions, and we introduce a study on the effects of mobile phone usage during these sessions. We utilize behavioral data collected from 120 learners monitored while participating in MOOC learning sessions. Our study focuses on the influence of phone-usage events on behavior and physiological responses, specifically attention, heart rate, and meditation, before, during, and after phone usage. Additionally, we propose an approach for estimating head pose events using images taken by the webcam during the MOOC learning sessions to detect phone-usage events. Our hypothesis suggests that head posture undergoes significant changes when learners interact with a mobile phone, contrasting with the typical behavior seen when learners face a computer during e-learning sessions. We propose an approach designed to detect deviations in head posture from the average observed during a learner's session, operating as a semi-supervised method. This system flags events indicating alterations in head posture for subsequent human review and the selection of mobile phone usage occurrences with a sensitivity of over 90%.
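The semi-supervised flagging step amounts to comparing each frame's head pose against the learner's own session statistics; a toy version of that idea follows (the angles, segment lengths, and z-score threshold are all invented here and are not the paper's values):

import numpy as np

rng = np.random.default_rng(1)
# Per-frame head pitch angles (degrees): mostly facing the screen, with a stretch
# of frames where the learner looks down, e.g. at a phone (synthetic data).
pitch = np.concatenate([rng.normal(0, 3, 400), rng.normal(-25, 4, 60), rng.normal(0, 3, 300)])

z = (pitch - pitch.mean()) / pitch.std()
flagged = np.where(np.abs(z) > 2.5)[0]      # frames deviating from the session average

print("frames flagged for human review:", len(flagged))
print("first flagged frame index:", int(flagged[0]) if len(flagged) else None)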
+
+ comment: Published in IEEE Intl. Symposium on Computers in Education (SIIE) + 2024 +
+
+
+
+
+ + ♻ ☆ VAAD: Visual Attention Analysis Dashboard applied to e-Learning + + +
+ In this paper, we present an approach in the Multimodal Learning Analytics +field. Within this approach, we have developed a tool to visualize and analyze +eye movement data collected during learning sessions in online courses. The +tool is named VAAD, an acronym for Visual Attention Analysis Dashboard. These +eye movement data have been gathered using an eye-tracker and subsequently +processed and visualized for interpretation. The purpose of the tool is to +conduct a descriptive analysis of the data by facilitating its visualization, +enabling the identification of differences and learning patterns among various +learner populations. Additionally, it integrates a predictive module capable of +anticipating learner activities during a learning session. Consequently, VAAD +holds the potential to offer valuable insights into online learning behaviors +from both descriptive and predictive perspectives. + +
+
+ comment: Published in IEEE Intl. Symposium on Computers in Education (SIIE) + 2024 +
+
+
+
+
+ + ♻ ☆ From Static to Dynamic Structures: Improving Binding Affinity Prediction + with Graph-Based Deep Learning + + +
+ Accurate prediction of protein-ligand binding affinities is an essential challenge in structure-based drug design. Despite recent advances in data-driven methods for affinity prediction, their accuracy is still limited, partially because they only take advantage of static crystal structures while the actual binding affinities are generally determined by the thermodynamic ensembles between proteins and ligands. One effective way to approximate such a thermodynamic ensemble is to use molecular dynamics (MD) simulation. Here, an MD dataset containing 3,218 different protein-ligand complexes is curated, and Dynaformer, a graph-based deep learning model, is further developed to predict the binding affinities by learning the geometric characteristics of the protein-ligand interactions from the MD trajectories. In silico experiments demonstrated that the model exhibits state-of-the-art scoring and ranking power on the CASF-2016 benchmark dataset, outperforming the methods hitherto reported. Moreover, in a virtual screening on heat shock protein 90 (HSP90) using Dynaformer, 20 candidates are identified and their binding affinities are further experimentally validated. Dynaformer displayed promising results in virtual drug screening, revealing 12 hit compounds (two are in the submicromolar range), including several novel scaffolds. Overall, these results demonstrated that the approach offers a promising avenue for accelerating the early drug discovery process.
+
+ comment: Update the content according to the published version on Advanced + Science (https://doi.org/10.1002/advs.202405404) +
+
+
+
+
+ + ♻ ☆ Directly Handling Missing Data in Linear Discriminant Analysis for + Enhancing Classification Accuracy and Interpretability + + +
+ As the adoption of Artificial Intelligence (AI) models expands into critical +real-world applications, ensuring the explainability of these models becomes +paramount, particularly in sensitive fields such as medicine and finance. +Linear Discriminant Analysis (LDA) remains a popular choice for classification +due to its interpretable nature, derived from its capacity to model class +distributions and enhance class separation through linear combinations of +features. However, real-world datasets often suffer from incomplete data, +posing substantial challenges for both classification accuracy and model +interpretability. In this paper, we introduce a novel and robust classification +method, termed Weighted missing Linear Discriminant Analysis (WLDA), which +extends LDA to handle datasets with missing values without the need for +imputation. Our approach innovatively incorporates a weight matrix that +penalizes missing entries, thereby refining parameter estimation directly on +incomplete data. This methodology not only preserves the interpretability of +LDA but also significantly enhances classification performance in scenarios +plagued by missing data. We conduct an in-depth theoretical analysis to +establish the properties of WLDA and thoroughly evaluate its explainability. +Experimental results across various datasets demonstrate that WLDA consistently +outperforms traditional methods, especially in challenging environments where +missing values are prevalent in both training and test datasets. This +advancement provides a critical tool for improving classification accuracy and +maintaining model transparency in the face of incomplete data. + +
+
+
+
+
+ + ♻ ☆ ERATTA: Extreme RAG for Table To Answers with Large Language Models + + +
+ Large language models (LLMs) with retrieval-augmented generation (RAG) have recently been the preferred choice for scalable generative AI solutions. Although RAG implemented with AI agents (agentic-RAG) has been recently popularized, it suffers from unstable costs and unreliable performance for enterprise-level data practices. Most existing use-cases that incorporate RAG with LLMs have been either generic or extremely domain specific, thereby questioning the scalability and generalizability of RAG-LLM approaches. In this work, we propose a unique LLM-based system where multiple LLMs can be invoked to enable data authentication, user-query routing, data retrieval and custom prompting for question-answering capabilities from enterprise data tables. The source tables here are highly fluctuating and large in size, and the proposed framework enables structured responses in under 10 seconds per query. Additionally, we propose a five-metric scoring module that detects and reports hallucinations in the LLM responses. Our proposed system and scoring metrics achieve >90% confidence scores across hundreds of user queries in the sustainability, financial health and social media domains. Extensions to the proposed extreme RAG architectures can enable heterogeneous source querying using LLMs.
+
+ comment: 5 pages, 4 tables, IEEE Big Data, 2024 +
+
+
+
+
+ + ♻ ☆ Disease Classification and Impact of Pretrained Deep Convolution Neural + Networks on Diverse Medical Imaging Datasets across Imaging Modalities + + +
+ Imaging techniques such as chest X-rays, whole slide images, and optical coherence tomography serve as initial screening and detection tools for a wide variety of medical conditions, including pulmonary and ophthalmic diseases. This paper investigates the intricacies of using pretrained deep convolutional neural networks with transfer learning across diverse medical imaging datasets with varying modalities for binary and multiclass classification. We conducted a comprehensive performance analysis with ten network architectures and model families, each with pretraining and random initialization. Our findings showed that the use of pretrained models as fixed feature extractors yields poor performance irrespective of the dataset. In contrast, histopathology microscopy whole slide images showed better performance. It was also found that deeper and more complex architectures did not necessarily result in the best performance. This observation implies that improvements on ImageNet do not translate directly to medical imaging tasks. Within a medical domain, the performance of the network architectures varies within model families with shifts in datasets. This indicates that the performance of models within a specific modality may not be conclusive for another modality within the same domain. This study provides a deeper understanding of the applications of deep learning techniques in medical imaging and highlights the impact of pretrained networks across different medical imaging datasets under five different experimental settings.
+
+ comment: 15 pages, 3 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ PeRFlow: Piecewise Rectified Flow as Universal Plug-and-Play Accelerator + + +
+ We present Piecewise Rectified Flow (PeRFlow), a flow-based method for +accelerating diffusion models. PeRFlow divides the sampling process of +generative flows into several time windows and straightens the trajectories in +each interval via the reflow operation, thereby approaching piecewise linear +flows. PeRFlow achieves superior performance in a few-step generation. +Moreover, through dedicated parameterizations, the PeRFlow models inherit +knowledge from the pretrained diffusion models. Thus, the training converges +fast and the obtained models show advantageous transfer ability, serving as +universal plug-and-play accelerators that are compatible with various workflows +based on the pre-trained diffusion models. Codes for training and inference are +publicly released. https://github.com/magic-research/piecewise-rectified-flow + +
+
+
+
+
+ + ♻ ☆ Instant Adversarial Purification with Adversarial Consistency + Distillation + + +
+ Neural networks, despite their remarkable performance in widespread applications, including image classification, are also known to be vulnerable to subtle adversarial noise. Although some diffusion-based purification methods, such as DiffPure, have been proposed, those methods are time-consuming. In this paper, we propose One Step Control Purification (OSCP), a diffusion-based purification model that can purify the adversarial image in one Neural Function Evaluation (NFE) in diffusion models. We use Latent Consistency Model (LCM) and ControlNet for our one-step purification. OSCP is computationally friendly and time efficient compared to other diffusion-based purification methods; we achieve a defense success rate of 74.19\% on ImageNet, requiring only 0.1s for each purification. Moreover, there is a fundamental incongruence between consistency distillation and adversarial perturbation. To address this ontological dissonance, we propose Gaussian Adversarial Noise Distillation (GAND), a novel consistency distillation framework that facilitates a more nuanced reconciliation of the latent space dynamics, effectively bridging the natural and adversarial manifolds. Our experiments show that GAND does not need a Full Fine Tune (FFT); PEFT (e.g., LoRA) is sufficient.
+
+
+
+
+ + ♻ ☆ MLR-Copilot: Autonomous Machine Learning Research based on Large + Language Models Agents + + +
+ Machine learning research, crucial for technological advancements and innovation, often faces significant challenges due to its inherent complexity, slow pace of experimentation, and the necessity for specialized expertise. Motivated by this, we present a new systematic framework, autonomous Machine Learning Research with large language models (MLR-Copilot), designed to enhance machine learning research productivity through the automatic generation and implementation of research ideas using Large Language Model (LLM) agents. The framework consists of three phases: research idea generation, experiment implementation, and implementation execution. First, existing research papers are used to generate hypotheses and experimental plans via IdeaAgent, powered by LLMs. Next, the implementation generation phase translates these plans into executables with ExperimentAgent. This phase leverages retrieved prototype code and optionally retrieves candidate models and data. Finally, the execution phase, also managed by ExperimentAgent, involves running experiments with mechanisms for human feedback and iterative debugging to enhance the likelihood of achieving executable research outcomes. We evaluate our framework on five machine learning research tasks, and the experimental results show the framework's potential to facilitate research progress and innovation.
+
+
+
+
+ + ♻ ☆ A Grey-box Attack against Latent Diffusion Model-based Image Editing by + Posterior Collapse + + +
+ Recent advancements in generative AI, particularly Latent Diffusion Models (LDMs), have revolutionized image synthesis and manipulation. However, these generative techniques raise concerns about data misappropriation and intellectual property infringement. Adversarial attacks on machine learning models have been extensively studied, and a well-established body of research has repurposed these techniques as a benign means of preventing the misuse of generative AI. Current approaches to safeguarding images from manipulation by LDMs are limited by their reliance on model-specific knowledge and their inability to significantly degrade the semantic quality of generated images. In response to these shortcomings, we propose the Posterior Collapse Attack (PCA), based on the observation that VAEs suffer from posterior collapse during training. Our method minimizes dependence on the white-box information of target models, eliminating the implicit reliance on model-specific knowledge. By accessing only a small portion of the LDM parameters, specifically the VAE encoder, our method causes a substantial semantic collapse in generation quality, particularly in perceptual consistency, and demonstrates strong transferability across various model architectures. Experimental results show that PCA achieves superior perturbation effects on image generation of LDMs with lower runtime and VRAM. Our method outperforms existing techniques, offering a more robust and generalizable solution that is helpful in alleviating the socio-technical challenges posed by the rapidly evolving landscape of generative AI.
+
+ comment: 21 pages, 7 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Pearl: A Production-ready Reinforcement Learning Agent + + +
+ Reinforcement learning (RL) is a versatile framework for optimizing long-term +goals. Although many real-world problems can be formalized with RL, learning +and deploying a performant RL policy requires a system designed to address +several important challenges, including the exploration-exploitation dilemma, +partial observability, dynamic action spaces, and safety concerns. While the +importance of these challenges has been well recognized, existing open-source +RL libraries do not explicitly address them. This paper introduces Pearl, a +Production-Ready RL software package designed to embrace these challenges in a +modular way. In addition to presenting benchmarking results, we also highlight +examples of Pearl's ongoing industry adoption to demonstrate its advantages for +production use cases. Pearl is open sourced on GitHub at +github.com/facebookresearch/pearl and its official website is +pearlagent.github.io. + +
+
+
+
+
+ + ♻ ☆ Fault Tolerant ML: Efficient Meta-Aggregation and Synchronous Training + + +
+ In this paper, we investigate the challenging framework of Byzantine-robust training in distributed machine learning (ML) systems, focusing on enhancing both efficiency and practicality. As distributed ML systems become integral for complex ML tasks, ensuring resilience against Byzantine failures, where workers may contribute incorrect updates due to malice or error, gains paramount importance. Our first contribution is the introduction of the Centered Trimmed Meta Aggregator (CTMA), an efficient meta-aggregator that upgrades baseline aggregators to optimal performance levels, while requiring low computational demands. Additionally, we propose harnessing a recently developed gradient estimation technique based on a double-momentum strategy within the Byzantine context. Our paper highlights its theoretical and practical advantages for Byzantine-robust training, especially in simplifying the tuning process and reducing the reliance on numerous hyperparameters. The effectiveness of this technique is supported by theoretical insights within the stochastic convex optimization (SCO) framework and corroborated by empirical evidence.
+
+
+
+
+ + ♻ ☆ An Idiosyncrasy of Time-discretization in Reinforcement Learning + + +
+ Many reinforcement learning algorithms are built on an assumption that an +agent interacts with an environment over fixed-duration, discrete time steps. +However, physical systems are continuous in time, requiring a choice of +time-discretization granularity when digitally controlling them. Furthermore, +such systems do not wait for decisions to be made before advancing the +environment state, necessitating the study of how the choice of discretization +may affect a reinforcement learning algorithm. In this work, we consider the +relationship between the definitions of the continuous-time and discrete-time +returns. Specifically, we acknowledge an idiosyncrasy with naively applying a +discrete-time algorithm to a discretized continuous-time environment, and note +how a simple modification can better align the return definitions. This +observation is of practical consideration when dealing with environments where +time-discretization granularity is a choice, or situations where such +granularity is inherently stochastic. + +
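One concrete way to picture the return mismatch (an illustration under my own reading, not the paper's exact construction): with step size dt, a naive discrete return discounts by gamma per step regardless of dt, while discounting by gamma**dt per step (and weighting rewards by dt) stays consistent with the continuous-time return as dt changes.

import numpy as np

def naive_return(rewards, gamma):
    return sum(gamma ** k * r for k, r in enumerate(rewards))

def dt_aware_return(rewards, gamma, dt):
    # Discount by gamma**dt per step and weight by dt, approximating the
    # continuous-time integral of gamma**t * r(t).
    return sum(gamma ** (k * dt) * r * dt for k, r in enumerate(rewards))

gamma, horizon = 0.99, 10.0
for dt in [1.0, 0.1, 0.01]:
    rewards = np.ones(int(horizon / dt))     # constant reward rate of 1 per unit time
    print(dt, round(naive_return(rewards, gamma), 2), round(dt_aware_return(rewards, gamma, dt), 2))
# The naive return changes drastically with dt, while the dt-aware return stays
# near the continuous-time value (1 - gamma**10) / (-log(gamma)), about 9.5.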
+
+ comment: RLC 2024 +
+
+
+
+
+ + ♻ ☆ On the Benefits of Public Representations for Private Transfer Learning + under Distribution Shift + + +
+ Public pretraining is a promising approach to improve differentially private +model training. However, recent work has noted that many positive research +results studying this paradigm only consider in-distribution tasks, and may not +apply to settings where there is distribution shift between the pretraining and +finetuning data -- a scenario that is likely when finetuning private tasks due +to the sensitive nature of the data. In this work, we show empirically across +three tasks that even in settings with large distribution shift, where both +zero-shot performance from public data and training from scratch with private +data give unusably weak results, public features can in fact improve private +training accuracy by up to 67\% over private training from scratch. We provide +a theoretical explanation for this phenomenon, showing that if the public and +private data share a low-dimensional representation, public representations can +improve the sample complexity of private training even if it is impossible to +learn the private task from the public data alone. Altogether, our results +provide evidence that public data can indeed make private training practical in +realistic settings of extreme distribution shift. + +
+
+
+
+
+ + ♻ ☆ From Wide to Deep: Dimension Lifting Network for Parameter-efficient + Knowledge Graph Embedding + + +
+ Knowledge graph embedding (KGE), which maps entities and relations into vector representations, is essential for downstream applications. Conventional KGE methods require high-dimensional representations to learn the complex structure of a knowledge graph, but this leads to oversized model parameters. Recent advances reduce parameters by using low-dimensional entity representations, while developing techniques (e.g., knowledge distillation or reinvented representation forms) to compensate for the reduced dimension. However, such operations introduce complicated computations and model designs that may not benefit large knowledge graphs. To seek a simple strategy to improve the parameter efficiency of conventional KGE models, we take inspiration from the observation that deeper neural networks require exponentially fewer parameters to achieve expressiveness comparable to wider networks for compositional structures. We view all entity representations as a single-layer embedding network, and conventional KGE methods that adopt high-dimensional entity representations are equivalent to widening the embedding network to gain expressiveness. To achieve parameter efficiency, we instead propose a deeper embedding network for entity representations, i.e., a narrow entity embedding layer plus a multi-layer dimension lifting network (LiftNet). Experiments on three public datasets show that by integrating LiftNet, four conventional KGE methods with 16-dimensional representations achieve link prediction accuracy comparable to the original models that adopt 512-dimensional representations, saving 68.4% to 96.9% of parameters.
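A minimal sketch of the narrow-embedding-plus-lifting pattern described above (my own toy rendering in PyTorch, not the authors' code; the hidden layer sizes are arbitrary): entities get a 16-dimensional embedding that a small multi-layer network lifts to the dimension the scoring function expects.

import torch
import torch.nn as nn

class LiftedEntityEmbedding(nn.Module):
    """Narrow entity embedding followed by a dimension-lifting MLP."""
    def __init__(self, num_entities: int, narrow_dim: int = 16, lifted_dim: int = 512):
        super().__init__()
        self.embed = nn.Embedding(num_entities, narrow_dim)
        self.lift = nn.Sequential(
            nn.Linear(narrow_dim, 128), nn.ReLU(),
            nn.Linear(128, lifted_dim),
        )

    def forward(self, entity_ids: torch.Tensor) -> torch.Tensor:
        return self.lift(self.embed(entity_ids))

model = LiftedEntityEmbedding(num_entities=10_000)
print(model(torch.tensor([3, 42, 9_999])).shape)    # torch.Size([3, 512])
# Roughly 0.23M parameters, versus 10_000 * 512 = 5.12M for direct 512-d embeddings.
print(sum(p.numel() for p in model.parameters()))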
+
+
+
+
+ + ♻ ☆ TrajDeleter: Enabling Trajectory Forgetting in Offline Reinforcement + Learning Agents NDSS 2025 + + +
+ Reinforcement learning (RL) trains an agent from experiences interacting with the environment. In scenarios where online interactions are impractical, offline RL, which trains the agent using pre-collected datasets, has become popular. While this new paradigm presents remarkable effectiveness across various real-world domains, like healthcare and energy management, there is a growing demand to enable agents to rapidly and completely eliminate the influence of specific trajectories from both the training dataset and the trained agents. To address this problem, this paper advocates Trajdeleter, the first practical approach to trajectory unlearning for offline RL agents. The key idea of Trajdeleter is to guide the agent to demonstrate deteriorating performance when it encounters states associated with unlearning trajectories. Simultaneously, it ensures the agent maintains its original performance level when facing other remaining trajectories. Additionally, we introduce Trajauditor, a simple yet efficient method to evaluate whether Trajdeleter successfully eliminates the influence of the specific trajectories from the offline RL agent. Extensive experiments conducted on six offline RL algorithms and three tasks demonstrate that Trajdeleter requires only about 1.5% of the time needed for retraining from scratch. It effectively unlearns an average of 94.8% of the targeted trajectories yet still performs well in actual environment interactions after unlearning. The replication package and agent parameters are available online.
+
+ comment: Accepted at NDSS 2025. The presented document here is the full + version of our paper +
+
+
+
+
+ + ♻ ☆ Blending Neural Operators and Relaxation Methods in PDE Numerical + Solvers + + +
+ Neural networks suffer from spectral bias, having difficulty representing the high-frequency components of a function, while relaxation methods can resolve high frequencies efficiently but stall at moderate to low frequencies. We address the weaknesses of the two approaches by combining them synergistically to develop a fast numerical solver of partial differential equations (PDEs) at scale. Specifically, we propose HINTS, a hybrid, iterative, numerical, and transferable solver, by integrating a Deep Operator Network (DeepONet) with standard relaxation methods, leading to parallel efficiency and algorithmic scalability for a wide class of PDEs not tractable with existing monolithic solvers. HINTS balances the convergence behavior across the spectrum of eigenmodes by utilizing the spectral bias of DeepONet, resulting in a uniform convergence rate and hence exceptional performance of the hybrid solver overall. Moreover, HINTS applies to large-scale, multidimensional systems, and it is flexible with regard to discretizations, computational domain, and boundary conditions.
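A schematic of the hybrid iteration on a 1-D Poisson problem, under my reading of the pattern described above (this is not the HINTS code; the surrogate_correction below is a low-frequency stand-in for the DeepONet, and the sweep/correction schedule is arbitrary):

import numpy as np

n, h = 63, 1.0 / 64
x = np.linspace(h, 1 - h, n)                                  # interior grid points
f = np.sin(2 * np.pi * x) + 10 * np.sin(16 * np.pi * x)       # low- and high-frequency source
A = 2 * np.eye(n) - np.eye(n, k=1) - np.eye(n, k=-1)          # h^2 * (discrete -d^2/dx^2)

def jacobi_sweep(u):
    # One Jacobi relaxation sweep for -u'' = f with zero Dirichlet boundaries.
    u_new = u.copy()
    u_new[1:-1] = 0.5 * (u[:-2] + u[2:] + h * h * f[1:-1])
    u_new[0] = 0.5 * (u[1] + h * h * f[0])
    u_new[-1] = 0.5 * (u[-2] + h * h * f[-1])
    return u_new

def surrogate_correction(residual, n_modes=4):
    # Stand-in for the DeepONet corrector: solve the residual equation exactly on
    # the lowest few sine modes, i.e. the frequencies where relaxation stalls.
    k = np.arange(1, n_modes + 1)
    modes = np.sin(np.pi * np.outer(k, x))
    coeffs = 2 * h * (modes @ residual)
    return (coeffs / (np.pi * k) ** 2) @ modes

u = np.zeros(n)
for it in range(1, 201):
    if it % 20 == 0:
        u = u + surrogate_correction(f - (A @ u) / (h * h))   # periodic surrogate correction
    else:
        u = jacobi_sweep(u)                                   # relaxation handles high frequencies
print("final residual norm:", round(float(np.linalg.norm(f - (A @ u) / (h * h))), 6))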
+
+ comment: Main text: 17 pages, 6 figures. Supplementary Information: 30 pages, + 8 figures, 2 tables, 4 algorithms +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Spectron: Target Speaker Extraction using Conditional Transformer with + Adversarial Refinement + + +
+ Recently, attention-based transformers have become a de facto standard in many deep learning applications, including natural language processing, computer vision, and signal processing. In this paper, we propose a transformer-based end-to-end model to extract a target speaker's speech from a monaural multi-speaker mixed audio signal. Unlike existing speaker extraction methods, we introduce two additional objectives to impose speaker embedding consistency and waveform encoder invertibility, and jointly train both the speaker encoder and speech separator to better capture the speaker conditional embedding. Furthermore, we leverage a multi-scale discriminator to refine the perceptual quality of the extracted speech. Our experiments show that the use of a dual path transformer in the separator backbone, along with the proposed training paradigm, improves the CNN baseline by $3.12$ dB points. Finally, we compare our approach with recent state-of-the-art methods and show that our model outperforms existing methods by $4.1$ dB points on average without creating additional data dependency.
+
+
+
+
+ + ☆ Multi-Reference Generative Face Video Compression with Contrastive + Learning + + +
+ Generative face video coding (GFVC) has been demonstrated as a potential +approach to low-latency, low bitrate video conferencing. GFVC frameworks +achieve an extreme gain in coding efficiency with over 70% bitrate savings when +compared to conventional codecs at bitrates below 10kbps. In recent MPEG/JVET +standardization efforts, all the information required to reconstruct video +sequences using GFVC frameworks are adopted as part of the supplemental +enhancement information (SEI) in existing compression pipelines. In light of +this development, we aim to address a challenge that has been weakly addressed +in prior GFVC frameworks, i.e., reconstruction drift as the distance between +the reference and target frames increases. This challenge creates the need to +update the reference buffer more frequently by transmitting more Intra-refresh +frames, which are the most expensive element of the GFVC bitstream. To overcome +this problem, we propose instead multiple reference animation as a robust +approach to minimizing reconstruction drift, especially when used in a +bi-directional prediction mode. Further, we propose a contrastive learning +formulation for multi-reference animation. We observe that using a contrastive +learning framework enhances the representation capabilities of the animation +generator. The resulting framework, MRDAC (Multi-Reference Deep Animation +Codec) can therefore be used to compress longer sequences with fewer reference +frames or achieve a significant gain in reconstruction accuracy at comparable +bitrates to previous frameworks. Quantitative and qualitative results show +significant coding and reconstruction quality gains compared to previous GFVC +methods, and more accurate animation quality in presence of large pose and +facial expression changes. + +
+
+
+
+
+ + ☆ Interpretable Convolutional SyncNet + + +
+ Because videos in the wild can be out of sync for various reasons, a sync-net +is used to bring the video back into sync for tasks that require synchronized +videos. Previous state-of-the-art (SOTA) sync-nets use InfoNCE loss, rely on +the transformer architecture, or both. Unfortunately, the former makes the +model's output difficult to interpret, and the latter is unfriendly with large +images, thus limiting the usefulness of sync-nets. In this work, we train a +convolutional sync-net using the balanced BCE loss (BBCE), a loss inspired by +the binary cross entropy (BCE) and the InfoNCE losses. In contrast to the +InfoNCE loss, the BBCE loss does not require complicated sampling schemes. Our +model can better handle larger images, and its output can be given a +probabilistic interpretation. The probabilistic interpretation allows us to +define metrics such as probability at offset and offscreen ratio to evaluate +the sync quality of audio-visual (AV) speech datasets. Furthermore, our model +achieves SOTA accuracy of $96.5\%$ on the LRS2 dataset and $93.8\%$ on the LRS3 +dataset. + +
+
+ comment: 8+5 pages +
+
+
+
+
+ + ♻ ☆ Inter-Frame Compression for Dynamic Point Cloud Geometry Coding + + +
+ Efficient point cloud compression is essential for applications like virtual +and mixed reality, autonomous driving, and cultural heritage. This paper +proposes a deep learning-based inter-frame encoding scheme for dynamic point +cloud geometry compression. We propose a lossy geometry compression scheme that +predicts the latent representation of the current frame using the previous +frame by employing a novel feature space inter-prediction network. The proposed +network utilizes sparse convolutions with hierarchical multiscale 3D feature +learning to encode the current frame using the previous frame. The proposed +method introduces a novel predictor network for motion compensation in the +feature domain to map the latent representation of the previous frame to the +coordinates of the current frame to predict the current frame's feature +embedding. The framework transmits the residual of the predicted features and +the actual features by compressing them using a learned probabilistic +factorized entropy model. At the receiver, the decoder hierarchically +reconstructs the current frame by progressively rescaling the feature +embedding. The proposed framework is compared to the state-of-the-art +Video-based Point Cloud Compression (V-PCC) and Geometry-based Point Cloud +Compression (G-PCC) schemes standardized by the Moving Picture Experts Group +(MPEG). The proposed method achieves more than 88% BD-Rate (Bjontegaard Delta +Rate) reduction against G-PCCv20 Octree, more than 56% BD-Rate savings against +G-PCCv20 Trisoup, more than 62% BD-Rate reduction against V-PCC intra-frame +encoding mode, and more than 52% BD-Rate savings against V-PCC P-frame-based +inter-frame encoding mode using HEVC. These significant performance gains are +cross-checked and verified in the MPEG working group. + +
+
+
+
+
+ + ♻ ☆ MCDubber: Multimodal Context-Aware Expressive Video Dubbing SC2024 + + +
+ Automatic Video Dubbing (AVD) aims to take the given script and generate +speech that aligns with lip motion and prosody expressiveness. Current AVD +models mainly utilize visual information of the current sentence to enhance the +prosody of synthesized speech. However, it is crucial to consider whether the +prosody of the generated dubbing aligns with the multimodal context, as the +dubbing will be combined with the original context in the final video. This +aspect has been overlooked in previous studies. To address this issue, we +propose a Multimodal Context-aware video Dubbing model, termed +\textbf{MCDubber}, to convert the modeling object from a single sentence to a +longer sequence with context information to ensure the consistency of the +global context prosody. MCDubber comprises three main components: (1) A context +duration aligner aims to learn the context-aware alignment between the text and +lip frames; (2) A context prosody predictor seeks to read the global context +visual sequence and predict the context-aware global energy and pitch; (3) A +context acoustic decoder ultimately predicts the global context mel-spectrogram +with the assistance of adjacent ground-truth mel-spectrograms of the target +sentence. Through this process, MCDubber fully considers the influence of +multimodal context on the prosody expressiveness of the current sentence when +dubbing. The extracted mel-spectrogram belonging to the target sentence from +the output context mel-spectrograms is the final required dubbing audio. +Extensive experiments on the Chem benchmark dataset demonstrate that our +MCDubber significantly improves dubbing expressiveness compared to all advanced +baselines. The code and demos are available at +https://github.com/XiaoYuanJun-zy/MCDubber. + +
+
+ comment: Accepted by NCMMSC2024 +
+
+
+
+
+ + ♻ ☆ Show Me the World in My Language: Establishing the First Baseline for + Scene-Text to Scene-Text Translation ICPR 2024 + + +
+ In this work, we study the task of ``visually'' translating scene text from a +source language (e.g., Hindi) to a target language (e.g., English). Visual +translation involves not just the recognition and translation of scene text but +also the generation of the translated image that preserves visual features of +the source scene text, such as font, size, and background. There are several +challenges associated with this task, such as translation with limited context, +deciding between translation and transliteration, accommodating varying text +lengths within fixed spatial boundaries, and preserving the font and background +styles of the source scene text in the target language. To address this +problem, we make the following contributions: (i) We study visual translation +as a standalone problem for the first time in the literature. (ii) We present a +cascaded framework for visual translation that combines state-of-the-art +modules for scene text recognition, machine translation, and scene text +synthesis as a baseline for the task. (iii) We propose a set of task-specific +design enhancements to design a variant of the baseline to obtain performance +improvements. (iv) Currently, the existing related literature lacks any +comprehensive performance evaluation for this novel task. To fill this gap, we +introduce several automatic and user-assisted evaluation metrics designed +explicitly for evaluating visual translation. Further, we evaluate presented +baselines for translating scene text between Hindi and English. Our experiments +demonstrate that although we can effectively perform visual translation over a +large collection of scene text images, the presented baseline only partially +addresses challenges posed by visual translation tasks. We firmly believe that +this new task and the limitations of existing models, as reported in this +paper, should encourage further research in visual translation. + +
+
+ comment: Accepted at ICPR 2024, Project Website: + https://vl2g.github.io/projects/visTrans/ +
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all <details> entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the chosen theme
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the chosen theme
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`